easy-cs-rec-custommodel 0.8.6 (py2.py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of easy-cs-rec-custommodel might be problematic.

Files changed (336)
  1. easy_cs_rec_custommodel-0.8.6.dist-info/LICENSE +203 -0
  2. easy_cs_rec_custommodel-0.8.6.dist-info/METADATA +48 -0
  3. easy_cs_rec_custommodel-0.8.6.dist-info/RECORD +336 -0
  4. easy_cs_rec_custommodel-0.8.6.dist-info/WHEEL +6 -0
  5. easy_cs_rec_custommodel-0.8.6.dist-info/top_level.txt +2 -0
  6. easy_rec/__init__.py +114 -0
  7. easy_rec/python/__init__.py +0 -0
  8. easy_rec/python/builders/__init__.py +0 -0
  9. easy_rec/python/builders/hyperparams_builder.py +78 -0
  10. easy_rec/python/builders/loss_builder.py +333 -0
  11. easy_rec/python/builders/optimizer_builder.py +211 -0
  12. easy_rec/python/builders/strategy_builder.py +44 -0
  13. easy_rec/python/compat/__init__.py +0 -0
  14. easy_rec/python/compat/adam_s.py +245 -0
  15. easy_rec/python/compat/array_ops.py +229 -0
  16. easy_rec/python/compat/dynamic_variable.py +542 -0
  17. easy_rec/python/compat/early_stopping.py +653 -0
  18. easy_rec/python/compat/embedding_ops.py +162 -0
  19. easy_rec/python/compat/embedding_parallel_saver.py +316 -0
  20. easy_rec/python/compat/estimator_train.py +116 -0
  21. easy_rec/python/compat/exporter.py +473 -0
  22. easy_rec/python/compat/feature_column/__init__.py +0 -0
  23. easy_rec/python/compat/feature_column/feature_column.py +3675 -0
  24. easy_rec/python/compat/feature_column/feature_column_v2.py +5233 -0
  25. easy_rec/python/compat/feature_column/sequence_feature_column.py +648 -0
  26. easy_rec/python/compat/feature_column/utils.py +154 -0
  27. easy_rec/python/compat/layers.py +329 -0
  28. easy_rec/python/compat/ops.py +14 -0
  29. easy_rec/python/compat/optimizers.py +619 -0
  30. easy_rec/python/compat/queues.py +311 -0
  31. easy_rec/python/compat/regularizers.py +208 -0
  32. easy_rec/python/compat/sok_optimizer.py +440 -0
  33. easy_rec/python/compat/sync_replicas_optimizer.py +528 -0
  34. easy_rec/python/compat/weight_decay_optimizers.py +475 -0
  35. easy_rec/python/core/__init__.py +0 -0
  36. easy_rec/python/core/easyrec_metrics/__init__.py +24 -0
  37. easy_rec/python/core/easyrec_metrics/distribute_metrics_impl_pai.py +3702 -0
  38. easy_rec/python/core/easyrec_metrics/distribute_metrics_impl_tf.py +3768 -0
  39. easy_rec/python/core/learning_schedules.py +228 -0
  40. easy_rec/python/core/metrics.py +402 -0
  41. easy_rec/python/core/sampler.py +844 -0
  42. easy_rec/python/eval.py +102 -0
  43. easy_rec/python/export.py +150 -0
  44. easy_rec/python/feature_column/__init__.py +0 -0
  45. easy_rec/python/feature_column/feature_column.py +664 -0
  46. easy_rec/python/feature_column/feature_group.py +89 -0
  47. easy_rec/python/hpo/__init__.py +0 -0
  48. easy_rec/python/hpo/emr_hpo.py +140 -0
  49. easy_rec/python/hpo/generate_hpo_sql.py +71 -0
  50. easy_rec/python/hpo/pai_hpo.py +297 -0
  51. easy_rec/python/inference/__init__.py +0 -0
  52. easy_rec/python/inference/csv_predictor.py +189 -0
  53. easy_rec/python/inference/hive_parquet_predictor.py +200 -0
  54. easy_rec/python/inference/hive_predictor.py +166 -0
  55. easy_rec/python/inference/odps_predictor.py +70 -0
  56. easy_rec/python/inference/parquet_predictor.py +147 -0
  57. easy_rec/python/inference/parquet_predictor_v2.py +147 -0
  58. easy_rec/python/inference/predictor.py +621 -0
  59. easy_rec/python/inference/processor/__init__.py +0 -0
  60. easy_rec/python/inference/processor/test.py +170 -0
  61. easy_rec/python/inference/vector_retrieve.py +124 -0
  62. easy_rec/python/input/__init__.py +0 -0
  63. easy_rec/python/input/batch_tfrecord_input.py +117 -0
  64. easy_rec/python/input/criteo_binary_reader.py +259 -0
  65. easy_rec/python/input/criteo_input.py +107 -0
  66. easy_rec/python/input/csv_input.py +175 -0
  67. easy_rec/python/input/csv_input_ex.py +72 -0
  68. easy_rec/python/input/csv_input_v2.py +68 -0
  69. easy_rec/python/input/datahub_input.py +320 -0
  70. easy_rec/python/input/dummy_input.py +58 -0
  71. easy_rec/python/input/hive_input.py +123 -0
  72. easy_rec/python/input/hive_parquet_input.py +140 -0
  73. easy_rec/python/input/hive_rtp_input.py +174 -0
  74. easy_rec/python/input/input.py +1064 -0
  75. easy_rec/python/input/kafka_dataset.py +144 -0
  76. easy_rec/python/input/kafka_input.py +235 -0
  77. easy_rec/python/input/load_parquet.py +317 -0
  78. easy_rec/python/input/odps_input.py +101 -0
  79. easy_rec/python/input/odps_input_v2.py +110 -0
  80. easy_rec/python/input/odps_input_v3.py +132 -0
  81. easy_rec/python/input/odps_rtp_input.py +187 -0
  82. easy_rec/python/input/odps_rtp_input_v2.py +104 -0
  83. easy_rec/python/input/parquet_input.py +397 -0
  84. easy_rec/python/input/parquet_input_v2.py +180 -0
  85. easy_rec/python/input/parquet_input_v3.py +203 -0
  86. easy_rec/python/input/rtp_input.py +225 -0
  87. easy_rec/python/input/rtp_input_v2.py +145 -0
  88. easy_rec/python/input/tfrecord_input.py +100 -0
  89. easy_rec/python/layers/__init__.py +0 -0
  90. easy_rec/python/layers/backbone.py +571 -0
  91. easy_rec/python/layers/capsule_layer.py +176 -0
  92. easy_rec/python/layers/cmbf.py +390 -0
  93. easy_rec/python/layers/common_layers.py +192 -0
  94. easy_rec/python/layers/dnn.py +87 -0
  95. easy_rec/python/layers/embed_input_layer.py +25 -0
  96. easy_rec/python/layers/fm.py +26 -0
  97. easy_rec/python/layers/input_layer.py +396 -0
  98. easy_rec/python/layers/keras/__init__.py +34 -0
  99. easy_rec/python/layers/keras/activation.py +114 -0
  100. easy_rec/python/layers/keras/attention.py +267 -0
  101. easy_rec/python/layers/keras/auxiliary_loss.py +47 -0
  102. easy_rec/python/layers/keras/blocks.py +262 -0
  103. easy_rec/python/layers/keras/bst.py +119 -0
  104. easy_rec/python/layers/keras/custom_ops.py +250 -0
  105. easy_rec/python/layers/keras/data_augment.py +133 -0
  106. easy_rec/python/layers/keras/din.py +67 -0
  107. easy_rec/python/layers/keras/einsum_dense.py +598 -0
  108. easy_rec/python/layers/keras/embedding.py +81 -0
  109. easy_rec/python/layers/keras/fibinet.py +251 -0
  110. easy_rec/python/layers/keras/interaction.py +416 -0
  111. easy_rec/python/layers/keras/layer_norm.py +364 -0
  112. easy_rec/python/layers/keras/mask_net.py +166 -0
  113. easy_rec/python/layers/keras/multi_head_attention.py +717 -0
  114. easy_rec/python/layers/keras/multi_task.py +125 -0
  115. easy_rec/python/layers/keras/numerical_embedding.py +376 -0
  116. easy_rec/python/layers/keras/ppnet.py +194 -0
  117. easy_rec/python/layers/keras/transformer.py +192 -0
  118. easy_rec/python/layers/layer_norm.py +51 -0
  119. easy_rec/python/layers/mmoe.py +83 -0
  120. easy_rec/python/layers/multihead_attention.py +162 -0
  121. easy_rec/python/layers/multihead_cross_attention.py +749 -0
  122. easy_rec/python/layers/senet.py +73 -0
  123. easy_rec/python/layers/seq_input_layer.py +134 -0
  124. easy_rec/python/layers/sequence_feature_layer.py +249 -0
  125. easy_rec/python/layers/uniter.py +301 -0
  126. easy_rec/python/layers/utils.py +248 -0
  127. easy_rec/python/layers/variational_dropout_layer.py +130 -0
  128. easy_rec/python/loss/__init__.py +0 -0
  129. easy_rec/python/loss/circle_loss.py +82 -0
  130. easy_rec/python/loss/contrastive_loss.py +79 -0
  131. easy_rec/python/loss/f1_reweight_loss.py +38 -0
  132. easy_rec/python/loss/focal_loss.py +93 -0
  133. easy_rec/python/loss/jrc_loss.py +128 -0
  134. easy_rec/python/loss/listwise_loss.py +161 -0
  135. easy_rec/python/loss/multi_similarity.py +68 -0
  136. easy_rec/python/loss/pairwise_loss.py +307 -0
  137. easy_rec/python/loss/softmax_loss_with_negative_mining.py +110 -0
  138. easy_rec/python/loss/zero_inflated_lognormal.py +76 -0
  139. easy_rec/python/main.py +878 -0
  140. easy_rec/python/model/__init__.py +0 -0
  141. easy_rec/python/model/autoint.py +73 -0
  142. easy_rec/python/model/cmbf.py +47 -0
  143. easy_rec/python/model/collaborative_metric_learning.py +182 -0
  144. easy_rec/python/model/custom_model.py +323 -0
  145. easy_rec/python/model/dat.py +138 -0
  146. easy_rec/python/model/dbmtl.py +116 -0
  147. easy_rec/python/model/dcn.py +70 -0
  148. easy_rec/python/model/deepfm.py +106 -0
  149. easy_rec/python/model/dlrm.py +73 -0
  150. easy_rec/python/model/dropoutnet.py +207 -0
  151. easy_rec/python/model/dssm.py +154 -0
  152. easy_rec/python/model/dssm_senet.py +143 -0
  153. easy_rec/python/model/dummy_model.py +48 -0
  154. easy_rec/python/model/easy_rec_estimator.py +739 -0
  155. easy_rec/python/model/easy_rec_model.py +467 -0
  156. easy_rec/python/model/esmm.py +242 -0
  157. easy_rec/python/model/fm.py +63 -0
  158. easy_rec/python/model/match_model.py +357 -0
  159. easy_rec/python/model/mind.py +445 -0
  160. easy_rec/python/model/mmoe.py +70 -0
  161. easy_rec/python/model/multi_task_model.py +303 -0
  162. easy_rec/python/model/multi_tower.py +62 -0
  163. easy_rec/python/model/multi_tower_bst.py +190 -0
  164. easy_rec/python/model/multi_tower_din.py +130 -0
  165. easy_rec/python/model/multi_tower_recall.py +68 -0
  166. easy_rec/python/model/pdn.py +203 -0
  167. easy_rec/python/model/ple.py +120 -0
  168. easy_rec/python/model/rank_model.py +485 -0
  169. easy_rec/python/model/rocket_launching.py +203 -0
  170. easy_rec/python/model/simple_multi_task.py +54 -0
  171. easy_rec/python/model/uniter.py +46 -0
  172. easy_rec/python/model/wide_and_deep.py +121 -0
  173. easy_rec/python/ops/1.12/incr_record.so +0 -0
  174. easy_rec/python/ops/1.12/kafka.so +0 -0
  175. easy_rec/python/ops/1.12/libcustom_ops.so +0 -0
  176. easy_rec/python/ops/1.12/libembed_op.so +0 -0
  177. easy_rec/python/ops/1.12/libhiredis.so.1.0.0 +0 -0
  178. easy_rec/python/ops/1.12/librdkafka++.so.1 +0 -0
  179. easy_rec/python/ops/1.12/librdkafka.so.1 +0 -0
  180. easy_rec/python/ops/1.12/libredis++.so +0 -0
  181. easy_rec/python/ops/1.12/libredis++.so.1 +0 -0
  182. easy_rec/python/ops/1.12/libredis++.so.1.2.3 +0 -0
  183. easy_rec/python/ops/1.12/libstr_avx_op.so +0 -0
  184. easy_rec/python/ops/1.12/libwrite_sparse_kv.so +0 -0
  185. easy_rec/python/ops/1.15/incr_record.so +0 -0
  186. easy_rec/python/ops/1.15/kafka.so +0 -0
  187. easy_rec/python/ops/1.15/libcustom_ops.so +0 -0
  188. easy_rec/python/ops/1.15/libembed_op.so +0 -0
  189. easy_rec/python/ops/1.15/libhiredis.so.1.0.0 +0 -0
  190. easy_rec/python/ops/1.15/librdkafka++.so +0 -0
  191. easy_rec/python/ops/1.15/librdkafka++.so.1 +0 -0
  192. easy_rec/python/ops/1.15/librdkafka.so +0 -0
  193. easy_rec/python/ops/1.15/librdkafka.so.1 +0 -0
  194. easy_rec/python/ops/1.15/libredis++.so.1 +0 -0
  195. easy_rec/python/ops/1.15/libstr_avx_op.so +0 -0
  196. easy_rec/python/ops/2.12/libcustom_ops.so +0 -0
  197. easy_rec/python/ops/2.12/libload_embed.so +0 -0
  198. easy_rec/python/ops/2.12/libstr_avx_op.so +0 -0
  199. easy_rec/python/ops/__init__.py +0 -0
  200. easy_rec/python/ops/gen_kafka_ops.py +193 -0
  201. easy_rec/python/ops/gen_str_avx_op.py +28 -0
  202. easy_rec/python/ops/incr_record.py +30 -0
  203. easy_rec/python/predict.py +170 -0
  204. easy_rec/python/protos/__init__.py +0 -0
  205. easy_rec/python/protos/autoint_pb2.py +122 -0
  206. easy_rec/python/protos/backbone_pb2.py +1416 -0
  207. easy_rec/python/protos/cmbf_pb2.py +435 -0
  208. easy_rec/python/protos/collaborative_metric_learning_pb2.py +252 -0
  209. easy_rec/python/protos/custom_model_pb2.py +57 -0
  210. easy_rec/python/protos/dat_pb2.py +262 -0
  211. easy_rec/python/protos/data_source_pb2.py +422 -0
  212. easy_rec/python/protos/dataset_pb2.py +1920 -0
  213. easy_rec/python/protos/dbmtl_pb2.py +191 -0
  214. easy_rec/python/protos/dcn_pb2.py +197 -0
  215. easy_rec/python/protos/deepfm_pb2.py +163 -0
  216. easy_rec/python/protos/dlrm_pb2.py +163 -0
  217. easy_rec/python/protos/dnn_pb2.py +329 -0
  218. easy_rec/python/protos/dropoutnet_pb2.py +239 -0
  219. easy_rec/python/protos/dssm_pb2.py +262 -0
  220. easy_rec/python/protos/dssm_senet_pb2.py +282 -0
  221. easy_rec/python/protos/easy_rec_model_pb2.py +1672 -0
  222. easy_rec/python/protos/esmm_pb2.py +133 -0
  223. easy_rec/python/protos/eval_pb2.py +930 -0
  224. easy_rec/python/protos/export_pb2.py +379 -0
  225. easy_rec/python/protos/feature_config_pb2.py +1359 -0
  226. easy_rec/python/protos/fm_pb2.py +90 -0
  227. easy_rec/python/protos/hive_config_pb2.py +138 -0
  228. easy_rec/python/protos/hyperparams_pb2.py +624 -0
  229. easy_rec/python/protos/keras_layer_pb2.py +692 -0
  230. easy_rec/python/protos/layer_pb2.py +1936 -0
  231. easy_rec/python/protos/loss_pb2.py +1713 -0
  232. easy_rec/python/protos/mind_pb2.py +497 -0
  233. easy_rec/python/protos/mmoe_pb2.py +215 -0
  234. easy_rec/python/protos/multi_tower_pb2.py +295 -0
  235. easy_rec/python/protos/multi_tower_recall_pb2.py +198 -0
  236. easy_rec/python/protos/optimizer_pb2.py +2017 -0
  237. easy_rec/python/protos/pdn_pb2.py +293 -0
  238. easy_rec/python/protos/pipeline_pb2.py +516 -0
  239. easy_rec/python/protos/ple_pb2.py +231 -0
  240. easy_rec/python/protos/predict_pb2.py +1140 -0
  241. easy_rec/python/protos/rocket_launching_pb2.py +169 -0
  242. easy_rec/python/protos/seq_encoder_pb2.py +1084 -0
  243. easy_rec/python/protos/simi_pb2.py +54 -0
  244. easy_rec/python/protos/simple_multi_task_pb2.py +97 -0
  245. easy_rec/python/protos/tf_predict_pb2.py +630 -0
  246. easy_rec/python/protos/tower_pb2.py +661 -0
  247. easy_rec/python/protos/train_pb2.py +1197 -0
  248. easy_rec/python/protos/uniter_pb2.py +307 -0
  249. easy_rec/python/protos/variational_dropout_pb2.py +91 -0
  250. easy_rec/python/protos/wide_and_deep_pb2.py +131 -0
  251. easy_rec/python/test/__init__.py +0 -0
  252. easy_rec/python/test/csv_input_test.py +340 -0
  253. easy_rec/python/test/custom_early_stop_func.py +19 -0
  254. easy_rec/python/test/dh_local_run.py +104 -0
  255. easy_rec/python/test/embed_test.py +155 -0
  256. easy_rec/python/test/emr_run.py +119 -0
  257. easy_rec/python/test/eval_metric_test.py +107 -0
  258. easy_rec/python/test/excel_convert_test.py +64 -0
  259. easy_rec/python/test/export_test.py +513 -0
  260. easy_rec/python/test/fg_test.py +70 -0
  261. easy_rec/python/test/hive_input_test.py +311 -0
  262. easy_rec/python/test/hpo_test.py +235 -0
  263. easy_rec/python/test/kafka_test.py +373 -0
  264. easy_rec/python/test/local_incr_test.py +122 -0
  265. easy_rec/python/test/loss_test.py +110 -0
  266. easy_rec/python/test/odps_command.py +61 -0
  267. easy_rec/python/test/odps_local_run.py +86 -0
  268. easy_rec/python/test/odps_run.py +254 -0
  269. easy_rec/python/test/odps_test_cls.py +39 -0
  270. easy_rec/python/test/odps_test_prepare.py +198 -0
  271. easy_rec/python/test/odps_test_util.py +237 -0
  272. easy_rec/python/test/pre_check_test.py +54 -0
  273. easy_rec/python/test/predictor_test.py +394 -0
  274. easy_rec/python/test/rtp_convert_test.py +133 -0
  275. easy_rec/python/test/run.py +138 -0
  276. easy_rec/python/test/train_eval_test.py +1299 -0
  277. easy_rec/python/test/util_test.py +85 -0
  278. easy_rec/python/test/zero_inflated_lognormal_test.py +53 -0
  279. easy_rec/python/tools/__init__.py +0 -0
  280. easy_rec/python/tools/add_boundaries_to_config.py +67 -0
  281. easy_rec/python/tools/add_feature_info_to_config.py +145 -0
  282. easy_rec/python/tools/convert_config_format.py +48 -0
  283. easy_rec/python/tools/convert_rtp_data.py +79 -0
  284. easy_rec/python/tools/convert_rtp_fg.py +106 -0
  285. easy_rec/python/tools/create_config_from_excel.py +427 -0
  286. easy_rec/python/tools/criteo/__init__.py +0 -0
  287. easy_rec/python/tools/criteo/convert_data.py +157 -0
  288. easy_rec/python/tools/edit_lookup_graph.py +134 -0
  289. easy_rec/python/tools/faiss_index_pai.py +116 -0
  290. easy_rec/python/tools/feature_selection.py +316 -0
  291. easy_rec/python/tools/hit_rate_ds.py +223 -0
  292. easy_rec/python/tools/hit_rate_pai.py +138 -0
  293. easy_rec/python/tools/pre_check.py +120 -0
  294. easy_rec/python/tools/predict_and_chk.py +111 -0
  295. easy_rec/python/tools/read_kafka.py +55 -0
  296. easy_rec/python/tools/split_model_pai.py +286 -0
  297. easy_rec/python/tools/split_pdn_model_pai.py +272 -0
  298. easy_rec/python/tools/test_saved_model.py +80 -0
  299. easy_rec/python/tools/view_saved_model.py +39 -0
  300. easy_rec/python/tools/write_kafka.py +65 -0
  301. easy_rec/python/train_eval.py +325 -0
  302. easy_rec/python/utils/__init__.py +15 -0
  303. easy_rec/python/utils/activation.py +120 -0
  304. easy_rec/python/utils/check_utils.py +87 -0
  305. easy_rec/python/utils/compat.py +14 -0
  306. easy_rec/python/utils/config_util.py +652 -0
  307. easy_rec/python/utils/constant.py +43 -0
  308. easy_rec/python/utils/convert_rtp_fg.py +616 -0
  309. easy_rec/python/utils/dag.py +192 -0
  310. easy_rec/python/utils/distribution_utils.py +268 -0
  311. easy_rec/python/utils/ds_util.py +65 -0
  312. easy_rec/python/utils/embedding_utils.py +73 -0
  313. easy_rec/python/utils/estimator_utils.py +1036 -0
  314. easy_rec/python/utils/export_big_model.py +630 -0
  315. easy_rec/python/utils/expr_util.py +118 -0
  316. easy_rec/python/utils/fg_util.py +53 -0
  317. easy_rec/python/utils/hit_rate_utils.py +220 -0
  318. easy_rec/python/utils/hive_utils.py +183 -0
  319. easy_rec/python/utils/hpo_util.py +137 -0
  320. easy_rec/python/utils/hvd_utils.py +56 -0
  321. easy_rec/python/utils/input_utils.py +108 -0
  322. easy_rec/python/utils/io_util.py +282 -0
  323. easy_rec/python/utils/load_class.py +249 -0
  324. easy_rec/python/utils/meta_graph_editor.py +941 -0
  325. easy_rec/python/utils/multi_optimizer.py +62 -0
  326. easy_rec/python/utils/numpy_utils.py +18 -0
  327. easy_rec/python/utils/odps_util.py +79 -0
  328. easy_rec/python/utils/pai_util.py +86 -0
  329. easy_rec/python/utils/proto_util.py +90 -0
  330. easy_rec/python/utils/restore_filter.py +89 -0
  331. easy_rec/python/utils/shape_utils.py +432 -0
  332. easy_rec/python/utils/static_shape.py +71 -0
  333. easy_rec/python/utils/test_utils.py +866 -0
  334. easy_rec/python/utils/tf_utils.py +56 -0
  335. easy_rec/version.py +4 -0
  336. test/__init__.py +0 -0
easy_rec/python/compat/feature_column/feature_column.py
@@ -0,0 +1,3675 @@
1
+ # -*- encoding:utf-8 -*-
2
+ # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ==============================================================================
16
+ """This API defines FeatureColumn abstraction.
17
+
18
+ FeatureColumns provide a high level abstraction for ingesting and representing
19
+ features. FeatureColumns are also the primary way of encoding features for
20
+ canned `tf.estimator.Estimator`s.
21
+
22
+ When using FeatureColumns with `Estimators`, the type of feature column you
23
+ should choose depends on (1) the feature type and (2) the model type.
24
+
25
+ 1. Feature type:
26
+
27
+ * Continuous features can be represented by `numeric_column`.
28
+ * Categorical features can be represented by any `categorical_column_with_*`
29
+ column:
30
+ - `categorical_column_with_vocabulary_list`
31
+ - `categorical_column_with_vocabulary_file`
32
+ - `categorical_column_with_hash_bucket`
33
+ - `categorical_column_with_identity`
34
+ - `weighted_categorical_column`
35
+
36
+ 2. Model type:
37
+
38
+ * Deep neural network models (`DNNClassifier`, `DNNRegressor`).
39
+
40
+ Continuous features can be directly fed into deep neural network models.
41
+
42
+ age_column = numeric_column("age")
43
+
44
+ To feed sparse features into DNN models, wrap the column with
45
+ `embedding_column` or `indicator_column`. `indicator_column` is recommended
46
+ for features with only a few possible values. For features with many
47
+ possible values, to reduce the size of your model, `embedding_column` is
48
+ recommended.
49
+
50
+ embedded_dept_column = embedding_column(
51
+ categorical_column_with_vocabulary_list(
52
+ "department", ["math", "philosophy", ...]), dimension=10)
53
+
54
+ * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).
55
+
56
+ Sparse features can be fed directly into linear models. They behave like an
57
+ indicator column but with an efficient implementation.
58
+
59
+ dept_column = categorical_column_with_vocabulary_list("department",
60
+ ["math", "philosophy", "english"])
61
+
62
+ It is recommended that continuous features be bucketized before being
63
+ fed into linear models.
64
+
65
+ bucketized_age_column = bucketized_column(
66
+ source_column=age_column,
67
+ boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
68
+
69
+ Sparse features can be crossed (also known as conjuncted or combined) in
70
+ order to form non-linearities, and then fed into linear models.
71
+
72
+ cross_dept_age_column = crossed_column(
73
+ columns=["department", bucketized_age_column],
74
+ hash_bucket_size=1000)
75
+
76
+ Example of building canned `Estimator`s using FeatureColumns:
77
+
78
+ ```python
79
+ # Define features and transformations
80
+ deep_feature_columns = [age_column, embedded_dept_column]
81
+ wide_feature_columns = [dept_column, bucketized_age_column,
82
+ cross_dept_age_column]
83
+
84
+ # Build deep model
85
+ estimator = DNNClassifier(
86
+ feature_columns=deep_feature_columns,
87
+ hidden_units=[500, 250, 50])
88
+ estimator.train(...)
89
+
90
+ # Or build a wide model
91
+ estimator = LinearClassifier(
92
+ feature_columns=wide_feature_columns)
93
+ estimator.train(...)
94
+
95
+ # Or build a wide and deep model!
96
+ estimator = DNNLinearCombinedClassifier(
97
+ linear_feature_columns=wide_feature_columns,
98
+ dnn_feature_columns=deep_feature_columns,
99
+ dnn_hidden_units=[500, 250, 50])
100
+ estimator.train(...)
101
+ ```
102
+
103
+
104
+ FeatureColumns can also be transformed into a generic input layer for
105
+ custom models using `input_layer`.
106
+
107
+ Example of building a model using FeatureColumns; this can be used in a
108
+ `model_fn` which is given to the `tf.estimator.Estimator`:
109
+
110
+ ```python
111
+ # Building model via layers
112
+
113
+ deep_feature_columns = [age_column, embedded_dept_column]
114
+ columns_to_tensor = parse_feature_columns_from_examples(
115
+ serialized=my_data,
116
+ feature_columns=deep_feature_columns)
117
+ first_layer = input_layer(
118
+ features=columns_to_tensor,
119
+ feature_columns=deep_feature_columns)
120
+ second_layer = fully_connected(first_layer, ...)
121
+ ```
122
+
123
+ NOTE: Functions prefixed with "_" indicate experimental or private parts of
124
+ the API subject to change, and should not be relied upon!
125
+
126
+ NOTE: The new feature columns are being developed in feature_column_v2.py and
127
+ largely duplicate the code here. Please make sure to update logic
128
+ in both places.
129
+ """
130
+
131
+ from __future__ import absolute_import
132
+ from __future__ import division
133
+ from __future__ import print_function
134
+
135
+ import abc
136
+ import collections
137
+ import math
138
+ import os
139
+
140
+ import numpy as np
141
+ import six
142
+ from tensorflow.python.eager import context
143
+ from tensorflow.python.framework import dtypes
144
+ from tensorflow.python.framework import ops
145
+ from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
146
+ from tensorflow.python.framework import tensor_shape
147
+ from tensorflow.python.keras.engine import training
148
+ from tensorflow.python.layers import base
149
+ # from tensorflow.python.ops import logging_ops
150
+ from tensorflow.python.ops import array_ops
151
+ from tensorflow.python.ops import check_ops
152
+ from tensorflow.python.ops import control_flow_ops
153
+ from tensorflow.python.ops import data_flow_ops
154
+ from tensorflow.python.ops import embedding_ops
155
+ from tensorflow.python.ops import init_ops
156
+ from tensorflow.python.ops import lookup_ops
157
+ from tensorflow.python.ops import math_ops
158
+ from tensorflow.python.ops import nn_ops
159
+ from tensorflow.python.ops import parsing_ops
160
+ from tensorflow.python.ops import resource_variable_ops
161
+ from tensorflow.python.ops import sparse_ops
162
+ from tensorflow.python.ops import string_ops
163
+ from tensorflow.python.ops import template
164
+ from tensorflow.python.ops import variable_scope
165
+ from tensorflow.python.ops import variables
166
+ # from tensorflow.python.ops.ragged import ragged_tensor
167
+ # from tensorflow.python.ops.ragged import ragged_util
168
+ from tensorflow.python.platform import gfile
169
+ from tensorflow.python.platform import tf_logging as logging
170
+ from tensorflow.python.training import checkpoint_utils
171
+ from tensorflow.python.util import nest
172
+
173
+ from easy_rec.python.compat.feature_column import utils as fc_utils
174
+ from easy_rec.python.utils import conditional
175
+ from easy_rec.python.utils import constant
176
+ from easy_rec.python.utils import embedding_utils
177
+
178
+ try:
179
+ from easy_rec.python.compat import dynamic_variable
180
+ except Exception:
181
+ dynamic_variable = None
182
+
183
+ try:
184
+ import horovod.tensorflow as hvd
185
+ except Exception:
186
+ hvd = None
187
+
188
+
189
+ def embedding_lookup_ragged(embedding_weights,
190
+ ragged_ids,
191
+ ragged_weights,
192
+ combiner,
193
+ max_norm=None,
194
+ name=None):
195
+ segment_ids = ragged_ids.value_rowids()
196
+ if segment_ids.dtype != dtypes.int32:
197
+ segment_ids = math_ops.cast(segment_ids, dtypes.int32)
198
+ ids = ragged_ids.flat_values
199
+ ids, idx = array_ops.unique(ids)
200
+ embeddings = embedding_ops.embedding_lookup(
201
+ embedding_weights, ids, partition_strategy='mod', max_norm=max_norm)
202
+ if ragged_weights is not None:
203
+ weights = ragged_weights.flat_values
204
+ embeddings = array_ops.gather(embeddings, idx)
205
+ original_dtype = embeddings.dtype
206
+ if embeddings.dtype in (dtypes.float16, dtypes.bfloat16):
207
+ # Cast low-precision embeddings to float32 during the computation to
208
+ # avoid numerical issues.
209
+ embeddings = math_ops.cast(embeddings, dtypes.float32)
210
+ if weights.dtype != embeddings.dtype:
211
+ weights = math_ops.cast(weights, embeddings.dtype)
212
+ weights = array_ops.expand_dims(weights, len(embeddings.get_shape()))
213
+ embeddings = embeddings * weights
214
+ if combiner == 'sum':
215
+ return math_ops.segment_sum(embeddings, segment_ids, name=name)
216
+ elif combiner == 'mean':
217
+ embeddings = math_ops.segment_sum(embeddings, segment_ids)
218
+ weight_sum = math_ops.segment_sum(weights, segment_ids)
219
+ embeddings = math_ops.div_no_nan(embeddings, weight_sum, name=name)
220
+ elif combiner == 'sqrtn':
221
+ embeddings = math_ops.segment_sum(embeddings, segment_ids)
222
+ weights_squared = math_ops.pow(weights, 2)
223
+ weight_sum = math_ops.segment_sum(weights_squared, segment_ids)
224
+ weight_sum_sqrt = math_ops.sqrt(weight_sum)
225
+ embeddings = math_ops.div_no_nan(embeddings, weight_sum_sqrt, name=name)
226
+ else:
227
+ assert False, 'Unrecognized combiner'
228
+ if embeddings.dtype != original_dtype:
229
+ embeddings = math_ops.cast(embeddings, original_dtype)
230
+ return embeddings
231
+ else:
232
+ assert idx is not None
233
+ if combiner == 'sum':
234
+ embeddings = math_ops.sparse_segment_sum(
235
+ embeddings, idx, segment_ids, name=name)
236
+ elif combiner == 'mean':
237
+ embeddings = math_ops.sparse_segment_mean(
238
+ embeddings, idx, segment_ids, name=name)
239
+ elif combiner == 'sqrtn':
240
+ embeddings = math_ops.sparse_segment_sqrt_n(
241
+ embeddings, idx, segment_ids, name=name)
242
+ else:
243
+ assert False, 'Unrecognized combiner'
244
+ return embeddings
245
+
246
+
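The helper above pools a variable-length list of ids (and optional per-id weights) into one embedding vector per row. A minimal usage sketch, assuming TensorFlow 1.15 graph mode; the table size, ids and weights below are made up for illustration:

```python
import tensorflow as tf

from easy_rec.python.compat.feature_column.feature_column import embedding_lookup_ragged

# Toy embedding table: 100 ids, 8-dimensional vectors (illustrative sizes).
embedding_table = tf.get_variable('toy_embedding', shape=[100, 8])

# Two examples with variable numbers of ids and per-id weights.
ragged_ids = tf.ragged.constant([[3, 7], [42]], dtype=tf.int64)
ragged_wgts = tf.ragged.constant([[0.5, 0.5], [1.0]], dtype=tf.float32)

# 'mean' combiner: weighted sum of the id embeddings divided by the weight sum,
# giving one 8-d vector per example (shape [2, 8]).
pooled = embedding_lookup_ragged(embedding_table, ragged_ids, ragged_wgts,
                                 combiner='mean')
```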
247
+ # model parallel embedding lookup
248
+ def embedding_parallel_lookup(embedding,
249
+ lookup_indices,
250
+ output_ids,
251
+ is_training,
252
+ output_tensors=None,
253
+ batch_size=None):
254
+ N = len(output_ids)
255
+ if batch_size is None:
256
+ num_segments = None
257
+ else:
258
+ num_segments = N * batch_size
259
+ # first concat all the ids and unique
260
+ if isinstance(lookup_indices, dict) and 'sparse_fea' in lookup_indices.keys():
261
+ # all_uniq_ids, uniq_idx, segment_lens = features['sparse_fea']
262
+ all_ids, segment_lens = lookup_indices['sparse_fea']
263
+ all_uniq_ids, uniq_idx = array_ops.unique(all_ids)
264
+ cumsum_lens = math_ops.cumsum(segment_lens)
265
+ segment_ids = array_ops.searchsorted(
266
+ cumsum_lens, math_ops.range(cumsum_lens[-1]), side='right')
267
+ elif isinstance(lookup_indices, dict) and 'ragged_ids' in lookup_indices.keys(
268
+ ) and 'ragged_lens' in lookup_indices.keys():
269
+ all_ids, segment_lens = lookup_indices['ragged_ids'], lookup_indices[
270
+ 'ragged_lens']
271
+ all_uniq_ids, uniq_idx = array_ops.unique(all_ids)
272
+ cumsum_lens = math_ops.cumsum(segment_lens)
273
+ segment_ids = array_ops.searchsorted(
274
+ cumsum_lens, math_ops.range(cumsum_lens[-1]), side='right')
275
+ elif isinstance(lookup_indices[0], sparse_tensor_lib.SparseTensor):
276
+ with ops.device('/cpu:0'):
277
+ all_ids = array_ops.concat([x.values for x in lookup_indices], axis=0)
278
+ segment_ids = array_ops.concat([x.indices[:, 0] for x in lookup_indices],
279
+ axis=0)
280
+ all_uniq_ids, uniq_idx = array_ops.unique(all_ids)
281
+ elif 'RaggedTensor' in str(type(lookup_indices[0])):
282
+ with ops.device('/cpu:0'):
283
+ all_ids = array_ops.concat([x.values for x in lookup_indices], axis=0)
284
+ segment_lens = array_ops.concat([x.row_lengths() for x in lookup_indices],
285
+ axis=0)
286
+ all_uniq_ids, uniq_idx = array_ops.unique(all_ids)
287
+ cumsum_lens = math_ops.cumsum(segment_lens)
288
+ segment_ids = array_ops.searchsorted(
289
+ cumsum_lens, math_ops.range(cumsum_lens[-1]), side='right')
290
+ else:
291
+ assert False, 'invalid indices type: %s' % str(type(lookup_indices[0]))
292
+
293
+ num_parts = hvd.size()
294
+ if num_parts > 1:
295
+ # dynamic partition
296
+ p_assignments = math_ops.cast(all_uniq_ids % num_parts, dtypes.int32)
297
+ gather_ids = data_flow_ops.dynamic_partition(all_uniq_ids, p_assignments,
298
+ num_parts)
299
+ original_ids = math_ops.range(array_ops.size(all_uniq_ids))
300
+ original_part_ids = data_flow_ops.dynamic_partition(original_ids,
301
+ p_assignments,
302
+ num_parts)
303
+ # all2all
304
+ split_sizes = array_ops.concat([array_ops.shape(x) for x in gather_ids],
305
+ axis=0)
306
+ send_ids = array_ops.concat(gather_ids, axis=0)
307
+ recv_ids, recv_lens = hvd.alltoall(send_ids, split_sizes)
308
+
309
+ # read embedding from dynamic variable
310
+ if isinstance(embedding, dynamic_variable.DynamicVariable):
311
+ send_embed = embedding.sparse_read(
312
+ recv_ids, lookup_only=(not is_training))
313
+ else:
314
+ # find the position within this worker's sub-array (e.g. ids per worker with 2 workers):
315
+ # 0 2 4 6 8 10 ...
316
+ # 1 3 5 7 9 11 ...
317
+ recv_ids = math_ops.cast(recv_ids / num_parts, dtypes.int64)
318
+ send_embed = array_ops.gather(embedding, recv_ids)
319
+
320
+ # all2all
321
+ recv_embeddings, _ = hvd.alltoall(send_embed, recv_lens)
322
+ recv_embeddings = array_ops.split(
323
+ recv_embeddings, num_or_size_splits=split_sizes)
324
+ recv_embeddings = data_flow_ops.parallel_dynamic_stitch(
325
+ original_part_ids, recv_embeddings, name='parallel_dynamic_stitch')
326
+ embeddings = math_ops.sparse_segment_sum(
327
+ recv_embeddings,
328
+ uniq_idx,
329
+ segment_ids,
330
+ num_segments=num_segments,
331
+ name='sparse_segment_sum')
332
+ else:
333
+ if isinstance(embedding, dynamic_variable.DynamicVariable):
334
+ recv_embeddings = embedding.sparse_read(
335
+ all_uniq_ids, lookup_only=(not is_training))
336
+ else:
337
+ recv_embeddings = array_ops.gather(embedding, all_uniq_ids)
338
+ embeddings = math_ops.sparse_segment_sum(
339
+ recv_embeddings,
340
+ uniq_idx,
341
+ segment_ids,
342
+ num_segments=num_segments,
343
+ name='sparse_segment_sum')
344
+
345
+ embed_dim = embedding.get_shape()[-1]
346
+ output_tensor = array_ops.reshape(embeddings, [N, -1, embed_dim])
347
+
348
+ if output_tensors is not None:
349
+ outputs = array_ops.split(output_tensor, num_or_size_splits=N, axis=0)
350
+ for output, output_id in zip(outputs, output_ids):
351
+ output_tensors[output_id] = array_ops.squeeze(output, axis=0)
352
+
353
+ if batch_size is None:
354
+ batch_size = -1
355
+ return array_ops.reshape(
356
+ array_ops.transpose(output_tensor, perm=[1, 0, 2]),
357
+ [batch_size, N * embed_dim])
358
+
359
+
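In the multi-worker branch above, ids are sharded by `id % num_parts` and each worker stores row `id // num_parts` of its own sub-table before the `alltoall` exchange. A small NumPy sketch of that mapping; the worker count and ids are made up:

```python
import numpy as np

num_parts = 4                     # hypothetical number of workers
ids = np.array([0, 5, 9, 14, 6])  # global embedding ids

owner_worker = ids % num_parts    # which worker owns each id        -> [0, 1, 1, 2, 2]
row_in_shard = ids // num_parts   # row inside that worker's sub-table -> [0, 1, 2, 3, 1]
```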
360
+ def _internal_input_layer(features,
361
+ feature_columns,
362
+ weight_collections=None,
363
+ trainable=True,
364
+ cols_to_vars=None,
365
+ scope=None,
366
+ cols_to_output_tensors=None,
367
+ from_template=False,
368
+ feature_name_to_output_tensors=None,
369
+ is_training=True):
370
+ """See input_layer, `scope` is a name or variable scope to use."""
371
+ feature_columns = _normalize_feature_columns(feature_columns)
372
+ for column in feature_columns:
373
+ if not isinstance(column, _DenseColumn):
374
+ raise ValueError(
375
+ 'Items of feature_columns must be a _DenseColumn. '
376
+ 'You can wrap a categorical column with an '
377
+ 'embedding_column or indicator_column. Given: {}'.format(column))
378
+ weight_collections = list(weight_collections or [])
379
+ if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
380
+ weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
381
+ if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
382
+ weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
383
+
384
+ def _get_logits(): # pylint: disable=missing-docstring
385
+ builder = _LazyBuilder(features)
386
+ output_tensors = []
387
+
388
+ tmp_cols = feature_columns
389
+ if embedding_utils.sort_col_by_name():
390
+ logging.info('will sort columns[len=%d] by name' % len(tmp_cols))
391
+ tmp_cols = sorted(tmp_cols, key=lambda x: x.name)
392
+ for column in tmp_cols:
393
+ with variable_scope.variable_scope(
394
+ None, default_name=column._var_scope_name): # pylint: disable=protected-access
395
+ tensor = column._get_dense_tensor( # pylint: disable=protected-access
396
+ builder,
397
+ weight_collections=weight_collections,
398
+ trainable=trainable)
399
+ num_elements = column._variable_shape.num_elements() # pylint: disable=protected-access
400
+ batch_size = array_ops.shape(tensor)[0]
401
+ output_tensor = array_ops.reshape(
402
+ tensor, shape=(batch_size, num_elements))
403
+ output_tensors.append(output_tensor)
404
+ if cols_to_vars is not None:
405
+ # Retrieve any variables created (some _DenseColumn's don't create
406
+ # variables, in which case an empty list is returned).
407
+ cols_to_vars[column] = ops.get_collection(
408
+ ops.GraphKeys.GLOBAL_VARIABLES,
409
+ scope=variable_scope.get_variable_scope().name)
410
+ if cols_to_output_tensors is not None:
411
+ cols_to_output_tensors[column] = output_tensor
412
+ if feature_name_to_output_tensors is not None:
413
+ feature_name_to_output_tensors[column.raw_name] = output_tensor
414
+ return array_ops.concat(output_tensors, 1)
415
+
416
+ def _get_logits_embedding_parallel(): # pylint: disable=missing-docstring
417
+ assert hvd is not None, 'horovod is not installed'
418
+ builder = _LazyBuilder(features)
419
+
420
+ if embedding_utils.embedding_on_cpu():
421
+ embedding_device = '/cpu:0'
422
+ else:
423
+ embedding_device = '/gpu:0'
424
+
425
+ def _get_var_type(column):
426
+ if column.ev_params.use_cache:
427
+ return 'hybrid'
428
+ else:
429
+ return None
430
+
431
+ output_tensors = []
432
+ ordered_columns = []
433
+
434
+ lookup_embeddings = []
435
+ lookup_indices = None
436
+ lookup_combiners = []
437
+ lookup_cols = []
438
+ lookup_output_ids = []
439
+ lookup_wgts = []
440
+
441
+ dense_cols = []
442
+ dense_output_ids = []
443
+
444
+ shared_weights = {}
445
+ dense_cnt = 0
446
+
447
+ batch_sizes = []
448
+ for column in feature_columns:
449
+ ordered_columns.append(column)
450
+ with variable_scope.variable_scope(
451
+ None, default_name=column._var_scope_name): # pylint: disable=protected-access
452
+ # for features that do not require embedding
453
+ if 'Embedding' not in str(type(column)):
454
+ dense_cols.append(column)
455
+ dense_output_ids.append(len(output_tensors))
456
+ output_tensors.append(None)
457
+ dense_cnt += 1
458
+ continue
459
+
460
+ # for features that require embedding
461
+ num_buckets = column.categorical_column.num_buckets + hvd.size() - 1
462
+ per_worker_buckets = num_buckets // hvd.size()
463
+ embedding_shape = (per_worker_buckets, column.dimension)
464
+ if 'SharedEmbedding' in str(type(column)):
465
+ shared_name = column.shared_embedding_collection_name
466
+ if shared_name in shared_weights:
467
+ embedding_weights = shared_weights[shared_name]
468
+ else:
469
+ with ops.device(embedding_device):
470
+ if column.ev_params is not None:
471
+ assert dynamic_variable is not None, 'sok is not installed'
472
+ embedding_weights = dynamic_variable.DynamicVariable(
473
+ name='embedding_weights',
474
+ dimension=column.dimension,
475
+ initializer='random {"stddev":0.0025}', # column.initializer,
476
+ var_type=_get_var_type(column),
477
+ trainable=column.trainable and trainable,
478
+ dtype=dtypes.float32,
479
+ init_capacity=column.ev_params.init_capacity,
480
+ max_capacity=column.ev_params.max_capacity)
481
+ else:
482
+ embedding_weights = variable_scope.get_variable(
483
+ name='embedding_weights',
484
+ shape=embedding_shape,
485
+ dtype=dtypes.float32,
486
+ initializer=column.initializer,
487
+ trainable=column.trainable and trainable,
488
+ partitioner=None,
489
+ collections=weight_collections)
490
+ shared_weights[shared_name] = embedding_weights
491
+ else:
492
+ with ops.device(embedding_device):
493
+ if column.ev_params is not None:
494
+ assert dynamic_variable is not None, 'sok is not installed'
495
+ embedding_weights = dynamic_variable.DynamicVariable(
496
+ name='embedding_weights',
497
+ dimension=column.dimension,
498
+ initializer='random {"stddev":0.0025}', # column.initializer,
499
+ var_type=_get_var_type(column),
500
+ trainable=column.trainable and trainable,
501
+ dtype=dtypes.float32,
502
+ init_capacity=column.ev_params.init_capacity,
503
+ max_capacity=column.ev_params.max_capacity)
504
+ else:
505
+ embedding_weights = variable_scope.get_variable(
506
+ name='embedding_weights',
507
+ shape=embedding_shape,
508
+ dtype=dtypes.float32,
509
+ initializer=column.initializer,
510
+ trainable=column.trainable and trainable,
511
+ partitioner=None,
512
+ collections=weight_collections)
513
+ lookup_embeddings.append(embedding_weights)
514
+ output_id = len(output_tensors)
515
+ output_tensors.append(None)
516
+ lookup_output_ids.append(output_id)
517
+ lookup_cols.append(column)
518
+ lookup_combiners.append(column.combiner)
519
+
520
+ # SparseTensor / RaggedTensor inputs:
521
+ # features are not gathered into one tensor, which may cause
522
+ # performance issues
523
+ if 'sparse_fea' in features.keys():
524
+ if lookup_indices is None:
525
+ lookup_indices = {'sparse_fea': features['sparse_fea']}
526
+ elif 'ragged_ids' in features.keys():
527
+ if lookup_indices is None:
528
+ lookup_indices = {
529
+ 'ragged_ids': features['ragged_ids'],
530
+ 'ragged_lens': features['ragged_lens']
531
+ }
532
+ if 'ragged_wgts' in features:
533
+ lookup_indices['ragged_wgts'] = features['ragged_wgts']
534
+ else:
535
+ if lookup_indices is None:
536
+ lookup_indices = []
537
+ with ops.device('/cpu:0'):
538
+ sparse_tensors = column.categorical_column._get_sparse_tensors(
539
+ builder,
540
+ weight_collections=weight_collections,
541
+ trainable=trainable)
542
+ lookup_indices.append(sparse_tensors.id_tensor)
543
+ if sparse_tensors.weight_tensor is not None:
544
+ lookup_wgts.append(sparse_tensors.weight_tensor)
545
+ if cols_to_vars is not None:
546
+ cols_to_vars[column] = ops.get_collection(
547
+ ops.GraphKeys.GLOBAL_VARIABLES,
548
+ scope=variable_scope.get_variable_scope().name)
549
+
550
+ if dense_cnt > 0:
551
+ if 'dense_fea' in features:
552
+ fea_dim_s = 0
553
+ for dense_output_id, dense_col in zip(dense_output_ids, dense_cols):
554
+ fea_dim_e = fea_dim_s + dense_col.shape[0]
555
+ output_tensors[dense_output_id] = features[
556
+ 'dense_fea'][:, fea_dim_s:fea_dim_e]
557
+ fea_dim_s = fea_dim_e
558
+ batch_sizes.append(array_ops.shape(features['dense_fea'])[0])
559
+ else:
560
+ for dense_output_id, dense_col in zip(dense_output_ids, dense_cols):
561
+ output_tensors[dense_output_id] = features[dense_col.raw_name]
562
+ batch_sizes.append(array_ops.shape(output_tensors[dense_output_id])[0])
563
+
564
+ for tmp_embed_var in set(lookup_embeddings):
565
+ ops.add_to_collection(constant.EmbeddingParallel, tmp_embed_var.name)
566
+
567
+ if len(batch_sizes) == 0:
568
+ batch_size = None
569
+ else:
570
+ batch_size = batch_sizes[0]
571
+ # do embedding parallel lookup
572
+ if len(lookup_output_ids) > 0:
573
+ packed_input = ('sparse_fea' in features or 'ragged_ids' in features)
574
+ if packed_input:
575
+ uniq_embed_cnt = len(set(lookup_embeddings))
576
+ assert uniq_embed_cnt == 1, 'only one unique embedding is supported for packed inputs'
577
+ outputs = embedding_parallel_lookup(lookup_embeddings[0],
578
+ lookup_indices, lookup_output_ids,
579
+ is_training, output_tensors,
580
+ batch_size)
581
+ else:
582
+ if batch_size is None:
583
+ all_indices = []
584
+ for lookup_indice in lookup_indices:
585
+ all_indices.append(lookup_indice.indices[-1:, 0])
586
+ all_indices = array_ops.concat(all_indices, axis=0)
587
+ batch_size = math_ops.reduce_max(all_indices) + 1
588
+ # group lookup_embeddings
589
+ grouped_inputs = {}
590
+ for embedding, lookup_indice, output_id in zip(lookup_embeddings,
591
+ lookup_indices,
592
+ lookup_output_ids):
593
+ if embedding not in grouped_inputs:
594
+ grouped_inputs[embedding] = {
595
+ 'lookup_indice': [lookup_indice],
596
+ 'output_id': [output_id]
597
+ }
598
+ else:
599
+ grouped_inputs[embedding]['lookup_indice'].append(lookup_indice)
600
+ grouped_inputs[embedding]['output_id'].append(output_id)
601
+
602
+ for embedding in grouped_inputs:
603
+ lookup_indices = grouped_inputs[embedding]['lookup_indice']
604
+ output_ids = grouped_inputs[embedding]['output_id']
605
+ outputs = embedding_parallel_lookup(embedding, lookup_indices,
606
+ output_ids, is_training,
607
+ output_tensors, batch_size)
608
+
609
+ for output_tensor, col in zip(output_tensors, feature_columns):
610
+ if feature_name_to_output_tensors is not None:
611
+ feature_name_to_output_tensors[col.raw_name] = output_tensor
612
+ if cols_to_output_tensors is not None:
613
+ cols_to_output_tensors[col] = output_tensor
614
+
615
+ if packed_input and dense_cnt == 0:
616
+ return outputs
617
+ else:
618
+ return array_ops.concat(output_tensors, axis=1)
619
+ else:
620
+ for output_tensor, col in zip(output_tensors, feature_columns):
621
+ if feature_name_to_output_tensors is not None:
622
+ feature_name_to_output_tensors[col.raw_name] = output_tensor
623
+ if cols_to_output_tensors is not None:
624
+ cols_to_output_tensors[col] = output_tensor
625
+ return array_ops.concat(output_tensors, axis=1)
626
+
627
+ # If we're constructing from `make_template`, which by default adds a
628
+ # variable scope with the name of the layer. In that case, we don't want to
629
+ # add another `variable_scope` as that would break checkpoints.
630
+ if from_template:
631
+ return _get_logits()
632
+ else:
633
+ with variable_scope.variable_scope(
634
+ scope, default_name='input_layer', values=features.values()):
635
+ if embedding_utils.is_embedding_parallel():
636
+ return _get_logits_embedding_parallel()
637
+ else:
638
+ with conditional(embedding_utils.embedding_on_cpu(),
639
+ ops.device('/cpu:0')):
640
+ return _get_logits()
641
+
642
+
643
+ def input_layer(features,
644
+ feature_columns,
645
+ weight_collections=None,
646
+ trainable=True,
647
+ cols_to_vars=None,
648
+ cols_to_output_tensors=None,
649
+ feature_name_to_output_tensors=None,
650
+ is_training=True):
651
+ """Returns a dense `Tensor` as input layer based on given `feature_columns`.
652
+
653
+ Generally a single example in training data is described with FeatureColumns.
654
+ At the first layer of the model, this column-oriented data should be converted
655
+ to a single `Tensor`.
656
+
657
+ Example:
658
+
659
+ ```python
660
+ price = numeric_column('price')
661
+ keywords_embedded = embedding_column(
662
+ categorical_column_with_hash_bucket("keywords", 10K), dimensions=16)
663
+ columns = [price, keywords_embedded, ...]
664
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
665
+ dense_tensor = input_layer(features, columns)
666
+ for units in [128, 64, 32]:
667
+ dense_tensor = tf.compat.v1.layers.dense(dense_tensor, units, tf.nn.relu)
668
+ prediction = tf.compat.v1.layers.dense(dense_tensor, 1)
669
+ ```
670
+
671
+ Args:
672
+ features: A mapping from key to tensors. `_FeatureColumn`s look up via these
673
+ keys. For example `numeric_column('price')` will look at 'price' key in
674
+ this dict. Values can be a `SparseTensor` or a `Tensor` depending on the
675
+ corresponding `_FeatureColumn`.
676
+ feature_columns: An iterable containing the FeatureColumns to use as inputs
677
+ to your model. All items should be instances of classes derived from
678
+ `_DenseColumn` such as `numeric_column`, `embedding_column`,
679
+ `bucketized_column`, `indicator_column`. If you have categorical features,
680
+ you can wrap them with an `embedding_column` or `indicator_column`.
681
+ weight_collections: A list of collection names to which the Variable will be
682
+ added. Note that variables will also be added to collections
683
+ `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
684
+ trainable: If `True` also add the variable to the graph collection
685
+ `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
686
+ cols_to_vars: If not `None`, must be a dictionary that will be filled with a
687
+ mapping from `_FeatureColumn` to list of `Variable`s. For example, after
688
+ the call, we might have cols_to_vars =
689
+ {_EmbeddingColumn(
690
+ categorical_column=_HashedCategoricalColumn(
691
+ key='sparse_feature', hash_bucket_size=5, dtype=tf.string),
692
+ dimension=10): [<tf.Variable 'some_variable:0' shape=(5, 10),
693
+ <tf.Variable 'some_variable:1' shape=(5, 10)]}
694
+ If a column creates no variables, its value will be an empty list.
695
+ cols_to_output_tensors: If not `None`, must be a dictionary that will be
696
+ filled with a mapping from '_FeatureColumn' to the associated
697
+ output `Tensor`s.
698
+
699
+ Returns:
700
+ A `Tensor` which represents input layer of a model. Its shape
701
+ is (batch_size, first_layer_dimension) and its dtype is `float32`.
702
+ first_layer_dimension is determined based on given `feature_columns`.
703
+
704
+ Raises:
705
+ ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
706
+ """
707
+ return _internal_input_layer(
708
+ features,
709
+ feature_columns,
710
+ weight_collections=weight_collections,
711
+ trainable=trainable,
712
+ cols_to_vars=cols_to_vars,
713
+ cols_to_output_tensors=cols_to_output_tensors,
714
+ feature_name_to_output_tensors=feature_name_to_output_tensors,
715
+ is_training=is_training)
716
+
717
+
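Besides the concatenated result, this EasyRec variant of `input_layer` can also hand back per-column tensors through the extra dictionary arguments. A hedged sketch, where `features` and `columns` stand for a feature dict and a feature-column list prepared as in the docstring example:

```python
cols_to_out = {}   # will map each feature column to its output tensor
name_to_out = {}   # will map each column's raw feature name to the same tensor

net = input_layer(
    features,
    columns,
    cols_to_output_tensors=cols_to_out,
    feature_name_to_output_tensors=name_to_out)

# `net` is the (batch_size, total_dimension) concatenation; the two dicts expose
# each column's individual (batch_size, num_elements) slice.
```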
718
+ # TODO(akshayka): InputLayer should be a subclass of Layer, and it
719
+ # should implement the logic in input_layer using Layer's build-and-call
720
+ # paradigm; input_layer should create an instance of InputLayer and
721
+ # return the result of invoking its apply method, just as functional layers do.
722
+ class InputLayer(object):
723
+ """An object-oriented version of `input_layer` that reuses variables."""
724
+
725
+ def __init__(self,
726
+ feature_columns,
727
+ weight_collections=None,
728
+ trainable=True,
729
+ cols_to_vars=None,
730
+ name='feature_column_input_layer',
731
+ create_scope_now=True):
732
+ """See `input_layer`."""
733
+ self._feature_columns = feature_columns
734
+ self._weight_collections = weight_collections
735
+ self._trainable = trainable
736
+ self._cols_to_vars = cols_to_vars
737
+ self._name = name
738
+ self._input_layer_template = template.make_template(
739
+ self._name, _internal_input_layer, create_scope_now_=create_scope_now)
740
+ self._scope = self._input_layer_template.variable_scope
741
+
742
+ def __call__(self, features):
743
+ return self._input_layer_template(
744
+ features=features,
745
+ feature_columns=self._feature_columns,
746
+ weight_collections=self._weight_collections,
747
+ trainable=self._trainable,
748
+ cols_to_vars=None,
749
+ from_template=True)
750
+
751
+ @property
752
+ def name(self):
753
+ return self._name
754
+
755
+ @property
756
+ def non_trainable_variables(self):
757
+ return self._input_layer_template.non_trainable_variables
758
+
759
+ @property
760
+ def non_trainable_weights(self):
761
+ return self._input_layer_template.non_trainable_weights
762
+
763
+ @property
764
+ def trainable_variables(self):
765
+ return self._input_layer_template.trainable_variables
766
+
767
+ @property
768
+ def trainable_weights(self):
769
+ return self._input_layer_template.trainable_weights
770
+
771
+ @property
772
+ def variables(self):
773
+ return self._input_layer_template.variables
774
+
775
+ @property
776
+ def weights(self):
777
+ return self._input_layer_template.weights
778
+
779
+
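Because `InputLayer` wraps `_internal_input_layer` in a `make_template`, calling the same instance more than once reuses the underlying embedding variables instead of creating new ones. A minimal sketch; `columns`, `train_features` and `eval_features` are placeholders:

```python
# One InputLayer instance -> one set of embedding / weight variables.
shared_input = InputLayer(feature_columns=columns, name='shared_input_layer')

train_net = shared_input(train_features)  # variables are created on the first call
eval_net = shared_input(eval_features)    # the second call reuses the same variables
```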
780
+ def linear_model(features,
781
+ feature_columns,
782
+ units=1,
783
+ sparse_combiner='sum',
784
+ weight_collections=None,
785
+ trainable=True,
786
+ cols_to_vars=None):
787
+ """Returns a linear prediction `Tensor` based on given `feature_columns`.
788
+
789
+ This function generates a weighted sum based on output dimension `units`.
790
+ Weighted sum refers to logits in classification problems. It refers to the
791
+ prediction itself for linear regression problems.
792
+
793
+ Note on supported columns: `linear_model` treats categorical columns as
794
+ `indicator_column`s. To be specific, assume an input `SparseTensor` that looks
795
+ like:
796
+
797
+ ```python
798
+ shape = [2, 2]
799
+ {
800
+ [0, 0]: "a"
801
+ [1, 0]: "b"
802
+ [1, 1]: "c"
803
+ }
804
+ ```
805
+ `linear_model` assigns weights for the presence of "a", "b", "c" implicitly,
806
+ just like `indicator_column`, while `input_layer` explicitly requires wrapping
807
+ each of categorical columns with an `embedding_column` or an
808
+ `indicator_column`.
809
+
810
+ Example of usage:
811
+
812
+ ```python
813
+ price = numeric_column('price')
814
+ price_buckets = bucketized_column(price, boundaries=[0., 10., 100., 1000.])
815
+ keywords = categorical_column_with_hash_bucket("keywords", 10K)
816
+ keywords_price = crossed_column('keywords', price_buckets, ...)
817
+ columns = [price_buckets, keywords, keywords_price ...]
818
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
819
+ prediction = linear_model(features, columns)
820
+ ```
821
+
822
+ The `sparse_combiner` argument works as follows.
823
+ For example, for two features represented as the categorical columns:
824
+
825
+ ```python
826
+ # Feature 1
827
+
828
+ shape = [2, 2]
829
+ {
830
+ [0, 0]: "a"
831
+ [0, 1]: "b"
832
+ [1, 0]: "c"
833
+ }
834
+
835
+ # Feature 2
836
+
837
+ shape = [2, 3]
838
+ {
839
+ [0, 0]: "d"
840
+ [1, 0]: "e"
841
+ [1, 1]: "f"
842
+ [1, 2]: "f"
843
+ }
844
+ ```
845
+
846
+ with `sparse_combiner` as "mean", the linear model outputs consequently
847
+ are:
848
+
849
+ ```
850
+ y_0 = 1.0 / 2.0 * ( w_a + w_b ) + w_d + b
851
+ y_1 = w_c + 1.0 / 3.0 * ( w_e + 2.0 * w_f ) + b
852
+ ```
853
+
854
+ where `y_i` is the output, `b` is the bias, and `w_x` is the weight
855
+ assigned to the presence of `x` in the input features.
856
+
857
+ Args:
858
+ features: A mapping from key to tensors. `_FeatureColumn`s look up via these
859
+ keys. For example `numeric_column('price')` will look at 'price' key in
860
+ this dict. Values are `Tensor` or `SparseTensor` depending on
861
+ corresponding `_FeatureColumn`.
862
+ feature_columns: An iterable containing the FeatureColumns to use as inputs
863
+ to your model. All items should be instances of classes derived from
864
+ `_FeatureColumn`s.
865
+ units: An integer, dimensionality of the output space. Default value is 1.
866
+ sparse_combiner: A string specifying how to reduce if a categorical column
867
+ is multivalent. Except `numeric_column`, almost all columns passed to
868
+ `linear_model` are considered as categorical columns. It combines each
869
+ categorical column independently. Currently "mean", "sqrtn" and "sum" are
870
+ supported, with "sum" the default for linear model. "sqrtn" often achieves
871
+ good accuracy, in particular with bag-of-words columns.
872
+ * "sum": do not normalize features in the column
873
+ * "mean": do l1 normalization on features in the column
874
+ * "sqrtn": do l2 normalization on features in the column
875
+ weight_collections: A list of collection names to which the Variable will be
876
+ added. Note that variables will also be added to collections
877
+ `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
878
+ trainable: If `True` also add the variable to the graph collection
879
+ `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
880
+ cols_to_vars: If not `None`, must be a dictionary that will be filled with a
881
+ mapping from `_FeatureColumn` to associated list of `Variable`s. For
882
+ example, after the call, we might have cols_to_vars = {
883
+ _NumericColumn(
884
+ key='numeric_feature1', shape=(1,):
885
+ [<tf.Variable 'linear_model/price2/weights:0' shape=(1, 1)>],
886
+ 'bias': [<tf.Variable 'linear_model/bias_weights:0' shape=(1,)>],
887
+ _NumericColumn(
888
+ key='numeric_feature2', shape=(2,)):
889
+ [<tf.Variable 'linear_model/price1/weights:0' shape=(2, 1)>]}
890
+ If a column creates no variables, its value will be an empty list. Note
891
+ that cols_to_vars will also contain a string key 'bias' that maps to a
892
+ list of Variables.
893
+
894
+ Returns:
895
+ A `Tensor` which represents predictions/logits of a linear model. Its shape
896
+ is (batch_size, units) and its dtype is `float32`.
897
+
898
+ Raises:
899
+ ValueError: if an item in `feature_columns` is neither a `_DenseColumn`
900
+ nor `_CategoricalColumn`.
901
+ """
902
+ with variable_scope.variable_scope(None, 'linear_model') as vs:
903
+ model_name = _strip_leading_slashes(vs.name)
904
+ linear_model_layer = _LinearModel(
905
+ feature_columns=feature_columns,
906
+ units=units,
907
+ sparse_combiner=sparse_combiner,
908
+ weight_collections=weight_collections,
909
+ trainable=trainable,
910
+ name=model_name)
911
+ retval = linear_model_layer(features) # pylint: disable=not-callable
912
+ if cols_to_vars is not None:
913
+ cols_to_vars.update(linear_model_layer.cols_to_vars())
914
+ return retval
915
+
916
+
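The `sparse_combiner='mean'` formulas in the docstring above can be checked by hand. A short sketch with made-up weights for the two-feature example (the values of `w_a` .. `w_f` and the bias are arbitrary):

```python
# Made-up weights for the ids "a".."f" and a bias term.
w = {'a': 0.1, 'b': 0.3, 'c': 0.5, 'd': 0.2, 'e': 0.4, 'f': 0.6}
b = 0.05

# Row 0: feature 1 holds {a, b}, feature 2 holds {d}.
y_0 = (w['a'] + w['b']) / 2.0 + w['d'] + b        # = 0.45
# Row 1: feature 1 holds {c}, feature 2 holds {e, f, f}.
y_1 = w['c'] + (w['e'] + 2.0 * w['f']) / 3.0 + b  # ~= 1.083
```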
917
+ def _add_to_collections(var, weight_collections):
918
+ """Adds a var to the list of weight_collections provided.
919
+
920
+ Handles the case for partitioned and non-partitioned variables.
921
+
922
+ Args:
923
+ var: A variable or Partitioned Variable.
924
+ weight_collections: List of collections to add variable to.
925
+ """
926
+ for weight_collection in weight_collections:
927
+ # The layer self.add_variable call already adds it to GLOBAL_VARIABLES.
928
+ if weight_collection == ops.GraphKeys.GLOBAL_VARIABLES:
929
+ continue
930
+ # TODO(rohanj): Explore adding a _get_variable_list method on `Variable`
931
+ # so that we don't have to do this check.
932
+ if isinstance(var, variables.PartitionedVariable):
933
+ for constituent_var in list(var):
934
+ ops.add_to_collection(weight_collection, constituent_var)
935
+ else:
936
+ ops.add_to_collection(weight_collection, var)
937
+
938
+
939
+ class _FCLinearWrapper(base.Layer):
940
+ """Wraps a _FeatureColumn in a layer for use in a linear model.
941
+
942
+ See `linear_model` above.
943
+ """
944
+
945
+ def __init__(self,
946
+ feature_column,
947
+ units=1,
948
+ sparse_combiner='sum',
949
+ weight_collections=None,
950
+ trainable=True,
951
+ name=None,
952
+ **kwargs):
953
+ super(_FCLinearWrapper, self).__init__(
954
+ trainable=trainable, name=name, **kwargs)
955
+ self._feature_column = feature_column
956
+ self._units = units
957
+ self._sparse_combiner = sparse_combiner
958
+ self._weight_collections = weight_collections
959
+
960
+ def build(self, _):
961
+ if isinstance(self._feature_column, _CategoricalColumn):
962
+ weight = self.add_variable(
963
+ name='weights',
964
+ shape=(self._feature_column._num_buckets, self._units), # pylint: disable=protected-access
965
+ initializer=init_ops.zeros_initializer(),
966
+ trainable=self.trainable)
967
+ else:
968
+ num_elements = self._feature_column._variable_shape.num_elements() # pylint: disable=protected-access
969
+ weight = self.add_variable(
970
+ name='weights',
971
+ shape=[num_elements, self._units],
972
+ initializer=init_ops.zeros_initializer(),
973
+ trainable=self.trainable)
974
+ _add_to_collections(weight, self._weight_collections)
975
+ self._weight_var = weight
976
+ self.built = True
977
+
978
+ def call(self, builder):
979
+ weighted_sum = _create_weighted_sum(
980
+ column=self._feature_column,
981
+ builder=builder,
982
+ units=self._units,
983
+ sparse_combiner=self._sparse_combiner,
984
+ weight_collections=self._weight_collections,
985
+ trainable=self.trainable,
986
+ weight_var=self._weight_var)
987
+ return weighted_sum
988
+
989
+
990
+ class _BiasLayer(base.Layer):
991
+ """A layer for the bias term."""
992
+
993
+ def __init__(self,
994
+ units=1,
995
+ trainable=True,
996
+ weight_collections=None,
997
+ name=None,
998
+ **kwargs):
999
+ super(_BiasLayer, self).__init__(trainable=trainable, name=name, **kwargs)
1000
+ self._units = units
1001
+ self._weight_collections = weight_collections
1002
+
1003
+ def build(self, _):
1004
+ self._bias_variable = self.add_variable(
1005
+ 'bias_weights',
1006
+ shape=[self._units],
1007
+ initializer=init_ops.zeros_initializer(),
1008
+ trainable=self.trainable)
1009
+ _add_to_collections(self._bias_variable, self._weight_collections)
1010
+ self.built = True
1011
+
1012
+ def call(self, _):
1013
+ return self._bias_variable
1014
+
1015
+
1016
+ def _get_expanded_variable_list(variable):
1017
+ if (isinstance(variable, variables.Variable) or
1018
+ resource_variable_ops.is_resource_variable(variable)):
1019
+ return [variable] # Single variable case.
1020
+ else: # Must be a PartitionedVariable, so convert into a list.
1021
+ return list(variable)
1022
+
1023
+
1024
+ def _strip_leading_slashes(name):
1025
+ return name.rsplit('/', 1)[-1]
1026
+
1027
+
1028
+ class _LinearModel(training.Model):
1029
+ """Creates a linear model using feature columns.
1030
+
1031
+ See `linear_model` for details.
1032
+ """
1033
+
1034
+ def __init__(self,
1035
+ feature_columns,
1036
+ units=1,
1037
+ sparse_combiner='sum',
1038
+ weight_collections=None,
1039
+ trainable=True,
1040
+ name=None,
1041
+ **kwargs):
1042
+ super(_LinearModel, self).__init__(name=name, **kwargs)
1043
+ self._feature_columns = _normalize_feature_columns(feature_columns)
1044
+ self._weight_collections = list(weight_collections or [])
1045
+ if ops.GraphKeys.GLOBAL_VARIABLES not in self._weight_collections:
1046
+ self._weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
1047
+ if ops.GraphKeys.MODEL_VARIABLES not in self._weight_collections:
1048
+ self._weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
1049
+
1050
+ column_layers = {}
1051
+ for column in sorted(self._feature_columns, key=lambda x: x.name):
1052
+ with variable_scope.variable_scope(
1053
+ None, default_name=column._var_scope_name) as vs: # pylint: disable=protected-access
1054
+ # Having the fully expressed variable scope name ends up doubly
1055
+ # expressing the outer scope (scope with which this method was called)
1056
+ # in the name of the variable that would get created.
1057
+ column_name = _strip_leading_slashes(vs.name)
1058
+ column_layer = _FCLinearWrapper(column, units, sparse_combiner,
1059
+ self._weight_collections, trainable,
1060
+ column_name, **kwargs)
1061
+ column_layers[column_name] = column_layer
1062
+ self._column_layers = self._add_layers(column_layers)
1063
+ self._bias_layer = _BiasLayer(
1064
+ units=units,
1065
+ trainable=trainable,
1066
+ weight_collections=self._weight_collections,
1067
+ name='bias_layer',
1068
+ **kwargs)
1069
+ self._cols_to_vars = {}
1070
+
1071
+ def cols_to_vars(self):
1072
+ """Returns a dict mapping _FeatureColumns to variables.
1073
+
1074
+ See `linear_model` for more information.
1075
+ This is not populated until `call` is invoked, i.e. until the layer is built.
1076
+ """
1077
+ return self._cols_to_vars
1078
+
1079
+ def call(self, features):
1080
+ with variable_scope.variable_scope(self.name):
1081
+ for column in self._feature_columns:
1082
+ if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
1083
+ raise ValueError(
1084
+ 'Items of feature_columns must be either a '
1085
+ '_DenseColumn or _CategoricalColumn. Given: {}'.format(column))
1086
+ weighted_sums = []
1087
+ ordered_columns = []
1088
+ builder = _LazyBuilder(features)
1089
+ for layer in sorted(self._column_layers.values(), key=lambda x: x.name):
1090
+ column = layer._feature_column # pylint: disable=protected-access
1091
+ ordered_columns.append(column)
1092
+ weighted_sum = layer(builder)
1093
+ weighted_sums.append(weighted_sum)
1094
+ self._cols_to_vars[column] = ops.get_collection(
1095
+ ops.GraphKeys.GLOBAL_VARIABLES, scope=layer.scope_name)
1096
+
1097
+ _verify_static_batch_size_equality(weighted_sums, ordered_columns)
1098
+ predictions_no_bias = math_ops.add_n(
1099
+ weighted_sums, name='weighted_sum_no_bias')
1100
+ predictions = nn_ops.bias_add(
1101
+ predictions_no_bias,
1102
+ self._bias_layer( # pylint: disable=not-callable
1103
+ builder,
1104
+ scope=variable_scope.get_variable_scope()), # pylint: disable=not-callable
1105
+ name='weighted_sum')
1106
+ bias = self._bias_layer.variables[0]
1107
+ self._cols_to_vars['bias'] = _get_expanded_variable_list(bias)
1108
+ return predictions
1109
+
1110
+ def _add_layers(self, layers):
1111
+ # "Magic" required for keras.Model classes to track all the variables in
1112
+ # a list of layers.Layer objects.
1113
+ # TODO(ashankar): Figure out API so user code doesn't have to do this.
1114
+ for name, layer in layers.items():
1115
+ setattr(self, 'layer-%s' % name, layer)
1116
+ return layers
1117
+
1118
+
1119
+ def _transform_features(features, feature_columns):
1120
+ """Returns transformed features based on features columns passed in.
1121
+
1122
+ Note that you will most likely not need to use this function directly. Please
1122
+ check `input_layer` and `linear_model` to see whether they
1123
+ satisfy your use case.
1125
+
1126
+ Example:
1127
+
1128
+ ```python
1129
+ # Define features and transformations
1130
+ crosses_a_x_b = crossed_column(
1131
+ columns=["sparse_feature_a", "sparse_feature_b"], hash_bucket_size=10000)
1132
+ price_buckets = bucketized_column(
1133
+ source_column=numeric_column("price"), boundaries=[...])
1134
+
1135
+ columns = [crosses_a_x_b, price_buckets]
1136
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1137
+ transformed = transform_features(features=features, feature_columns=columns)
1138
+
1139
+ assertCountEqual(columns, transformed.keys())
1140
+ ```
1141
+
1142
+ Args:
1143
+ features: A mapping from key to tensors. `_FeatureColumn`s look up via these
1144
+ keys. For example `numeric_column('price')` will look at 'price' key in
1145
+ this dict. Values can be a `SparseTensor` or a `Tensor`, depending on the
1146
+ corresponding `_FeatureColumn`.
1147
+ feature_columns: An iterable containing all the `_FeatureColumn`s.
1148
+
1149
+ Returns:
1150
+ A `dict` mapping `_FeatureColumn` to `Tensor` and `SparseTensor` values.
1151
+ """
1152
+ feature_columns = _normalize_feature_columns(feature_columns)
1153
+ outputs = {}
1154
+ with ops.name_scope(
1155
+ None, default_name='transform_features', values=features.values()):
1156
+ builder = _LazyBuilder(features)
1157
+ for column in sorted(feature_columns, key=lambda x: x.name):
1158
+ with ops.name_scope(None, default_name=column.name):
1159
+ outputs[column] = builder.get(column)
1160
+ return outputs
1161
+
1162
+
1163
+ def make_parse_example_spec(feature_columns):
1164
+ """Creates parsing spec dictionary from input feature_columns.
1165
+
1166
+ The returned dictionary can be used as arg 'features' in
1167
+ `tf.io.parse_example`.
1168
+
1169
+ Typical usage example:
1170
+
1171
+ ```python
1172
+ # Define features and transformations
1173
+ feature_a = categorical_column_with_vocabulary_file(...)
1174
+ feature_b = numeric_column(...)
1175
+ feature_c_bucketized = bucketized_column(numeric_column("feature_c"), ...)
1176
+ feature_a_x_feature_c = crossed_column(
1177
+ columns=["feature_a", feature_c_bucketized], ...)
1178
+
1179
+ feature_columns = set(
1180
+ [feature_b, feature_c_bucketized, feature_a_x_feature_c])
1181
+ features = tf.io.parse_example(
1182
+ serialized=serialized_examples,
1183
+ features=make_parse_example_spec(feature_columns))
1184
+ ```
1185
+
1186
+ For the above example, make_parse_example_spec would return the dict:
1187
+
1188
+ ```python
1189
+ {
1190
+ "feature_a": parsing_ops.VarLenFeature(tf.string),
1191
+ "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32),
1192
+ "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32)
1193
+ }
1194
+ ```
1195
+
1196
+ Args:
1197
+ feature_columns: An iterable containing all feature columns. All items
1198
+ should be instances of classes derived from `_FeatureColumn`.
1199
+
1200
+ Returns:
1201
+ A dict mapping each feature key to a `FixedLenFeature` or `VarLenFeature`
1202
+ value.
1203
+
1204
+ Raises:
1205
+ ValueError: If any of the given `feature_columns` is not a `_FeatureColumn`
1206
+ instance.
1207
+ """
1208
+ result = {}
1209
+ for column in feature_columns:
1210
+ if not isinstance(column, _FeatureColumn):
1211
+ raise ValueError('All feature_columns must be _FeatureColumn instances. '
1212
+ 'Given: {}'.format(column))
1213
+ config = column._parse_example_spec # pylint: disable=protected-access
1214
+ for key, value in six.iteritems(config):
1215
+ if key in result and value != result[key]:
1216
+ raise ValueError('feature_columns contain different parse_spec for key '
1217
+ '{}. Given {} and {}'.format(key, value, result[key]))
1218
+ result.update(config)
1219
+ return result
1220
+
1221
+
1222
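As a small illustration of the key-merging rule described above: a numeric column and a bucketized column built on top of it share the 'price' key and contribute identical specs, so the merged dict contains a single entry for it. A sketch against the public `tf.feature_column` API (TF 1.x assumed; feature names are illustrative):

```python
import tensorflow as tf

price = tf.feature_column.numeric_column('price')
bucketized_price = tf.feature_column.bucketized_column(price, boundaries=[0., 10.])
keywords = tf.feature_column.categorical_column_with_hash_bucket('keywords', 100)

spec = tf.feature_column.make_parse_example_spec(
    [price, bucketized_price, keywords])
# spec is roughly:
#   {'price': FixedLenFeature([1], tf.float32),
#    'keywords': VarLenFeature(tf.string)}
# Both price-based columns contribute the same 'price' spec, so they merge;
# columns with conflicting specs for one key would raise ValueError instead.
```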
+ def _embedding_column(categorical_column,
1223
+ dimension,
1224
+ combiner='mean',
1225
+ initializer=None,
1226
+ ckpt_to_load_from=None,
1227
+ tensor_name_in_ckpt=None,
1228
+ max_norm=None,
1229
+ trainable=True):
1230
+ """`_DenseColumn` that converts from sparse, categorical input.
1231
+
1232
+ Use this when your inputs are sparse, but you want to convert them to a dense
1233
+ representation (e.g., to feed to a DNN).
1234
+
1235
+ Inputs must be a `_CategoricalColumn` created by any of the
1236
+ `categorical_column_*` function. Here is an example of using
1237
+ `embedding_column` with `DNNClassifier`:
1238
+
1239
+ ```python
1240
+ video_id = categorical_column_with_identity(
1241
+ key='video_id', num_buckets=1000000, default_value=0)
1242
+ columns = [embedding_column(video_id, 9),...]
1243
+
1244
+ estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)
1245
+
1246
+ label_column = ...
1247
+ def input_fn():
1248
+ features = tf.io.parse_example(
1249
+ ..., features=make_parse_example_spec(columns + [label_column]))
1250
+ labels = features.pop(label_column.name)
1251
+ return features, labels
1252
+
1253
+ estimator.train(input_fn=input_fn, steps=100)
1254
+ ```
1255
+
1256
+ Here is an example using `embedding_column` with model_fn:
1257
+
1258
+ ```python
1259
+ def model_fn(features, ...):
1260
+ video_id = categorical_column_with_identity(
1261
+ key='video_id', num_buckets=1000000, default_value=0)
1262
+ columns = [embedding_column(video_id, 9),...]
1263
+ dense_tensor = input_layer(features, columns)
1264
+ # Form DNN layers, calculate loss, and return EstimatorSpec.
1265
+ ...
1266
+ ```
1267
+
1268
+ Args:
1269
+ categorical_column: A `_CategoricalColumn` created by a
1270
+ `categorical_column_with_*` function. This column produces the sparse IDs
1271
+ that are inputs to the embedding lookup.
1272
+ dimension: An integer specifying dimension of the embedding, must be > 0.
1273
+ combiner: A string specifying how to reduce if there are multiple entries
1274
+ in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
1275
+ 'mean' the default. 'sqrtn' often achieves good accuracy, in particular
1276
+ with bag-of-words columns. Each of these can be thought of as an example-level
1277
+ normalization on the column. For more information, see
1278
+ `tf.nn.embedding_lookup_sparse`.
1279
+ initializer: A variable initializer function to be used in embedding
1280
+ variable initialization. If not specified, defaults to
1281
+ `tf.compat.v1.truncated_normal_initializer` with mean `0.0` and
1282
+ standard deviation `0.01/sqrt(dimension)` (matching the default initializer below).
1283
+ ckpt_to_load_from: String representing checkpoint name/pattern from which to
1284
+ restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
1285
+ tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
1286
+ which to restore the column weights. Required if `ckpt_to_load_from` is
1287
+ not `None`.
1288
+ max_norm: If not `None`, embedding values are l2-normalized to this value.
1289
+ trainable: Whether or not the embedding is trainable. Default is True.
1290
+
1291
+ Returns:
1292
+ `_DenseColumn` that converts from sparse input.
1293
+
1294
+ Raises:
1295
+ ValueError: if `dimension` not > 0.
1296
+ ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt`
1297
+ is specified.
1298
+ ValueError: if `initializer` is specified and is not callable.
1299
+ RuntimeError: If eager execution is enabled.
1300
+ """
1301
+ if (dimension is None) or (dimension < 1):
1302
+ raise ValueError('Invalid dimension {}.'.format(dimension))
1303
+ if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
1304
+ raise ValueError('Must specify both `ckpt_to_load_from` and '
1305
+ '`tensor_name_in_ckpt` or none of them.')
1306
+
1307
+ if (initializer is not None) and (not callable(initializer)):
1308
+ raise ValueError('initializer must be callable if specified. '
1309
+ 'Embedding of column_name: {}'.format(
1310
+ categorical_column.name))
1311
+ if initializer is None:
1312
+ initializer = init_ops.truncated_normal_initializer(
1313
+ mean=0.0, stddev=0.01 / math.sqrt(dimension))
1314
+
1315
+ embedding_shape = categorical_column._num_buckets, dimension # pylint: disable=protected-access
1316
+
1317
+ def _creator(weight_collections, scope):
1318
+ embedding_column_layer = _EmbeddingColumnLayer(
1319
+ embedding_shape=embedding_shape,
1320
+ initializer=initializer,
1321
+ weight_collections=weight_collections,
1322
+ trainable=trainable,
1323
+ name='embedding_column_layer')
1324
+ return embedding_column_layer(None, scope=scope) # pylint: disable=not-callable
1325
+
1326
+ return _EmbeddingColumn(
1327
+ categorical_column=categorical_column,
1328
+ dimension=dimension,
1329
+ combiner=combiner,
1330
+ layer_creator=_creator,
1331
+ ckpt_to_load_from=ckpt_to_load_from,
1332
+ tensor_name_in_ckpt=tensor_name_in_ckpt,
1333
+ max_norm=max_norm,
1334
+ trainable=trainable)
1335
+
1336
+
1337
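A short sketch of the `combiner` and `initializer` knobs documented above, written against the public `tf.feature_column` equivalents (TF 1.x assumed; the column names, bucket count, and input values are illustrative; the explicit initializer mirrors the `0.01/sqrt(dimension)` default used by this vendored copy for `dimension=9`):

```python
import tensorflow as tf

video_id = tf.feature_column.categorical_column_with_identity(
    key='video_id', num_buckets=1000000, default_value=0)
video_emb = tf.feature_column.embedding_column(
    video_id,
    dimension=9,
    combiner='sqrtn',  # per-example normalization of multi-valued rows
    initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.01 / 3.0))
dense_tensor = tf.feature_column.input_layer(
    {'video_id': tf.constant([[3, 7]])}, [video_emb])  # shape: (1, 9)
```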
+ def _numeric_column(key,
1338
+ shape=(1,),
1339
+ default_value=None,
1340
+ dtype=dtypes.float32,
1341
+ normalizer_fn=None):
1342
+ """Represents real valued or numerical features.
1343
+
1344
+ Example:
1345
+
1346
+ ```python
1347
+ price = numeric_column('price')
1348
+ columns = [price, ...]
1349
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1350
+ dense_tensor = input_layer(features, columns)
1351
+
1352
+ # or
1353
+ bucketized_price = bucketized_column(price, boundaries=[...])
1354
+ columns = [bucketized_price, ...]
1355
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1356
+ linear_prediction = linear_model(features, columns)
1357
+ ```
1358
+
1359
+ Args:
1360
+ key: A unique string identifying the input feature. It is used as the
1361
+ column name and the dictionary key for feature parsing configs, feature
1362
+ `Tensor` objects, and feature columns.
1363
+ shape: An iterable of integers specifying the shape of the `Tensor`. An
1364
+ integer can be given which means a single dimension `Tensor` with given
1365
+ width. The `Tensor` representing the column will have the shape of
1366
+ [batch_size] + `shape`.
1367
+ default_value: A single value compatible with `dtype` or an iterable of
1368
+ values compatible with `dtype` which the column takes on during
1369
+ `tf.Example` parsing if data is missing. A default value of `None` will
1370
+ cause `tf.io.parse_example` to fail if an example does not contain this
1371
+ column. If a single value is provided, the same value will be applied as
1372
+ the default value for every item. If an iterable of values is provided,
1373
+ the shape of the `default_value` should be equal to the given `shape`.
1374
+ dtype: defines the type of values. Default value is `tf.float32`. Must be a
1375
+ non-quantized, real integer or floating point type.
1376
+ normalizer_fn: If not `None`, a function that can be used to normalize the
1377
+ value of the tensor after `default_value` is applied for parsing.
1378
+ Normalizer function takes the input `Tensor` as its argument, and returns
1379
+ the output `Tensor`. (e.g. lambda x: (x - 3.0) / 4.2). Please note that
1380
+ even though the most common use case of this function is normalization, it
1381
+ can be used for any kind of TensorFlow transformation.
1382
+
1383
+ Returns:
1384
+ A `_NumericColumn`.
1385
+
1386
+ Raises:
1387
+ TypeError: if any dimension in shape is not an int
1388
+ ValueError: if any dimension in shape is not a positive integer
1389
+ TypeError: if `default_value` is an iterable but not compatible with `shape`
1390
+ TypeError: if `default_value` is not compatible with `dtype`.
1391
+ ValueError: if `dtype` is not convertible to `tf.float32`.
1392
+ """
1393
+ shape = _check_shape(shape, key)
1394
+ if not (dtype.is_integer or dtype.is_floating):
1395
+ raise ValueError('dtype must be convertible to float. '
1396
+ 'dtype: {}, key: {}'.format(dtype, key))
1397
+ default_value = fc_utils.check_default_value(shape, default_value, dtype, key)
1398
+
1399
+ if normalizer_fn is not None and not callable(normalizer_fn):
1400
+ raise TypeError(
1401
+ 'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))
1402
+
1403
+ fc_utils.assert_key_is_string(key)
1404
+ return _NumericColumn(
1405
+ key,
1406
+ shape=shape,
1407
+ default_value=default_value,
1408
+ dtype=dtype,
1409
+ normalizer_fn=normalizer_fn)
1410
+
1411
+
1412
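A minimal sketch of `normalizer_fn` in action, using the public `tf.feature_column` API (TF 1.x assumed; the constant feature value is made up for illustration):

```python
import tensorflow as tf

price = tf.feature_column.numeric_column(
    'price', shape=(1,), default_value=0.0,
    normalizer_fn=lambda x: (x - 3.0) / 4.2)
dense_tensor = tf.feature_column.input_layer(
    {'price': tf.constant([[7.2]])}, [price])
# dense_tensor evaluates to [[1.0]] because (7.2 - 3.0) / 4.2 == 1.0.
```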
+ def _bucketized_column(source_column, boundaries):
1413
+ """Represents discretized dense input.
1414
+
1415
+ Buckets include the left boundary, and exclude the right boundary. Namely,
1416
+ `boundaries=[0., 1., 2.]` generates buckets `(-inf, 0.)`, `[0., 1.)`,
1417
+ `[1., 2.)`, and `[2., +inf)`.
1418
+
1419
+ For example, if the inputs are
1420
+
1421
+ ```python
1422
+ boundaries = [0, 10, 100]
1423
+ input tensor = [[-5, 10000]
1424
+ [150, 10]
1425
+ [5, 100]]
1426
+ ```
1427
+
1428
+ then the output will be
1429
+
1430
+ ```python
1431
+ output = [[0, 3]
1432
+ [3, 2]
1433
+ [1, 3]]
1434
+ ```
1435
+
1436
+ Example:
1437
+
1438
+ ```python
1439
+ price = numeric_column('price')
1440
+ bucketized_price = bucketized_column(price, boundaries=[...])
1441
+ columns = [bucketized_price, ...]
1442
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1443
+ linear_prediction = linear_model(features, columns)
1444
+
1445
+ # or
1446
+ columns = [bucketized_price, ...]
1447
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1448
+ dense_tensor = input_layer(features, columns)
1449
+ ```
1450
+
1451
+ `bucketized_column` can also be crossed with another categorical column using
1452
+ `crossed_column`:
1453
+
1454
+ ```python
1455
+ price = numeric_column('price')
1456
+ # bucketized_column converts numerical feature to a categorical one.
1457
+ bucketized_price = bucketized_column(price, boundaries=[...])
1458
+ # 'keywords' is a string feature.
1459
+ price_x_keywords = crossed_column([bucketized_price, 'keywords'], 50K)
1460
+ columns = [price_x_keywords, ...]
1461
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1462
+ linear_prediction = linear_model(features, columns)
1463
+ ```
1464
+
1465
+ Args:
1466
+ source_column: A one-dimensional dense column which is generated with
1467
+ `numeric_column`.
1468
+ boundaries: A sorted list or tuple of floats specifying the boundaries.
1469
+
1470
+ Returns:
1471
+ A `_BucketizedColumn`.
1472
+
1473
+ Raises:
1474
+ ValueError: If `source_column` is not a numeric column, or if it is not
1475
+ one-dimensional.
1476
+ ValueError: If `boundaries` is not a sorted list or tuple.
1477
+ """
1478
+ if not isinstance(source_column, _NumericColumn):
1479
+ raise ValueError(
1480
+ 'source_column must be a column generated with numeric_column(). '
1481
+ 'Given: {}'.format(source_column))
1482
+ if len(source_column.shape) > 1:
1483
+ raise ValueError('source_column must be one-dimensional column. '
1484
+ 'Given: {}'.format(source_column))
1485
+ if (not boundaries or
1486
+ not (isinstance(boundaries, list) or isinstance(boundaries, tuple))):
1487
+ raise ValueError('boundaries must be a sorted list.')
1488
+ for i in range(len(boundaries) - 1):
1489
+ if boundaries[i] >= boundaries[i + 1]:
1490
+ raise ValueError('boundaries must be a sorted list.')
1491
+ return _BucketizedColumn(source_column, tuple(boundaries))
1492
+
1493
+
1494
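The bucket ids from the worked example above can be reproduced roughly as follows with the public API (TF 1.x assumed; as a dense column, each bucket id becomes a one-hot vector over the four buckets):

```python
import tensorflow as tf

price = tf.feature_column.numeric_column('price', shape=(2,))
bucketized_price = tf.feature_column.bucketized_column(
    price, boundaries=[0., 10., 100.])
features = {'price': tf.constant([[-5., 10000.], [150., 10.], [5., 100.]])}
# Bucket ids are [[0, 3], [3, 2], [1, 3]]; input_layer one-hot encodes each of
# the two elements over 4 buckets, giving a dense tensor of shape (3, 8).
dense_tensor = tf.feature_column.input_layer(features, [bucketized_price])
```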
+ def _categorical_column_with_hash_bucket(key,
1495
+ hash_bucket_size,
1496
+ dtype=dtypes.string):
1497
+ """Represents sparse feature where ids are set by hashing.
1498
+
1499
+ Use this when your sparse features are in string or integer format, and you
1500
+ want to distribute your inputs into a finite number of buckets by hashing.
1501
+ output_id = Hash(input_feature_string) % bucket_size for string type input.
1502
+ For int type input, the value is converted to its string representation first
1503
+ and then hashed by the same formula.
1504
+
1505
+ For input dictionary `features`, `features[key]` is either `Tensor` or
1506
+ `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
1507
+ and `''` for string, which will be dropped by this feature column.
1508
+
1509
+ Example:
1510
+
1511
+ ```python
1512
+ keywords = categorical_column_with_hash_bucket("keywords", 10K)
1513
+ columns = [keywords, ...]
1514
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1515
+ linear_prediction = linear_model(features, columns)
1516
+
1517
+ # or
1518
+ keywords_embedded = embedding_column(keywords, 16)
1519
+ columns = [keywords_embedded, ...]
1520
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1521
+ dense_tensor = input_layer(features, columns)
1522
+ ```
1523
+
1524
+ Args:
1525
+ key: A unique string identifying the input feature. It is used as the
1526
+ column name and the dictionary key for feature parsing configs, feature
1527
+ `Tensor` objects, and feature columns.
1528
+ hash_bucket_size: An int > 1. The number of buckets.
1529
+ dtype: The type of features. Only string and integer types are supported.
1530
+
1531
+ Returns:
1532
+ A `_HashedCategoricalColumn`.
1533
+
1534
+ Raises:
1535
+ ValueError: `hash_bucket_size` is not greater than 1.
1536
+ ValueError: `dtype` is neither string nor integer.
1537
+ """
1538
+ if hash_bucket_size is None:
1539
+ raise ValueError('hash_bucket_size must be set. ' 'key: {}'.format(key))
1540
+
1541
+ if hash_bucket_size < 1:
1542
+ raise ValueError('hash_bucket_size must be at least 1. '
1543
+ 'hash_bucket_size: {}, key: {}'.format(
1544
+ hash_bucket_size, key))
1545
+
1546
+ fc_utils.assert_key_is_string(key)
1547
+ fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
1548
+
1549
+ return _HashedCategoricalColumn(key, hash_bucket_size, dtype)
1550
+
1551
+
1552
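Both string and integer inputs can be hashed; as described above, integers are first converted to their string representation. A small sketch with the public API (TF 1.x assumed; feature keys and values are illustrative):

```python
import tensorflow as tf

keywords = tf.feature_column.categorical_column_with_hash_bucket('keywords', 1000)
user_id = tf.feature_column.categorical_column_with_hash_bucket(
    'user_id', hash_bucket_size=500, dtype=tf.int64)
# Each value is hashed into its column's bucket range before the linear weights
# are looked up.
logits = tf.feature_column.linear_model(
    {'keywords': tf.constant([['tennis']]), 'user_id': tf.constant([[42]])},
    [keywords, user_id])
```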
+ def _categorical_column_with_vocabulary_file(key,
1553
+ vocabulary_file,
1554
+ vocabulary_size=None,
1555
+ num_oov_buckets=0,
1556
+ default_value=None,
1557
+ dtype=dtypes.string):
1558
+ """A `_CategoricalColumn` with a vocabulary file.
1559
+
1560
+ Use this when your inputs are in string or integer format, and you have a
1561
+ vocabulary file that maps each value to an integer ID. By default,
1562
+ out-of-vocabulary values are ignored. Use either (but not both) of
1563
+ `num_oov_buckets` and `default_value` to specify how to include
1564
+ out-of-vocabulary values.
1565
+
1566
+ For input dictionary `features`, `features[key]` is either `Tensor` or
1567
+ `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
1568
+ and `''` for string, which will be dropped by this feature column.
1569
+
1570
+ Example with `num_oov_buckets`:
1571
+ File '/us/states.txt' contains 50 lines, each with a 2-character U.S. state
1572
+ abbreviation. All inputs with values in that file are assigned an ID 0-49,
1573
+ corresponding to its line number. All other values are hashed and assigned an
1574
+ ID 50-54.
1575
+
1576
+ ```python
1577
+ states = categorical_column_with_vocabulary_file(
1578
+ key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
1579
+ num_oov_buckets=5)
1580
+ columns = [states, ...]
1581
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1582
+ linear_prediction = linear_model(features, columns)
1583
+ ```
1584
+
1585
+ Example with `default_value`:
1586
+ File '/us/states.txt' contains 51 lines - the first line is 'XX', and the
1587
+ other 50 each have a 2-character U.S. state abbreviation. Both a literal 'XX'
1588
+ in input, and other values missing from the file, will be assigned ID 0. All
1589
+ others are assigned the corresponding line number 1-50.
1590
+
1591
+ ```python
1592
+ states = categorical_column_with_vocabulary_file(
1593
+ key='states', vocabulary_file='/us/states.txt', vocabulary_size=51,
1594
+ default_value=0)
1595
+ columns = [states, ...]
1596
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1597
+ linear_prediction, _, _ = linear_model(features, columns)
1598
+ ```
1599
+
1600
+ And to make an embedding with either:
1601
+
1602
+ ```python
1603
+ columns = [embedding_column(states, 3),...]
1604
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1605
+ dense_tensor = input_layer(features, columns)
1606
+ ```
1607
+
1608
+ Args:
1609
+ key: A unique string identifying the input feature. It is used as the
1610
+ column name and the dictionary key for feature parsing configs, feature
1611
+ `Tensor` objects, and feature columns.
1612
+ vocabulary_file: The vocabulary file name.
1613
+ vocabulary_size: Number of elements in the vocabulary. This must be no
1613
+ greater than the length of `vocabulary_file`; if it is smaller, later
1614
+ values are ignored. If None, it is set to the length of `vocabulary_file`.
1616
+ num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
1617
+ buckets. All out-of-vocabulary inputs will be assigned IDs in the range
1618
+ `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
1619
+ the input value. A positive `num_oov_buckets` can not be specified with
1620
+ `default_value`.
1621
+ default_value: The integer ID value to return for out-of-vocabulary feature
1622
+ values, defaults to `-1`. This can not be specified with a positive
1623
+ `num_oov_buckets`.
1624
+ dtype: The type of features. Only string and integer types are supported.
1625
+
1626
+ Returns:
1627
+ A `_CategoricalColumn` with a vocabulary file.
1628
+
1629
+ Raises:
1630
+ ValueError: `vocabulary_file` is missing or cannot be opened.
1631
+ ValueError: `vocabulary_size` is missing or < 1.
1632
+ ValueError: `num_oov_buckets` is a negative integer.
1633
+ ValueError: `num_oov_buckets` and `default_value` are both specified.
1634
+ ValueError: `dtype` is neither string nor integer.
1635
+ """
1636
+ if not vocabulary_file:
1637
+ raise ValueError('Missing vocabulary_file in {}.'.format(key))
1638
+
1639
+ if vocabulary_size is None:
1640
+ if not gfile.Exists(vocabulary_file):
1641
+ raise ValueError('vocabulary_file in {} does not exist.'.format(key))
1642
+
1643
+ with gfile.GFile(vocabulary_file) as f:
1644
+ vocabulary_size = sum(1 for _ in f)
1645
+ logging.info(
1646
+ 'vocabulary_size = %d in %s is inferred from the number of elements '
1647
+ 'in the vocabulary_file %s.', vocabulary_size, key, vocabulary_file)
1648
+
1649
+ # `vocabulary_size` isn't required for lookup, but it is for `_num_buckets`.
1650
+ if vocabulary_size < 1:
1651
+ raise ValueError('Invalid vocabulary_size in {}.'.format(key))
1652
+ if num_oov_buckets:
1653
+ if default_value is not None:
1654
+ raise ValueError(
1655
+ 'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
1656
+ key))
1657
+ if num_oov_buckets < 0:
1658
+ raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
1659
+ num_oov_buckets, key))
1660
+ fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
1661
+ fc_utils.assert_key_is_string(key)
1662
+ return _VocabularyFileCategoricalColumn(
1663
+ key=key,
1664
+ vocabulary_file=vocabulary_file,
1665
+ vocabulary_size=vocabulary_size,
1666
+ num_oov_buckets=0 if num_oov_buckets is None else num_oov_buckets,
1667
+ default_value=-1 if default_value is None else default_value,
1668
+ dtype=dtype)
1669
+
1670
+
1671
+ def _categorical_column_with_vocabulary_list(key,
1672
+ vocabulary_list,
1673
+ dtype=None,
1674
+ default_value=-1,
1675
+ num_oov_buckets=0):
1676
+ """A `_CategoricalColumn` with in-memory vocabulary.
1677
+
1678
+ Use this when your inputs are in string or integer format, and you have an
1679
+ in-memory vocabulary mapping each value to an integer ID. By default,
1680
+ out-of-vocabulary values are ignored. Use either (but not both) of
1681
+ `num_oov_buckets` and `default_value` to specify how to include
1682
+ out-of-vocabulary values.
1683
+
1684
+ For input dictionary `features`, `features[key]` is either `Tensor` or
1685
+ `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
1686
+ and `''` for string, which will be dropped by this feature column.
1687
+
1688
+ Example with `num_oov_buckets`:
1689
+ In the following example, each input in `vocabulary_list` is assigned an ID
1690
+ 0-3 corresponding to its index (e.g., input 'B' produces output 2). All other
1691
+ inputs are hashed and assigned an ID 4-5.
1692
+
1693
+ ```python
1694
+ colors = categorical_column_with_vocabulary_list(
1695
+ key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
1696
+ num_oov_buckets=2)
1697
+ columns = [colors, ...]
1698
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1699
+ linear_prediction, _, _ = linear_model(features, columns)
1700
+ ```
1701
+
1702
+ Example with `default_value`:
1703
+ In the following example, each input in `vocabulary_list` is assigned an ID
1704
+ 0-4 corresponding to its index (e.g., input 'B' produces output 3). All other
1705
+ inputs are assigned `default_value` 0.
1706
+
1707
+
1708
+ ```python
1709
+ colors = categorical_column_with_vocabulary_list(
1710
+ key='colors', vocabulary_list=('X', 'R', 'G', 'B', 'Y'), default_value=0)
1711
+ columns = [colors, ...]
1712
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1713
+ linear_prediction, _, _ = linear_model(features, columns)
1714
+ ```
1715
+
1716
+ And to make an embedding with either:
1717
+
1718
+ ```python
1719
+ columns = [embedding_column(colors, 3),...]
1720
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1721
+ dense_tensor = input_layer(features, columns)
1722
+ ```
1723
+
1724
+ Args:
1725
+ key: A unique string identifying the input feature. It is used as the
1726
+ column name and the dictionary key for feature parsing configs, feature
1727
+ `Tensor` objects, and feature columns.
1728
+ vocabulary_list: An ordered iterable defining the vocabulary. Each feature
1729
+ is mapped to the index of its value (if present) in `vocabulary_list`.
1730
+ Must be castable to `dtype`.
1731
+ dtype: The type of features. Only string and integer types are supported.
1732
+ If `None`, it will be inferred from `vocabulary_list`.
1733
+ default_value: The integer ID value to return for out-of-vocabulary feature
1734
+ values, defaults to `-1`. This can not be specified with a positive
1735
+ `num_oov_buckets`.
1736
+ num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
1737
+ buckets. All out-of-vocabulary inputs will be assigned IDs in the range
1738
+ `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on a
1739
+ hash of the input value. A positive `num_oov_buckets` can not be specified
1740
+ with `default_value`.
1741
+
1742
+ Returns:
1743
+ A `_CategoricalColumn` with in-memory vocabulary.
1744
+
1745
+ Raises:
1746
+ ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
1747
+ ValueError: `num_oov_buckets` is a negative integer.
1748
+ ValueError: `num_oov_buckets` and `default_value` are both specified.
1749
+ ValueError: if `dtype` is not integer or string.
1750
+ """
1751
+ if (vocabulary_list is None) or (len(vocabulary_list) < 1):
1752
+ raise ValueError(
1753
+ 'vocabulary_list {} must be non-empty, column_name: {}'.format(
1754
+ vocabulary_list, key))
1755
+ if len(set(vocabulary_list)) != len(vocabulary_list):
1756
+ raise ValueError(
1757
+ 'Duplicate keys in vocabulary_list {}, column_name: {}'.format(
1758
+ vocabulary_list, key))
1759
+ vocabulary_dtype = dtypes.as_dtype(np.array(vocabulary_list).dtype)
1760
+ if num_oov_buckets:
1761
+ if default_value != -1:
1762
+ raise ValueError(
1763
+ 'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
1764
+ key))
1765
+ if num_oov_buckets < 0:
1766
+ raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
1767
+ num_oov_buckets, key))
1768
+ fc_utils.assert_string_or_int(
1769
+ vocabulary_dtype, prefix='column_name: {} vocabulary'.format(key))
1770
+ if dtype is None:
1771
+ dtype = vocabulary_dtype
1772
+ elif dtype.is_integer != vocabulary_dtype.is_integer:
1773
+ raise ValueError(
1774
+ 'dtype {} and vocabulary dtype {} do not match, column_name: {}'.format(
1775
+ dtype, vocabulary_dtype, key))
1776
+ fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
1777
+ fc_utils.assert_key_is_string(key)
1778
+
1779
+ return _VocabularyListCategoricalColumn(
1780
+ key=key,
1781
+ vocabulary_list=tuple(vocabulary_list),
1782
+ dtype=dtype,
1783
+ default_value=default_value,
1784
+ num_oov_buckets=num_oov_buckets)
1785
+
1786
+
1787
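A compact sketch of the OOV behaviour described above, using the public API (TF 1.x assumed; the vocabulary and inputs are illustrative): in-vocabulary inputs map to their list index, everything else is hashed into the trailing OOV buckets.

```python
import tensorflow as tf

colors = tf.feature_column.categorical_column_with_vocabulary_list(
    key='colors', vocabulary_list=('R', 'G', 'B'), num_oov_buckets=2)
# 'G' maps to id 1; 'purple' is out of vocabulary, so it is hashed into one of
# the OOV buckets, i.e. id 3 or 4 (num_buckets = 3 + 2 = 5 in total).
features = {'colors': tf.constant([['G', 'purple']])}
dense_tensor = tf.feature_column.input_layer(
    features, [tf.feature_column.indicator_column(colors)])  # shape: (1, 5)
```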
+ def _categorical_column_with_identity(key, num_buckets, default_value=None):
1788
+ """A `_CategoricalColumn` that returns identity values.
1789
+
1790
+ Use this when your inputs are integers in the range `[0, num_buckets)`, and
1791
+ you want to use the input value itself as the categorical ID. Values outside
1792
+ this range will result in `default_value` if specified, otherwise it will
1793
+ fail.
1794
+
1795
+ Typically, this is used for contiguous ranges of integer indexes, but
1796
+ it doesn't have to be. This might be inefficient, however, if many of the IDs
1797
+ are unused. Consider `categorical_column_with_hash_bucket` in that case.
1798
+
1799
+ For input dictionary `features`, `features[key]` is either `Tensor` or
1800
+ `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
1801
+ and `''` for string, which will be dropped by this feature column.
1802
+
1803
+ In the following examples, each input in the range `[0, 1000000)` is used
1804
+ directly as its own categorical ID. All other inputs are assigned
1805
+ `default_value` 0; note that a literal 0 in the inputs maps to the same ID.
1806
+
1807
+ Linear model:
1808
+
1809
+ ```python
1810
+ video_id = categorical_column_with_identity(
1811
+ key='video_id', num_buckets=1000000, default_value=0)
1812
+ columns = [video_id, ...]
1813
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1814
+ linear_prediction, _, _ = linear_model(features, columns)
1815
+ ```
1816
+
1817
+ Embedding for a DNN model:
1818
+
1819
+ ```python
1820
+ columns = [embedding_column(video_id, 9),...]
1821
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1822
+ dense_tensor = input_layer(features, columns)
1823
+ ```
1824
+
1825
+ Args:
1826
+ key: A unique string identifying the input feature. It is used as the
1827
+ column name and the dictionary key for feature parsing configs, feature
1828
+ `Tensor` objects, and feature columns.
1829
+ num_buckets: Range of inputs and outputs is `[0, num_buckets)`.
1830
+ default_value: If `None`, this column's graph operations will fail for
1831
+ out-of-range inputs. Otherwise, this value must be in the range
1832
+ `[0, num_buckets)`, and will replace out-of-range inputs.
1833
+
1834
+ Returns:
1835
+ A `_CategoricalColumn` that returns identity values.
1836
+
1837
+ Raises:
1838
+ ValueError: if `num_buckets` is less than one.
1839
+ ValueError: if `default_value` is not in range `[0, num_buckets)`.
1840
+ """
1841
+ if num_buckets < 1:
1842
+ raise ValueError('num_buckets {} < 1, column_name {}'.format(
1843
+ num_buckets, key))
1844
+ if (default_value is not None) and ((default_value < 0) or
1845
+ (default_value >= num_buckets)):
1846
+ raise ValueError(
1847
+ 'default_value {} not in range [0, {}), column_name {}'.format(
1848
+ default_value, num_buckets, key))
1849
+ fc_utils.assert_key_is_string(key)
1850
+ return _IdentityCategoricalColumn(
1851
+ key=key, num_buckets=num_buckets, default_value=default_value)
1852
+
1853
+
1854
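A minimal sketch of the out-of-range replacement described above, using the public API (TF 1.x assumed; the bucket count and inputs are illustrative):

```python
import tensorflow as tf

bucket = tf.feature_column.categorical_column_with_identity(
    key='bucket', num_buckets=4, default_value=0)
# Inputs 0..3 are used directly as category ids; the out-of-range value 9 is
# replaced by default_value (0).
features = {'bucket': tf.constant([[1, 9]])}
dense_tensor = tf.feature_column.input_layer(
    features, [tf.feature_column.indicator_column(bucket)])
# dense_tensor evaluates to [[1., 1., 0., 0.]]: one count each for ids 0 and 1.
```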
+ def _indicator_column(categorical_column):
1855
+ """Represents multi-hot representation of given categorical column.
1856
+
1857
+ - For DNN model, `indicator_column` can be used to wrap any
1858
+ `categorical_column_*` (e.g., to feed to DNN). Consider using
1859
+ `embedding_column` if the number of buckets/unique values is large.
1860
+
1861
+ - For Wide (aka linear) model, `indicator_column` is the internal
1862
+ representation for categorical column when passing categorical column
1863
+ directly (as any element in feature_columns) to `linear_model`. See
1864
+ `linear_model` for details.
1865
+
1866
+ ```python
1867
+ name = indicator_column(categorical_column_with_vocabulary_list(
1868
+ 'name', ['bob', 'george', 'wanda']))
1869
+ columns = [name, ...]
1870
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1871
+ dense_tensor = input_layer(features, columns)
1872
+
1873
+ dense_tensor == [[1, 0, 0]] # If "name" bytes_list is ["bob"]
1874
+ dense_tensor == [[1, 0, 1]] # If "name" bytes_list is ["bob", "wanda"]
1875
+ dense_tensor == [[2, 0, 0]] # If "name" bytes_list is ["bob", "bob"]
1876
+ ```
1877
+
1878
+ Args:
1879
+ categorical_column: A `_CategoricalColumn` which is created by
1880
+ `categorical_column_with_*` or `crossed_column` functions.
1881
+
1882
+ Returns:
1883
+ An `_IndicatorColumn`.
1884
+ """
1885
+ return _IndicatorColumn(categorical_column)
1886
+
1887
+
1888
+ def _weighted_categorical_column(categorical_column,
1889
+ weight_feature_key,
1890
+ dtype=dtypes.float32):
1891
+ """Applies weight values to a `_CategoricalColumn`.
1892
+
1893
+ Use this when each of your sparse inputs has both an ID and a value. For
1894
+ example, if you're representing text documents as a collection of word
1895
+ frequencies, you can provide 2 parallel sparse input features ('terms' and
1896
+ 'frequencies' below).
1897
+
1898
+ Example:
1899
+
1900
+ Input `tf.Example` objects:
1901
+
1902
+ ```proto
1903
+ [
1904
+ features {
1905
+ feature {
1906
+ key: "terms"
1907
+ value {bytes_list {value: "very" value: "model"}}
1908
+ }
1909
+ feature {
1910
+ key: "frequencies"
1911
+ value {float_list {value: 0.3 value: 0.1}}
1912
+ }
1913
+ },
1914
+ features {
1915
+ feature {
1916
+ key: "terms"
1917
+ value {bytes_list {value: "when" value: "course" value: "human"}}
1918
+ }
1919
+ feature {
1920
+ key: "frequencies"
1921
+ value {float_list {value: 0.4 value: 0.1 value: 0.2}}
1922
+ }
1923
+ }
1924
+ ]
1925
+ ```
1926
+
1927
+ ```python
1928
+ categorical_column = categorical_column_with_hash_bucket(
1929
+ column_name='terms', hash_bucket_size=1000)
1930
+ weighted_column = weighted_categorical_column(
1931
+ categorical_column=categorical_column, weight_feature_key='frequencies')
1932
+ columns = [weighted_column, ...]
1933
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1934
+ linear_prediction, _, _ = linear_model(features, columns)
1935
+ ```
1936
+
1937
+ This assumes the input dictionary contains a `SparseTensor` for key
1938
+ 'terms', and a `SparseTensor` for key 'frequencies'. These 2 tensors must have
1939
+ the same indices and dense shape.
1940
+
1941
+ Args:
1942
+ categorical_column: A `_CategoricalColumn` created by
1943
+ `categorical_column_with_*` functions.
1944
+ weight_feature_key: String key for weight values.
1945
+ dtype: Type of weights, such as `tf.float32`. Only float and integer weights
1946
+ are supported.
1947
+
1948
+ Returns:
1949
+ A `_CategoricalColumn` composed of two sparse features: one represents id,
1950
+ the other represents weight (value) of the id feature in that example.
1951
+
1952
+ Raises:
1953
+ ValueError: if `dtype` is not convertible to float.
1954
+ """
1955
+ if (dtype is None) or not (dtype.is_integer or dtype.is_floating):
1956
+ raise ValueError('dtype {} is not convertible to float.'.format(dtype))
1957
+ return _WeightedCategoricalColumn(
1958
+ categorical_column=categorical_column,
1959
+ weight_feature_key=weight_feature_key,
1960
+ dtype=dtype)
1961
+
1962
+
1963
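The id/weight pairing above boils down to two `SparseTensor`s with identical indices and dense shape; a small sketch against the public API (TF 1.x assumed, illustrative values):

```python
import tensorflow as tf

terms = tf.feature_column.categorical_column_with_hash_bucket('terms', 1000)
weighted_terms = tf.feature_column.weighted_categorical_column(
    categorical_column=terms, weight_feature_key='frequencies')
# 'terms' carries the ids, 'frequencies' the per-id weights; both tensors must
# share indices and dense_shape.
features = {
    'terms': tf.SparseTensor(
        indices=[[0, 0], [0, 1]], values=['very', 'model'], dense_shape=[1, 2]),
    'frequencies': tf.SparseTensor(
        indices=[[0, 0], [0, 1]], values=[0.3, 0.1], dense_shape=[1, 2]),
}
logits = tf.feature_column.linear_model(features, [weighted_terms])
```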
+ def _crossed_column(keys, hash_bucket_size, hash_key=None):
1964
+ """Returns a column for performing crosses of categorical features.
1965
+
1966
+ Crossed features will be hashed according to `hash_bucket_size`. Conceptually,
1967
+ the transformation can be thought of as:
1968
+ Hash(cartesian product of features) % `hash_bucket_size`
1969
+
1970
+ For example, if the input features are:
1971
+
1972
+ * SparseTensor referred by first key:
1973
+
1974
+ ```python
1975
+ shape = [2, 2]
1976
+ {
1977
+ [0, 0]: "a"
1978
+ [1, 0]: "b"
1979
+ [1, 1]: "c"
1980
+ }
1981
+ ```
1982
+
1983
+ * SparseTensor referred by second key:
1984
+
1985
+ ```python
1986
+ shape = [2, 1]
1987
+ {
1988
+ [0, 0]: "d"
1989
+ [1, 0]: "e"
1990
+ }
1991
+ ```
1992
+
1993
+ then crossed feature will look like:
1994
+
1995
+ ```python
1996
+ shape = [2, 2]
1997
+ {
1998
+ [0, 0]: Hash64("d", Hash64("a")) % hash_bucket_size
1999
+ [1, 0]: Hash64("e", Hash64("b")) % hash_bucket_size
2000
+ [1, 1]: Hash64("e", Hash64("c")) % hash_bucket_size
2001
+ }
2002
+ ```
2003
+
2004
+ Here is an example to create a linear model with crosses of string features:
2005
+
2006
+ ```python
2007
+ keywords_x_doc_terms = crossed_column(['keywords', 'doc_terms'], 50K)
2008
+ columns = [keywords_x_doc_terms, ...]
2009
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
2010
+ linear_prediction = linear_model(features, columns)
2011
+ ```
2012
+
2013
+ You could also use vocabulary lookup before crossing:
2014
+
2015
+ ```python
2016
+ keywords = categorical_column_with_vocabulary_file(
2017
+ 'keywords', '/path/to/vocabulary/file', vocabulary_size=1K)
2018
+ keywords_x_doc_terms = crossed_column([keywords, 'doc_terms'], 50K)
2019
+ columns = [keywords_x_doc_terms, ...]
2020
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
2021
+ linear_prediction = linear_model(features, columns)
2022
+ ```
2023
+
2024
+ If an input feature is of numeric type, you can use
2025
+ `categorical_column_with_identity`, or `bucketized_column`, as in the example:
2026
+
2027
+ ```python
2028
+ # vertical_id is an integer categorical feature.
2029
+ vertical_id = categorical_column_with_identity('vertical_id', 10K)
2030
+ price = numeric_column('price')
2031
+ # bucketized_column converts numerical feature to a categorical one.
2032
+ bucketized_price = bucketized_column(price, boundaries=[...])
2033
+ vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50K)
2034
+ columns = [vertical_id_x_price, ...]
2035
+ features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
2036
+ linear_prediction = linear_model(features, columns)
2037
+ ```
2038
+
2039
+ To use crossed column in DNN model, you need to add it in an embedding column
2040
+ as in this example:
2041
+
2042
+ ```python
2043
+ vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50K)
2044
+ vertical_id_x_price_embedded = embedding_column(vertical_id_x_price, 10)
2045
+ dense_tensor = input_layer(features, [vertical_id_x_price_embedded, ...])
2046
+ ```
2047
+
2048
+ Args:
2049
+ keys: An iterable identifying the features to be crossed. Each element can
2050
+ be either:
2051
+ * string: Will use the corresponding feature which must be of string type.
2052
+ * `_CategoricalColumn`: Will use the transformed tensor produced by this
2053
+ column. Does not support hashed categorical column.
2054
+ hash_bucket_size: An int > 1. The number of buckets.
2055
+ hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
2056
+ function to combine the crosses fingerprints on SparseCrossOp (optional).
2057
+
2058
+ Returns:
2059
+ A `_CrossedColumn`.
2060
+
2061
+ Raises:
2062
+ ValueError: If `len(keys) < 2`.
2063
+ ValueError: If any of the keys is neither a string nor `_CategoricalColumn`.
2064
+ ValueError: If any of the keys is `_HashedCategoricalColumn`.
2065
+ ValueError: If `hash_bucket_size < 1`.
2066
+ """
2067
+ if not hash_bucket_size or hash_bucket_size < 1:
2068
+ raise ValueError('hash_bucket_size must be > 1. '
2069
+ 'hash_bucket_size: {}'.format(hash_bucket_size))
2070
+ if not keys or len(keys) < 2:
2071
+ raise ValueError(
2072
+ 'keys must be a list with length > 1. Given: {}'.format(keys))
2073
+ for key in keys:
2074
+ if (not isinstance(key, six.string_types) and
2075
+ not isinstance(key, _CategoricalColumn)):
2076
+ raise ValueError(
2077
+ 'Unsupported key type. All keys must be either string, or '
2078
+ 'categorical column except _HashedCategoricalColumn. '
2079
+ 'Given: {}'.format(key))
2080
+ if isinstance(key, _HashedCategoricalColumn):
2081
+ raise ValueError(
2082
+ 'categorical_column_with_hash_bucket is not supported for crossing. '
2083
+ 'Hashing before crossing will increase probability of collision. '
2084
+ 'Instead, use the feature name as a string. Given: {}'.format(key))
2085
+ return _CrossedColumn(
2086
+ keys=tuple(keys), hash_bucket_size=hash_bucket_size, hash_key=hash_key)
2087
+
2088
+
2089
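Putting the pieces above together with the public API (TF 1.x assumed; names and sizes are illustrative): a numeric feature is bucketized, crossed with an identity column, and then either passed to `linear_model` directly or wrapped in an embedding for a DNN.

```python
import tensorflow as tf

vertical_id = tf.feature_column.categorical_column_with_identity(
    'vertical_id', num_buckets=10000)
price = tf.feature_column.numeric_column('price')
bucketized_price = tf.feature_column.bucketized_column(price, boundaries=[10., 100.])

# Each (vertical_id, price_bucket) pair is hashed into one of 50000 buckets.
vertical_id_x_price = tf.feature_column.crossed_column(
    [vertical_id, bucketized_price], hash_bucket_size=50000)

# Linear use: pass the crossed column directly to linear_model.
# DNN use: wrap it in an embedding column first.
vertical_id_x_price_embedded = tf.feature_column.embedding_column(
    vertical_id_x_price, dimension=10)
```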
+ # TODO(rohanj): Clearly define semantics of this layer.
2090
+ class _EmbeddingColumnLayer(base.Layer):
2091
+ """A layer that stores all the state required for a embedding column."""
2092
+
2093
+ def __init__(self,
2094
+ embedding_shape,
2095
+ initializer,
2096
+ weight_collections=None,
2097
+ trainable=True,
2098
+ name=None,
2099
+ **kwargs):
2100
+ """Constructor.
2101
+
2102
+ Args:
2103
+ embedding_shape: Shape of the embedding variable used for lookup.
2104
+ initializer: A variable initializer function to be used in embedding
2105
+ variable initialization.
2106
+ weight_collections: A list of collection names to which the Variable will
2107
+ be added. Note that variables will also be added to the collections
2108
+ `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
2109
+ trainable: If `True` also add the variable to the graph collection
2110
+ `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
2111
+ name: Name of the layer
2112
+ **kwargs: keyword named properties.
2113
+ """
2114
+ super(_EmbeddingColumnLayer, self).__init__(
2115
+ trainable=trainable, name=name, **kwargs)
2116
+ self._embedding_shape = embedding_shape
2117
+ self._initializer = initializer
2118
+ self._weight_collections = weight_collections
2119
+
2120
+ def set_weight_collections(self, weight_collections):
2121
+ """Sets the weight collections for the layer.
2122
+
2123
+ Args:
2124
+ weight_collections: A list of collection names to which the Variable will
2125
+ be added.
2126
+ """
2127
+ self._weight_collections = weight_collections
2128
+
2129
+ def build(self, _):
2130
+ self._embedding_weight_var = self.add_variable(
2131
+ name='embedding_weights',
2132
+ shape=self._embedding_shape,
2133
+ dtype=dtypes.float32,
2134
+ initializer=self._initializer,
2135
+ trainable=self.trainable)
2136
+ if self._weight_collections and not context.executing_eagerly():
2137
+ _add_to_collections(self._embedding_weight_var, self._weight_collections)
2138
+ self.built = True
2139
+
2140
+ def call(self, _):
2141
+ return self._embedding_weight_var
2142
+
2143
+
2144
+ @six.add_metaclass(abc.ABCMeta)
2145
+ class _FeatureColumn(object):
2146
+ """Represents a feature column abstraction.
2147
+
2148
+ WARNING: Do not subclass this layer unless you know what you are doing:
2149
+ the API is subject to future changes.
2150
+
2151
+ To distinguish the concept of a feature family and a specific binary feature
2152
+ within a family, we refer to a feature family like "country" as a feature
2153
+ column. Following is an example feature in a `tf.Example` format:
2154
+ {key: "country", value: [ "US" ]}
2155
+ In this example the value of the feature is "US" and "country" refers to the
2156
+ column of the feature.
2157
+
2158
+ This class is an abstract class. Users should not create instances of it.
2159
+ """
2160
+
2161
+ @abc.abstractproperty
2162
+ def name(self):
2163
+ """Returns string, Used for naming and for name_scope."""
2164
+ pass
2165
+
2166
+ @property
2167
+ def raw_name(self):
2168
+ return self.name
2169
+
2170
+ @property
2171
+ def _var_scope_name(self):
2172
+ """Returns string, Used for variable_scope, Defaults to self.name."""
2173
+ return self.name
2174
+
2175
+ @abc.abstractmethod
2176
+ def _transform_feature(self, inputs):
2177
+ """Returns intermediate representation (usually a `Tensor`).
2178
+
2179
+ Uses `inputs` to create an intermediate representation (usually a `Tensor`)
2180
+ that other feature columns can use.
2181
+
2182
+ Example usage of `inputs`:
2183
+ Let's say a Feature column depends on raw feature ('raw') and another
2184
+ `_FeatureColumn` (input_fc). To access corresponding `Tensor`s, inputs will
2185
+ be used as follows:
2186
+
2187
+ ```python
2188
+ raw_tensor = inputs.get('raw')
2189
+ fc_tensor = inputs.get(input_fc)
2190
+ ```
2191
+
2192
+ Args:
2193
+ inputs: A `_LazyBuilder` object to access inputs.
2194
+
2195
+ Returns:
2196
+ Transformed feature `Tensor`.
2197
+ """
2198
+ pass
2199
+
2200
+ @abc.abstractproperty
2201
+ def _parse_example_spec(self):
2202
+ """Returns a `tf.Example` parsing spec as dict.
2203
+
2204
+ It is used for get_parsing_spec for `tf.io.parse_example`. Returned spec is
2205
+ a dict from keys ('string') to `VarLenFeature`, `FixedLenFeature`, and other
2206
+ supported objects. Please check documentation of `tf.io.parse_example` for
2207
+ all supported spec objects.
2208
+
2209
+ Let's say a Feature column depends on raw feature ('raw') and another
2210
+ `_FeatureColumn` (input_fc). One possible implementation of
2211
+ _parse_example_spec is as follows:
2212
+
2213
+ ```python
2214
+ spec = {'raw': tf.io.FixedLenFeature(...)}
2215
+ spec.update(input_fc._parse_example_spec)
2216
+ return spec
2217
+ ```
2218
+ """
2219
+ pass
2220
+
2221
+ def _reset_config(self):
2222
+ """Resets the configuration in the column.
2223
+
2224
+ Some feature columns e.g. embedding or shared embedding columns might
2225
+ have some state that is needed to be reset sometimes. Use this method
2226
+ in that scenario.
2227
+ """
2228
+
2229
+
2230
+ class _DenseColumn(_FeatureColumn):
2231
+ """Represents a column which can be represented as `Tensor`.
2232
+
2233
+ WARNING: Do not subclass this layer unless you know what you are doing:
2234
+ the API is subject to future changes.
2235
+
2236
+ Some examples of this type are: numeric_column, embedding_column,
2237
+ indicator_column.
2238
+ """
2239
+
2240
+ @abc.abstractproperty
2241
+ def _variable_shape(self):
2242
+ """`TensorShape` of `_get_dense_tensor`, without batch dimension."""
2243
+ pass
2244
+
2245
+ @abc.abstractmethod
2246
+ def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2247
+ """Returns a `Tensor`.
2248
+
2249
+ The output of this function will be used by model-builder-functions. For
2250
+ example the pseudo code of `input_layer` will be like:
2251
+
2252
+ ```python
2253
+ def input_layer(features, feature_columns, ...):
2254
+ outputs = [fc._get_dense_tensor(...) for fc in feature_columns]
2255
+ return tf.concat(outputs)
2256
+ ```
2257
+
2258
+ Args:
2259
+ inputs: A `_LazyBuilder` object to access inputs.
2260
+ weight_collections: List of graph collections to which Variables (if any
2261
+ are created) are added.
2262
+ trainable: If `True` also add variables to the graph collection
2263
+ `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
2264
+
2265
+ Returns:
2266
+ `Tensor` of shape [batch_size] + `_variable_shape`.
2267
+ """
2268
+ pass
2269
+
2270
+
2271
+ def _create_weighted_sum(column,
2272
+ builder,
2273
+ units,
2274
+ sparse_combiner,
2275
+ weight_collections,
2276
+ trainable,
2277
+ weight_var=None):
2278
+ """Creates a weighted sum for a dense/categorical column for linear_model."""
2279
+ if isinstance(column, _CategoricalColumn):
2280
+ return _create_categorical_column_weighted_sum(
2281
+ column=column,
2282
+ builder=builder,
2283
+ units=units,
2284
+ sparse_combiner=sparse_combiner,
2285
+ weight_collections=weight_collections,
2286
+ trainable=trainable,
2287
+ weight_var=weight_var)
2288
+ else:
2289
+ return _create_dense_column_weighted_sum(
2290
+ column=column,
2291
+ builder=builder,
2292
+ units=units,
2293
+ weight_collections=weight_collections,
2294
+ trainable=trainable,
2295
+ weight_var=weight_var)
2296
+
2297
+
2298
+ def _create_dense_column_weighted_sum(column,
2299
+ builder,
2300
+ units,
2301
+ weight_collections,
2302
+ trainable,
2303
+ weight_var=None):
2304
+ """Create a weighted sum of a dense column for linear_model."""
2305
+ tensor = column._get_dense_tensor( # pylint: disable=protected-access
2306
+ builder,
2307
+ weight_collections=weight_collections,
2308
+ trainable=trainable)
2309
+ num_elements = column._variable_shape.num_elements() # pylint: disable=protected-access
2310
+ batch_size = array_ops.shape(tensor)[0]
2311
+ tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
2312
+ if weight_var is not None:
2313
+ weight = weight_var
2314
+ else:
2315
+ weight = variable_scope.get_variable(
2316
+ name='weights',
2317
+ shape=[num_elements, units],
2318
+ initializer=init_ops.zeros_initializer(),
2319
+ trainable=trainable,
2320
+ collections=weight_collections)
2321
+ return math_ops.matmul(tensor, weight, name='weighted_sum')
2322
+
2323
+
2324
+ class _CategoricalColumn(_FeatureColumn):
2325
+ """Represents a categorical feature.
2326
+
2327
+ WARNING: Do not subclass this layer unless you know what you are doing:
2328
+ the API is subject to future changes.
2329
+
2330
+ A categorical feature is typically handled with a `tf.SparseTensor` of IDs.
2331
+ """
2332
+
2333
+ IdWeightPair = collections.namedtuple( # pylint: disable=invalid-name
2334
+ 'IdWeightPair', ['id_tensor', 'weight_tensor'])
2335
+
2336
+ @abc.abstractproperty
2337
+ def _num_buckets(self):
2338
+ """Returns number of buckets in this sparse feature."""
2339
+ pass
2340
+
2341
+ @abc.abstractmethod
2342
+ def _get_sparse_tensors(self,
2343
+ inputs,
2344
+ weight_collections=None,
2345
+ trainable=None):
2346
+ """Returns an IdWeightPair.
2347
+
2348
+ `IdWeightPair` is a pair of `SparseTensor`s which represents ids and
2349
+ weights.
2350
+
2351
+ `IdWeightPair.id_tensor` is typically a `batch_size` x `num_buckets`
2352
+ `SparseTensor` of `int64`. `IdWeightPair.weight_tensor` is either a
2353
+ `SparseTensor` of `float` or `None` to indicate all weights should be
2354
+ taken to be 1. If specified, `weight_tensor` must have exactly the same
2355
+ shape and indices as `id_tensor`. The expected `SparseTensor` is the same as the parsing
2356
+ output of a `VarLenFeature`, which is a ragged matrix.
2357
+
2358
+ Args:
2359
+ inputs: A `LazyBuilder` as a cache to get input tensors required to
2360
+ create `IdWeightPair`.
2361
+ weight_collections: List of graph collections to which variables (if any
2362
+ will be created) are added.
2363
+ trainable: If `True` also add variables to the graph collection
2364
+ `GraphKeys.TRAINABLE_VARIABLES` (see `tf.compat.v1.get_variable`).
2365
+ """
2366
+ pass
2367
+
2368
+
2369
+ def _create_categorical_column_weighted_sum(column,
2370
+ builder,
2371
+ units,
2372
+ sparse_combiner,
2373
+ weight_collections,
2374
+ trainable,
2375
+ weight_var=None):
2376
+ # pylint: disable=g-doc-return-or-yield,g-doc-args
2377
+ """Create a weighted sum of a categorical column for linear_model.
2378
+
2379
+ Note to maintainer: As an implementation detail, the weighted sum is
2380
+ implemented via embedding_lookup_sparse for efficiency. Mathematically,
2381
+ they are the same.
2382
+
2383
+ To be specific, conceptually, a categorical column can be treated as a multi-hot
2384
+ vector. Say:
2385
+
2386
+ ```python
2387
+ x = [0 0 1] # categorical column input
2388
+ w = [a b c] # weights
2389
+ ```
2390
+ The weighted sum is `c` in this case, which is the same as `w[2]`.
2391
+
2392
+ Another example is
2393
+
2394
+ ```python
2395
+ x = [0 1 1] # categorical column input
2396
+ w = [a b c] # weights
2397
+ ```
2398
+ The weighted sum is `b + c` in this case, which is the same as `w[1] + w[2]`.
2399
+
2400
+ For both cases, we can implement weighted sum via embedding_lookup with
2401
+ sparse_combiner = "sum".
2402
+ """
2403
+ sparse_tensors = column._get_sparse_tensors( # pylint: disable=protected-access
2404
+ builder,
2405
+ weight_collections=weight_collections,
2406
+ trainable=trainable)
2407
+ id_tensor = sparse_ops.sparse_reshape(
2408
+ sparse_tensors.id_tensor,
2409
+ [array_ops.shape(sparse_tensors.id_tensor)[0], -1])
2410
+ weight_tensor = sparse_tensors.weight_tensor
2411
+ if weight_tensor is not None:
2412
+ weight_tensor = sparse_ops.sparse_reshape(
2413
+ weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
2414
+
2415
+ if weight_var is not None:
2416
+ weight = weight_var
2417
+ else:
2418
+ weight = variable_scope.get_variable(
2419
+ name='weights',
2420
+ shape=(column._num_buckets, units), # pylint: disable=protected-access
2421
+ initializer=init_ops.zeros_initializer(),
2422
+ trainable=trainable,
2423
+ collections=weight_collections)
2424
+ return embedding_ops.safe_embedding_lookup_sparse(
2425
+ weight,
2426
+ id_tensor,
2427
+ sparse_weights=weight_tensor,
2428
+ combiner=sparse_combiner,
2429
+ name='weighted_sum')
2430
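As a quick sanity check of the multi-hot argument in the docstring above, a small NumPy sketch (values invented) showing that a 'sum' lookup over the active ids equals the dense multi-hot matmul:

```python
import numpy as np

w = np.array([[0.7], [0.1], [0.4]])      # weights for 3 buckets, units=1
x_multi_hot = np.array([[0., 1., 1.]])   # one example activating buckets 1 and 2

dense_result = x_multi_hot @ w           # multi-hot matmul: w[1] + w[2]
sparse_result = w[[1, 2]].sum(axis=0)    # lookup of ids [1, 2] with combiner='sum'

assert np.allclose(dense_result, sparse_result)  # both equal 0.5
```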
+
2431
+
2432
+ class _SequenceDenseColumn(_FeatureColumn):
2433
+ """Represents dense sequence data."""
2434
+
2435
+ TensorSequenceLengthPair = collections.namedtuple( # pylint: disable=invalid-name
2436
+ 'TensorSequenceLengthPair', ['dense_tensor', 'sequence_length'])
2437
+
2438
+ @abc.abstractmethod
2439
+ def _get_sequence_dense_tensor(self,
2440
+ inputs,
2441
+ weight_collections=None,
2442
+ trainable=None):
2443
+ """Returns a `TensorSequenceLengthPair`."""
2444
+ pass
2445
+
2446
+
2447
+ class _LazyBuilder(object):
2448
+ """Handles caching of transformations while building the model.
2449
+
2450
+ `_FeatureColumn` specifies how to digest an input column to the network. Some
2451
+ feature columns require data transformations. This class caches those
2452
+ transformations.
2453
+
2454
+ Some features may be used in more than one place. For example, one can use a
2455
+ bucketized feature by itself and a cross with it. In that case we
2456
+ should create only one bucketization op instead of creating ops for each
2457
+ feature column separately. To handle re-use of transformed columns,
2458
+ `_LazyBuilder` caches all previously transformed columns.
2459
+
2460
+ Example:
2461
+ We're trying to use the following `_FeatureColumn`s:
2462
+
2463
+ ```python
2464
+ bucketized_age = fc.bucketized_column(fc.numeric_column("age"), ...)
2465
+ keywords = fc.categorical_column_with_hash_bucket("keywords", ...)
2466
+ age_X_keywords = fc.crossed_column([bucketized_age, "keywords"])
2467
+ ... = linear_model(features,
2468
+ [bucketized_age, keywords, age_X_keywords])
2469
+ ```
2470
+
2471
+ If we transform each column independently, then we'll get duplication of
2472
+ bucketization (one for cross, one for bucketization itself).
2473
+ The `_LazyBuilder` eliminates this duplication.
2474
+ """
2475
+
2476
+ def __init__(self, features):
2477
+ """Creates a `_LazyBuilder`.
2478
+
2479
+ Args:
2480
+ features: A mapping from feature column to objects that are `Tensor` or
2481
+ `SparseTensor`, or can be converted to same via
2482
+ `sparse_tensor.convert_to_tensor_or_sparse_tensor`. A `string` key
2483
+ signifies a base feature (not-transformed). A `_FeatureColumn` key
2484
+ means that this `Tensor` is the output of an existing `_FeatureColumn`
2485
+ which can be reused.
2486
+ """
2487
+ self._features = features.copy()
2488
+ self._feature_tensors = {}
2489
+
2490
+ def get(self, key):
2491
+ """Returns a `Tensor` for the given key.
2492
+
2493
+ A `str` key is used to access a base feature (not-transformed). When a
2494
+ `_FeatureColumn` is passed, the transformed feature is returned if it
2495
+ already exists, otherwise the given `_FeatureColumn` is asked to provide its
2496
+ transformed output, which is then cached.
2497
+
2498
+ Args:
2499
+ key: a `str` or a `_FeatureColumn`.
2500
+
2501
+ Returns:
2502
+ The transformed `Tensor` corresponding to the `key`.
2503
+
2504
+ Raises:
2505
+ ValueError: if key is not found or a transformed `Tensor` cannot be
2506
+ computed.
2507
+ """
2508
+ if key in self._feature_tensors:
2509
+ # FeatureColumn is already transformed or converted.
2510
+ return self._feature_tensors[key]
2511
+
2512
+ if key in self._features:
2513
+ feature_tensor = self._get_raw_feature_as_tensor(key)
2514
+ self._feature_tensors[key] = feature_tensor
2515
+ return feature_tensor
2516
+
2517
+ if isinstance(key, six.string_types):
2518
+ raise ValueError('Feature {} is not in features dictionary.'.format(key))
2519
+
2520
+ if not isinstance(key, _FeatureColumn):
2521
+ raise TypeError('"key" must be either a "str" or "_FeatureColumn". '
2522
+ 'Provided: {}'.format(key))
2523
+
2524
+ column = key
2525
+ logging.debug('Transforming feature_column %s.', column)
2526
+ transformed = column._transform_feature(self) # pylint: disable=protected-access
2527
+ if transformed is None:
2528
+ raise ValueError('Column {} is not supported.'.format(column.name))
2529
+ self._feature_tensors[column] = transformed
2530
+ return transformed
2531
+
2532
+ def _get_raw_feature_as_tensor(self, key):
2533
+ """Gets the raw_feature (keyed by `key`) as `tensor`.
2534
+
2535
+ The raw feature is converted to a (sparse) tensor and its rank may be expanded.
2536
+
2537
+ For both `Tensor` and `SparseTensor`, the rank will be expanded (to 2) if
2538
+ the rank is 1. This also supports dynamic rank. A rank-0 raw feature will
2539
+ raise an error, as it is not supported.
2540
+
2541
+ Args:
2542
+ key: A `str` key to access the raw feature.
2543
+
2544
+ Returns:
2545
+ A `Tensor` or `SparseTensor`.
2546
+
2547
+ Raises:
2548
+ ValueError: if the raw feature has rank 0.
2549
+ """
2550
+ raw_feature = self._features[key]
2551
+ if 'RaggedTensor' in str(type(raw_feature)):
2552
+ return raw_feature
2553
+
2554
+ feature_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
2555
+ raw_feature)
2556
+
2557
+ def expand_dims(input_tensor):
2558
+ # Input_tensor must have rank 1.
2559
+ if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
2560
+ return sparse_ops.sparse_reshape(input_tensor,
2561
+ [array_ops.shape(input_tensor)[0], 1])
2562
+ else:
2563
+ return array_ops.expand_dims(input_tensor, -1)
2564
+
2565
+ rank = feature_tensor.get_shape().ndims
2566
+ if rank is not None:
2567
+ if rank == 0:
2568
+ raise ValueError(
2569
+ 'Feature (key: {}) cannot have rank 0. Given: {}'.format(
2570
+ key, feature_tensor))
2571
+ return feature_tensor if rank != 1 else expand_dims(feature_tensor)
2572
+
2573
+ # Handle dynamic rank.
2574
+ with ops.control_dependencies([
2575
+ check_ops.assert_positive(
2576
+ array_ops.rank(feature_tensor),
2577
+ message='Feature (key: {}) cannot have rank 0. Given: {}'.format(
2578
+ key, feature_tensor))
2579
+ ]):
2580
+ return control_flow_ops.cond(
2581
+ math_ops.equal(1, array_ops.rank(feature_tensor)),
2582
+ lambda: expand_dims(feature_tensor), lambda: feature_tensor)
2583
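A minimal usage sketch of the caching behaviour described in the `_LazyBuilder` docstring above, building the columns directly from the private namedtuples defined in this file (feature names and boundaries are invented; TF 1.x graph mode is assumed):

```python
import tensorflow as tf

age = _NumericColumn(key='age', shape=(1,), default_value=None,
                     dtype=tf.float32, normalizer_fn=None)
bucketized_age = _BucketizedColumn(source_column=age, boundaries=[18, 25, 35, 50])

features = {'age': tf.constant([[23.0], [41.0]])}
builder = _LazyBuilder(features)

first = builder.get(bucketized_age)   # runs the bucketization op and caches the result
second = builder.get(bucketized_age)  # served from the cache, no duplicate ops
assert first is second
```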
+
2584
+
2585
+ # TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
2586
+ def _shape_offsets(shape):
2587
+ """Returns moving offset for each dimension given shape."""
2588
+ offsets = []
2589
+ for dim in reversed(shape):
2590
+ if offsets:
2591
+ offsets.append(dim * offsets[-1])
2592
+ else:
2593
+ offsets.append(dim)
2594
+ offsets.reverse()
2595
+ return offsets
2596
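For illustration only (shape invented): each offset is the product of that dimension with everything to its right.

```python
# e.g. shape [2, 3, 4] -> [2*3*4, 3*4, 4]
assert _shape_offsets([2, 3, 4]) == [24, 12, 4]
```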
+
2597
+
2598
+ # TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
2599
+ def _to_sparse_input_and_drop_ignore_values(input_tensor, ignore_value=None):
2600
+ """Converts a `Tensor` to a `SparseTensor`, dropping ignore_value cells.
2601
+
2602
+ If `input_tensor` is already a `SparseTensor`, just return it.
2603
+
2604
+ Args:
2605
+ input_tensor: A string or integer `Tensor`.
2606
+ ignore_value: Entries in `input_tensor` equal to this value will be
2607
+ absent from the resulting `SparseTensor`. If `None`, default value of
2608
+ `input_tensor`'s dtype will be used ('' for `str`, -1 for `int`).
2609
+
2610
+ Returns:
2611
+ A `SparseTensor` with the same shape as `input_tensor`.
2612
+
2613
+ Raises:
2614
+ ValueError: when `input_tensor`'s rank is `None`.
2615
+ """
2616
+ input_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
2617
+ input_tensor)
2618
+ if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
2619
+ return input_tensor
2620
+ with ops.name_scope(None, 'to_sparse_input', (
2621
+ input_tensor,
2622
+ ignore_value,
2623
+ )):
2624
+ if ignore_value is None:
2625
+ if input_tensor.dtype == dtypes.string:
2626
+ # Exception for strings: TF strings are converted to numpy objects by default.
2627
+ ignore_value = ''
2628
+ elif input_tensor.dtype.is_integer:
2629
+ ignore_value = -1 # -1 has a special meaning of missing feature
2630
+ else:
2631
+ # NOTE: `as_numpy_dtype` is a property, so with the parentheses this is
2632
+ # constructing a new numpy object of the given type, which yields the
2633
+ # default value for that type.
2634
+ ignore_value = input_tensor.dtype.as_numpy_dtype()
2635
+ ignore_value = math_ops.cast(
2636
+ ignore_value, input_tensor.dtype, name='ignore_value')
2637
+ indices = array_ops.where(
2638
+ math_ops.not_equal(input_tensor, ignore_value), name='indices')
2639
+ return sparse_tensor_lib.SparseTensor(
2640
+ indices=indices,
2641
+ values=array_ops.gather_nd(input_tensor, indices, name='values'),
2642
+ dense_shape=array_ops.shape(
2643
+ input_tensor, out_type=dtypes.int64, name='dense_shape'))
2644
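A small sketch of the conversion above for a string input, assuming TF 1.x graph mode with a `Session` (the example values are invented): empty strings are treated as the default `ignore_value` and dropped.

```python
import tensorflow as tf

dense = tf.constant([['cat', ''], ['', 'dog']])
sp = _to_sparse_input_and_drop_ignore_values(dense)  # ignore_value defaults to '' for strings

with tf.Session() as sess:
    print(sess.run(sp.indices))      # [[0 0], [1 1]]
    print(sess.run(sp.values))       # [b'cat' b'dog']
    print(sess.run(sp.dense_shape))  # [2 2]
```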
+
2645
+
2646
+ def _normalize_feature_columns(feature_columns):
2647
+ """Normalizes the `feature_columns` input.
2648
+
2649
+ This method converts the `feature_columns` to list type as best as it can. In
2650
+ addition, it verifies the types and other properties of feature_columns required by
2651
+ the downstream library.
2652
+
2653
+ Args:
2654
+ feature_columns: The raw feature columns, usually passed by users.
2655
+
2656
+ Returns:
2657
+ The normalized feature column list.
2658
+
2659
+ Raises:
2660
+ ValueError: for any invalid inputs, such as empty, duplicated names, etc.
2661
+ """
2662
+ if isinstance(feature_columns, _FeatureColumn):
2663
+ feature_columns = [feature_columns]
2664
+
2665
+ # if isinstance(feature_columns, collections.Iterator):
2666
+ # feature_columns = list(feature_columns)
2667
+
2668
+ if isinstance(feature_columns, dict):
2669
+ raise ValueError('Expected feature_columns to be iterable, found dict.')
2670
+
2671
+ for column in feature_columns:
2672
+ if not isinstance(column, _FeatureColumn):
2673
+ raise ValueError('Items of feature_columns must be a _FeatureColumn. '
2674
+ 'Given (type {}): {}.'.format(type(column), column))
2675
+ if not feature_columns:
2676
+ raise ValueError('feature_columns must not be empty.')
2677
+ name_to_column = {}
2678
+ for column in feature_columns:
2679
+ if column.name in name_to_column:
2680
+ raise ValueError('Duplicate feature column name found for columns: {} '
2681
+ 'and {}. This usually means that these columns refer to '
2682
+ 'same base feature. Either one must be discarded or a '
2683
+ 'duplicated but renamed item must be inserted in '
2684
+ 'features dict.'.format(column,
2685
+ name_to_column[column.name]))
2686
+ name_to_column[column.name] = column
2687
+
2688
+ return feature_columns
2689
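A sketch of the normalization contract, constructing columns from the private `_NumericColumn` namedtuple defined later in this file (the key name is invented):

```python
age = _NumericColumn(key='age', shape=(1,), default_value=None,
                     dtype=dtypes.float32, normalizer_fn=None)
assert _normalize_feature_columns(age) == [age]  # a single column is wrapped in a list

duplicate = _NumericColumn(key='age', shape=(1,), default_value=None,
                           dtype=dtypes.float32, normalizer_fn=None)
_normalize_feature_columns([age, duplicate])  # raises ValueError: duplicate name 'age'
```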
+
2690
+
2691
+ class _NumericColumn(
2692
+ _DenseColumn,
2693
+ collections.namedtuple(
2694
+ '_NumericColumn',
2695
+ ['key', 'shape', 'default_value', 'dtype', 'normalizer_fn'])):
2696
+ """see `numeric_column`."""
2697
+
2698
+ @property
2699
+ def name(self):
2700
+ return self.key
2701
+
2702
+ @property
2703
+ def _parse_example_spec(self):
2704
+ return {
2705
+ self.key:
2706
+ parsing_ops.FixedLenFeature(self.shape, self.dtype,
2707
+ self.default_value)
2708
+ }
2709
+
2710
+ def _transform_feature(self, inputs):
2711
+ input_tensor = inputs.get(self.key)
2712
+ if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
2713
+ raise ValueError(
2714
+ 'The corresponding Tensor of numerical column must be a Tensor. '
2715
+ 'SparseTensor is not supported. key: {}'.format(self.key))
2716
+ if self.normalizer_fn is not None:
2717
+ input_tensor = self.normalizer_fn(input_tensor)
2718
+ return math_ops.cast(input_tensor, dtypes.float32)
2719
+
2720
+ @property
2721
+ def _variable_shape(self):
2722
+ return tensor_shape.TensorShape(self.shape)
2723
+
2724
+ def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2725
+ """Returns dense `Tensor` representing numeric feature.
2726
+
2727
+ Args:
2728
+ inputs: A `_LazyBuilder` object to access inputs.
2729
+ weight_collections: Unused `weight_collections` since no variables are
2730
+ created in this function.
2731
+ trainable: Unused `trainable` bool since no variables are created in
2732
+ this function.
2733
+
2734
+ Returns:
2735
+ Dense `Tensor` created within `_transform_feature`.
2736
+ """
2737
+ # Do nothing with weight_collections and trainable since no variables are
2738
+ # created in this function.
2739
+ del weight_collections
2740
+ del trainable
2741
+ # Feature has already been transformed. Return the intermediate
2742
+ # representation created by _transform_feature.
2743
+ return inputs.get(self)
2744
+
2745
+
2746
+ class _BucketizedColumn(_DenseColumn, _CategoricalColumn,
2747
+ collections.namedtuple('_BucketizedColumn',
2748
+ ['source_column', 'boundaries'])
2749
+ ):
2750
+ """See `bucketized_column`."""
2751
+
2752
+ @property
2753
+ def name(self):
2754
+ return '{}_bucketized'.format(self.source_column.name)
2755
+
2756
+ @property
2757
+ def raw_name(self):
2758
+ return self.source_column.raw_name
2759
+
2760
+ @property
2761
+ def _parse_example_spec(self):
2762
+ return self.source_column._parse_example_spec # pylint: disable=protected-access
2763
+
2764
+ def _transform_feature(self, inputs):
2765
+ source_tensor = inputs.get(self.source_column)
2766
+ return math_ops._bucketize( # pylint: disable=protected-access
2767
+ source_tensor,
2768
+ boundaries=self.boundaries)
2769
+
2770
+ @property
2771
+ def _variable_shape(self):
2772
+ return tensor_shape.TensorShape(
2773
+ tuple(self.source_column.shape) + (len(self.boundaries) + 1,))
2774
+
2775
+ def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2776
+ del weight_collections
2777
+ del trainable
2778
+ input_tensor = inputs.get(self)
2779
+ return array_ops.one_hot(
2780
+ indices=math_ops.cast(input_tensor, dtypes.int64),
2781
+ depth=len(self.boundaries) + 1,
2782
+ on_value=1.,
2783
+ off_value=0.)
2784
+
2785
+ @property
2786
+ def _num_buckets(self):
2787
+ # By construction, source_column is always one-dimensional.
2788
+ return (len(self.boundaries) + 1) * self.source_column.shape[0]
2789
+
2790
+ def _get_sparse_tensors(self,
2791
+ inputs,
2792
+ weight_collections=None,
2793
+ trainable=None):
2794
+ """Converts dense inputs to SparseTensor so downstream code can use it."""
2795
+ input_tensor = inputs.get(self)
2796
+ batch_size = array_ops.shape(input_tensor)[0]
2797
+ # By construction, source_column is always one-dimensional.
2798
+ source_dimension = self.source_column.shape[0]
2799
+
2800
+ i1 = array_ops.reshape(
2801
+ array_ops.tile(
2802
+ array_ops.expand_dims(math_ops.range(0, batch_size), 1),
2803
+ [1, source_dimension]), (-1,))
2804
+ i2 = array_ops.tile(math_ops.range(0, source_dimension), [batch_size])
2805
+ # Flatten the bucket indices and unique them across dimensions
2806
+ # E.g. 2nd dimension indices will range from k to 2*k-1 with k buckets
2807
+ bucket_indices = (
2808
+ array_ops.reshape(input_tensor,
2809
+ (-1,)) + (len(self.boundaries) + 1) * i2)
2810
+
2811
+ indices = math_ops.cast(
2812
+ array_ops.transpose(array_ops.stack((i1, i2))), dtypes.int64)
2813
+ dense_shape = math_ops.cast(
2814
+ array_ops.stack([batch_size, source_dimension]), dtypes.int64)
2815
+ sparse_tensor = sparse_tensor_lib.SparseTensor(
2816
+ indices=indices, values=bucket_indices, dense_shape=dense_shape)
2817
+ return _CategoricalColumn.IdWeightPair(sparse_tensor, None)
2818
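The bucket id assignment used by `_transform_feature` above follows the usual `Bucketize` rule: the id is the number of boundaries less than or equal to the value. A pure-Python mirror with invented boundaries and inputs:

```python
import bisect

boundaries = [0., 10., 100.]
for x in [-5., 0., 7., 10., 250.]:
    print(x, '->', bisect.bisect_right(boundaries, x))
# -5.0 -> 0, 0.0 -> 1, 7.0 -> 1, 10.0 -> 2, 250.0 -> 3  (len(boundaries) + 1 == 4 buckets)
```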
+
2819
+
2820
+ class _EmbeddingColumn(
2821
+ _DenseColumn, _SequenceDenseColumn,
2822
+ collections.namedtuple(
2823
+ '_EmbeddingColumn',
2824
+ ('categorical_column', 'dimension', 'combiner', 'layer_creator',
2825
+ 'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'))):
2826
+ """See `embedding_column`."""
2827
+
2828
+ @property
2829
+ def name(self):
2830
+ if not hasattr(self, '_name'):
2831
+ self._name = '{}_embedding'.format(self.categorical_column.name)
2832
+ return self._name
2833
+
2834
+ @property
2835
+ def _parse_example_spec(self):
2836
+ return self.categorical_column._parse_example_spec # pylint: disable=protected-access
2837
+
2838
+ def _transform_feature(self, inputs):
2839
+ return inputs.get(self.categorical_column)
2840
+
2841
+ @property
2842
+ def _variable_shape(self):
2843
+ if not hasattr(self, '_shape'):
2844
+ self._shape = tensor_shape.TensorShape([self.dimension])
2845
+ return self._shape
2846
+
2847
+ def _get_dense_tensor_internal(self,
2848
+ inputs,
2849
+ weight_collections=None,
2850
+ trainable=None):
2851
+ """Private method that follows the signature of _get_dense_tensor."""
2852
+ # Get sparse IDs and weights.
2853
+ sparse_tensors = self.categorical_column._get_sparse_tensors( # pylint: disable=protected-access
2854
+ inputs,
2855
+ weight_collections=weight_collections,
2856
+ trainable=trainable)
2857
+ sparse_ids = sparse_tensors.id_tensor
2858
+ sparse_weights = sparse_tensors.weight_tensor
2859
+
2860
+ embedding_weights = self.layer_creator(
2861
+ weight_collections=weight_collections,
2862
+ scope=variable_scope.get_variable_scope())
2863
+
2864
+ if self.ckpt_to_load_from is not None:
2865
+ to_restore = embedding_weights
2866
+ if isinstance(to_restore, variables.PartitionedVariable):
2867
+ to_restore = to_restore._get_variable_list() # pylint: disable=protected-access
2868
+ checkpoint_utils.init_from_checkpoint(
2869
+ self.ckpt_to_load_from, {self.tensor_name_in_ckpt: to_restore})
2870
+
2871
+ # Return embedding lookup result.
2872
+ return embedding_ops.safe_embedding_lookup_sparse(
2873
+ embedding_weights=embedding_weights,
2874
+ sparse_ids=sparse_ids,
2875
+ sparse_weights=sparse_weights,
2876
+ combiner=self.combiner,
2877
+ name='%s_weights' % self.name,
2878
+ max_norm=self.max_norm)
2879
+
2880
+ def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2881
+ if isinstance(self.categorical_column, _SequenceCategoricalColumn):
2882
+ raise ValueError(
2883
+ 'In embedding_column: {}. '
2884
+ 'categorical_column must not be of type _SequenceCategoricalColumn. '
2885
+ 'Suggested fix A: If you wish to use input_layer, use a '
2886
+ 'non-sequence categorical_column_with_*. '
2887
+ 'Suggested fix B: If you wish to create sequence input, use '
2888
+ 'sequence_input_layer instead of input_layer. '
2889
+ 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
2890
+ self.categorical_column))
2891
+ return self._get_dense_tensor_internal(
2892
+ inputs=inputs,
2893
+ weight_collections=weight_collections,
2894
+ trainable=trainable)
2895
+
2896
+ def _get_sequence_dense_tensor(self,
2897
+ inputs,
2898
+ weight_collections=None,
2899
+ trainable=None):
2900
+ if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
2901
+ raise ValueError(
2902
+ 'In embedding_column: {}. '
2903
+ 'categorical_column must be of type _SequenceCategoricalColumn '
2904
+ 'to use sequence_input_layer. '
2905
+ 'Suggested fix: Use one of sequence_categorical_column_with_*. '
2906
+ 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
2907
+ self.categorical_column))
2908
+ dense_tensor = self._get_dense_tensor_internal( # pylint: disable=protected-access
2909
+ inputs=inputs,
2910
+ weight_collections=weight_collections,
2911
+ trainable=trainable)
2912
+
2913
+ sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access
2914
+ sequence_length = fc_utils.sequence_length_from_sparse_tensor(
2915
+ sparse_tensors.id_tensor)
2916
+ return _SequenceDenseColumn.TensorSequenceLengthPair(
2917
+ dense_tensor=dense_tensor, sequence_length=sequence_length)
2918
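For intuition about the `combiner` handled by `safe_embedding_lookup_sparse` in `_get_dense_tensor_internal` above, a NumPy sketch with an invented embedding table: an example whose ids are `[0, 2]` is pooled across the looked-up rows.

```python
import numpy as np

embedding_table = np.array([[0., 1.],
                            [2., 3.],
                            [4., 5.]])           # 3 buckets, dimension=2
ids = [0, 2]

mean_pooled = embedding_table[ids].mean(axis=0)  # combiner='mean'
sum_pooled = embedding_table[ids].sum(axis=0)    # combiner='sum'
assert np.allclose(mean_pooled, [2., 3.])
assert np.allclose(sum_pooled, [4., 6.])
```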
+
2919
+
2920
+ def _get_graph_for_variable(var):
2921
+ if isinstance(var, variables.PartitionedVariable):
2922
+ return list(var)[0].graph
2923
+ else:
2924
+ return var.graph
2925
+
2926
+
2927
+ class _SharedEmbeddingColumn(
2928
+ _DenseColumn, _SequenceDenseColumn,
2929
+ collections.namedtuple(
2930
+ '_SharedEmbeddingColumn',
2931
+ ('categorical_column', 'dimension', 'combiner', 'initializer',
2932
+ 'shared_embedding_collection_name', 'ckpt_to_load_from',
2933
+ 'tensor_name_in_ckpt', 'max_norm', 'trainable', 'partitioner',
2934
+ 'ev_params'))):
2935
+ """See `embedding_column`."""
2936
+
2937
+ @property
2938
+ def name(self):
2939
+ if not hasattr(self, '_name'):
2940
+ self._name = '{}_shared_embedding'.format(self.categorical_column.name)
2941
+ return self._name
2942
+
2943
+ @property
2944
+ def raw_name(self):
2945
+ return self.categorical_column.name
2946
+
2947
+ @property
2948
+ def _var_scope_name(self):
2949
+ return self.shared_embedding_collection_name
2950
+
2951
+ @property
2952
+ def _parse_example_spec(self):
2953
+ return self.categorical_column._parse_example_spec # pylint: disable=protected-access
2954
+
2955
+ def _transform_feature(self, inputs):
2956
+ return inputs.get(self.categorical_column)
2957
+
2958
+ @property
2959
+ def _variable_shape(self):
2960
+ if not hasattr(self, '_shape'):
2961
+ self._shape = tensor_shape.TensorShape([self.dimension])
2962
+ return self._shape
2963
+
2964
+ def _get_dense_tensor_internal(self,
2965
+ inputs,
2966
+ weight_collections=None,
2967
+ trainable=None):
2968
+ """Private method that follows the signature of _get_dense_tensor."""
2969
+ # This method is called from a variable_scope with name _var_scope_name,
2970
+ # which is shared among all shared embeddings. Open a name_scope here, so
2971
+ # that the ops for different columns have distinct names.
2972
+ with ops.name_scope(None, default_name=self.name):
2973
+ # Get sparse IDs and weights.
2974
+ sparse_tensors = self.categorical_column._get_sparse_tensors( # pylint: disable=protected-access
2975
+ inputs,
2976
+ weight_collections=weight_collections,
2977
+ trainable=trainable)
2978
+ sparse_ids = sparse_tensors.id_tensor
2979
+ sparse_weights = sparse_tensors.weight_tensor
2980
+
2981
+ embedding_shape = (self.categorical_column._num_buckets, self.dimension) # pylint: disable=protected-access
2982
+ shared_embedding_collection = ops.get_collection(
2983
+ self.shared_embedding_collection_name)
2984
+ if shared_embedding_collection:
2985
+ if len(shared_embedding_collection) > 1:
2986
+ raise ValueError(
2987
+ 'Collection {} can only contain one variable. '
2988
+ 'Suggested fix A: Choose a unique name for this collection. '
2989
+ 'Suggested fix B: Do not add any variables to this collection. '
2990
+ 'The feature_column library already adds a variable under the '
2991
+ 'hood.'.format(shared_embedding_collection))
2992
+ embedding_weights = shared_embedding_collection[0]
2993
+ if embedding_weights.get_shape(
2994
+ ) != embedding_shape and self.ev_params is None:
2995
+ raise ValueError(
2996
+ 'Shared embedding collection {} contains variable {} of '
2997
+ 'unexpected shape {}. Expected shape is {}. '
2998
+ 'Suggested fix A: Choose a unique name for this collection. '
2999
+ 'Suggested fix B: Do not add any variables to this collection. '
3000
+ 'The feature_column library already adds a variable under the '
3001
+ 'hood.'.format(self.shared_embedding_collection_name,
3002
+ embedding_weights.name,
3003
+ embedding_weights.get_shape(), embedding_shape))
3004
+ else:
3005
+ if self.ev_params is None:
3006
+ embedding_weights = variable_scope.get_variable(
3007
+ name='embedding_weights',
3008
+ shape=embedding_shape,
3009
+ dtype=dtypes.float32,
3010
+ initializer=self.initializer,
3011
+ trainable=self.trainable and trainable,
3012
+ partitioner=self.partitioner,
3013
+ collections=weight_collections)
3014
+ else:
3015
+ # at eval or inference time, it is necessary to set
3016
+ # the initializers to zeros, so that new keys will
3017
+ # get zero embeddings
3018
+ if os.environ.get('tf.estimator.mode', '') != \
3019
+ os.environ.get('tf.estimator.ModeKeys.TRAIN', 'train'):
3020
+ initializer = init_ops.zeros_initializer()
3021
+ else:
3022
+ initializer = self.initializer
3023
+ extra_args = {}
3024
+ if 'EmbeddingVariableConfig' in dir(variables):
3025
+ ev_option = variables.EmbeddingVariableOption()
3026
+ ev_option.filter_strategy = variables.CounterFilter(
3027
+ filter_freq=self.ev_params.filter_freq)
3028
+ extra_args['ev_option'] = ev_option
3029
+ else:
3030
+ extra_args['filter_options'] = variables.CounterFilterOptions(
3031
+ self.ev_params.filter_freq)
3032
+ embedding_weights = variable_scope.get_embedding_variable(
3033
+ name='embedding_weights',
3034
+ embedding_dim=self.dimension,
3035
+ initializer=initializer,
3036
+ trainable=self.trainable and trainable,
3037
+ partitioner=self.partitioner,
3038
+ collections=weight_collections,
3039
+ steps_to_live=self.ev_params.steps_to_live,
3040
+ **extra_args)
3041
+
3042
+ ops.add_to_collection(self.shared_embedding_collection_name,
3043
+ embedding_weights)
3044
+ if self.ckpt_to_load_from is not None:
3045
+ to_restore = embedding_weights
3046
+ if isinstance(to_restore, variables.PartitionedVariable):
3047
+ to_restore = to_restore._get_variable_list() # pylint: disable=protected-access
3048
+ checkpoint_utils.init_from_checkpoint(
3049
+ self.ckpt_to_load_from, {self.tensor_name_in_ckpt: to_restore})
3050
+
3051
+ if 'RaggedTensor' in str(type(sparse_ids)):
3052
+ assert sparse_weights is None
3053
+ return embedding_lookup_ragged(
3054
+ embedding_weights=embedding_weights,
3055
+ ragged_ids=sparse_ids,
3056
+ ragged_weights=sparse_weights,
3057
+ combiner=self.combiner,
3058
+ max_norm=self.max_norm,
3059
+ name='%s_weights' % self.name)
3060
+
3061
+ # Return embedding lookup result.
3062
+ return embedding_ops.safe_embedding_lookup_sparse(
3063
+ embedding_weights=embedding_weights,
3064
+ sparse_ids=sparse_ids,
3065
+ sparse_weights=sparse_weights,
3066
+ combiner=self.combiner,
3067
+ name='%s_weights' % self.name,
3068
+ max_norm=self.max_norm)
3069
+
3070
+ def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
3071
+ if isinstance(self.categorical_column, _SequenceCategoricalColumn):
3072
+ raise ValueError(
3073
+ 'In embedding_column: {}. '
3074
+ 'categorical_column must not be of type _SequenceCategoricalColumn. '
3075
+ 'Suggested fix A: If you wish to use input_layer, use a '
3076
+ 'non-sequence categorical_column_with_*. '
3077
+ 'Suggested fix B: If you wish to create sequence input, use '
3078
+ 'sequence_input_layer instead of input_layer. '
3079
+ 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
3080
+ self.categorical_column))
3081
+ return self._get_dense_tensor_internal(
3082
+ inputs=inputs,
3083
+ weight_collections=weight_collections,
3084
+ trainable=trainable)
3085
+
3086
+ def _get_sequence_dense_tensor(self,
3087
+ inputs,
3088
+ weight_collections=None,
3089
+ trainable=None):
3090
+ if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
3091
+ raise ValueError(
3092
+ 'In embedding_column: {}. '
3093
+ 'categorical_column must be of type _SequenceCategoricalColumn '
3094
+ 'to use sequence_input_layer. '
3095
+ 'Suggested fix: Use one of sequence_categorical_column_with_*. '
3096
+ 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
3097
+ self.categorical_column))
3098
+ dense_tensor = self._get_dense_tensor_internal( # pylint: disable=protected-access
3099
+ inputs=inputs,
3100
+ weight_collections=weight_collections,
3101
+ trainable=trainable)
3102
+ sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access
3103
+ sequence_length = fc_utils.sequence_length_from_sparse_tensor(
3104
+ sparse_tensors.id_tensor)
3105
+ return _SequenceDenseColumn.TensorSequenceLengthPair(
3106
+ dense_tensor=dense_tensor, sequence_length=sequence_length)
3107
+
3108
+
3109
+ def _check_shape(shape, key):
3110
+ """Returns shape if it's valid, raises error otherwise."""
3111
+ assert shape is not None
3112
+ if not nest.is_sequence(shape):
3113
+ shape = [shape]
3114
+ shape = tuple(shape)
3115
+ for dimension in shape:
3116
+ if not isinstance(dimension, six.integer_types):
3117
+ raise TypeError('shape dimensions must be integer. '
3118
+ 'shape: {}, key: {}'.format(shape, key))
3119
+ if dimension < 1:
3120
+ raise ValueError('shape dimensions must be greater than 0. '
3121
+ 'shape: {}, key: {}'.format(shape, key))
3122
+ return shape
3123
+
3124
+
3125
+ class _HashedCategoricalColumn(_CategoricalColumn,
3126
+ collections.namedtuple(
3127
+ '_HashedCategoricalColumn',
3128
+ ['key', 'hash_bucket_size', 'dtype'])):
3129
+ """see `categorical_column_with_hash_bucket`."""
3130
+
3131
+ @property
3132
+ def name(self):
3133
+ return self.key
3134
+
3135
+ @property
3136
+ def raw_name(self):
3137
+ return self.key
3138
+
3139
+ @property
3140
+ def _parse_example_spec(self):
3141
+ return {self.key: parsing_ops.VarLenFeature(self.dtype)}
3142
+
3143
+ def _transform_feature(self, inputs):
3144
+ input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
3145
+ if not isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
3146
+ raise ValueError('SparseColumn input must be a SparseTensor.')
3147
+
3148
+ fc_utils.assert_string_or_int(
3149
+ input_tensor.dtype,
3150
+ prefix='column_name: {} input_tensor'.format(self.key))
3151
+
3152
+ if self.dtype.is_integer != input_tensor.dtype.is_integer:
3153
+ raise ValueError(
3154
+ 'Column dtype and SparseTensors dtype must be compatible. '
3155
+ 'key: {}, column dtype: {}, tensor dtype: {}'.format(
3156
+ self.key, self.dtype, input_tensor.dtype))
3157
+
3158
+ if self.dtype == dtypes.string:
3159
+ sparse_values = input_tensor.values
3160
+ else:
3161
+ sparse_values = string_ops.as_string(input_tensor.values)
3162
+
3163
+ sparse_id_values = string_ops.string_to_hash_bucket_fast(
3164
+ sparse_values, self.hash_bucket_size, name='lookup')
3165
+ return sparse_tensor_lib.SparseTensor(input_tensor.indices,
3166
+ sparse_id_values,
3167
+ input_tensor.dense_shape)
3168
+
3169
+ @property
3170
+ def _num_buckets(self):
3171
+ """Returns number of buckets in this sparse feature."""
3172
+ return self.hash_bucket_size
3173
+
3174
+ def _get_sparse_tensors(self,
3175
+ inputs,
3176
+ weight_collections=None,
3177
+ trainable=None):
3178
+ return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
3179
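A short usage sketch of the hashing that `_transform_feature` relies on (bucket count and strings invented; TF 1.14+ assumed for the `tf.strings` alias):

```python
import tensorflow as tf

ids = tf.strings.to_hash_bucket_fast(['sports', 'music', 'sports'], num_buckets=100)
# Equal strings always hash to the same bucket id, and every id lies in [0, 100);
# the concrete values depend on the fingerprint function, so they are not hard-coded here.
```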
+
3180
+
3181
+ class _VocabularyFileCategoricalColumn(
3182
+ _CategoricalColumn,
3183
+ collections.namedtuple('_VocabularyFileCategoricalColumn',
3184
+ ('key', 'vocabulary_file', 'vocabulary_size',
3185
+ 'num_oov_buckets', 'dtype', 'default_value'))):
3186
+ """See `categorical_column_with_vocabulary_file`."""
3187
+
3188
+ @property
3189
+ def name(self):
3190
+ return self.key
3191
+
3192
+ @property
3193
+ def _parse_example_spec(self):
3194
+ return {self.key: parsing_ops.VarLenFeature(self.dtype)}
3195
+
3196
+ def _transform_feature(self, inputs):
3197
+ input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
3198
+
3199
+ if self.dtype.is_integer != input_tensor.dtype.is_integer:
3200
+ raise ValueError(
3201
+ 'Column dtype and SparseTensors dtype must be compatible. '
3202
+ 'key: {}, column dtype: {}, tensor dtype: {}'.format(
3203
+ self.key, self.dtype, input_tensor.dtype))
3204
+
3205
+ fc_utils.assert_string_or_int(
3206
+ input_tensor.dtype,
3207
+ prefix='column_name: {} input_tensor'.format(self.key))
3208
+
3209
+ key_dtype = self.dtype
3210
+ if input_tensor.dtype.is_integer:
3211
+ # `index_table_from_file` requires 64-bit integer keys.
3212
+ key_dtype = dtypes.int64
3213
+ input_tensor = math_ops.cast(input_tensor, dtypes.int64)
3214
+
3215
+ return lookup_ops.index_table_from_file(
3216
+ vocabulary_file=self.vocabulary_file,
3217
+ num_oov_buckets=self.num_oov_buckets,
3218
+ vocab_size=self.vocabulary_size,
3219
+ default_value=self.default_value,
3220
+ key_dtype=key_dtype,
3221
+ name='{}_lookup'.format(self.key)).lookup(input_tensor)
3222
+
3223
+ @property
3224
+ def _num_buckets(self):
3225
+ """Returns number of buckets in this sparse feature."""
3226
+ return self.vocabulary_size + self.num_oov_buckets
3227
+
3228
+ def _get_sparse_tensors(self,
3229
+ inputs,
3230
+ weight_collections=None,
3231
+ trainable=None):
3232
+ return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
3233
+
3234
+
3235
+ class _VocabularyListCategoricalColumn(
3236
+ _CategoricalColumn,
3237
+ collections.namedtuple(
3238
+ '_VocabularyListCategoricalColumn',
3239
+ ('key', 'vocabulary_list', 'dtype', 'default_value', 'num_oov_buckets'))
3240
+ ):
3241
+ """See `categorical_column_with_vocabulary_list`."""
3242
+
3243
+ @property
3244
+ def name(self):
3245
+ return self.key
3246
+
3247
+ @property
3248
+ def _parse_example_spec(self):
3249
+ return {self.key: parsing_ops.VarLenFeature(self.dtype)}
3250
+
3251
+ def _transform_feature(self, inputs):
3252
+ input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
3253
+
3254
+ if self.dtype.is_integer != input_tensor.dtype.is_integer:
3255
+ raise ValueError(
3256
+ 'Column dtype and SparseTensors dtype must be compatible. '
3257
+ 'key: {}, column dtype: {}, tensor dtype: {}'.format(
3258
+ self.key, self.dtype, input_tensor.dtype))
3259
+
3260
+ fc_utils.assert_string_or_int(
3261
+ input_tensor.dtype,
3262
+ prefix='column_name: {} input_tensor'.format(self.key))
3263
+
3264
+ key_dtype = self.dtype
3265
+ if input_tensor.dtype.is_integer:
3266
+ # `index_table_from_tensor` requires 64-bit integer keys.
3267
+ key_dtype = dtypes.int64
3268
+ input_tensor = math_ops.cast(input_tensor, dtypes.int64)
3269
+
3270
+ return lookup_ops.index_table_from_tensor(
3271
+ vocabulary_list=tuple(self.vocabulary_list),
3272
+ default_value=self.default_value,
3273
+ num_oov_buckets=self.num_oov_buckets,
3274
+ dtype=key_dtype,
3275
+ name='{}_lookup'.format(self.key)).lookup(input_tensor)
3276
+
3277
+ @property
3278
+ def _num_buckets(self):
3279
+ """Returns number of buckets in this sparse feature."""
3280
+ return len(self.vocabulary_list) + self.num_oov_buckets
3281
+
3282
+ def _get_sparse_tensors(self,
3283
+ inputs,
3284
+ weight_collections=None,
3285
+ trainable=None):
3286
+ return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
3287
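The bucket layout implied by `_num_buckets` above, spelled out for an invented vocabulary:

```python
vocabulary_list = ['TV', 'Couch', 'Chair']   # in-vocabulary ids 0, 1, 2
num_oov_buckets = 2                          # out-of-vocabulary strings hash to ids 3 or 4
num_buckets = len(vocabulary_list) + num_oov_buckets
assert num_buckets == 5                      # matches _num_buckets for this column
```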
+
3288
+
3289
+ class _IdentityCategoricalColumn(_CategoricalColumn,
3290
+ collections.namedtuple(
3291
+ '_IdentityCategoricalColumn',
3292
+ ('key', 'num_buckets', 'default_value'))):
3293
+ """See `categorical_column_with_identity`."""
3294
+
3295
+ @property
3296
+ def name(self):
3297
+ return self.key
3298
+
3299
+ @property
3300
+ def _parse_example_spec(self):
3301
+ return {self.key: parsing_ops.VarLenFeature(dtypes.int64)}
3302
+
3303
+ def _transform_feature(self, inputs):
3304
+ input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
3305
+
3306
+ if not input_tensor.dtype.is_integer:
3307
+ raise ValueError('Invalid input, not integer. key: {} dtype: {}'.format(
3308
+ self.key, input_tensor.dtype))
3309
+
3310
+ values = math_ops.cast(input_tensor.values, dtypes.int64, name='values')
3311
+ num_buckets = math_ops.cast(
3312
+ self.num_buckets, dtypes.int64, name='num_buckets')
3313
+ zero = math_ops.cast(0, dtypes.int64, name='zero')
3314
+ if self.default_value is None:
3315
+ # Fail if values are out-of-range.
3316
+ assert_less = check_ops.assert_less(
3317
+ values,
3318
+ num_buckets,
3319
+ data=(values, num_buckets),
3320
+ name='assert_less_than_num_buckets')
3321
+ assert_greater = check_ops.assert_greater_equal(
3322
+ values, zero, data=(values,), name='assert_greater_or_equal_0')
3323
+ with ops.control_dependencies((assert_less, assert_greater)):
3324
+ values = array_ops.identity(values)
3325
+ else:
3326
+ # Assign default for out-of-range values.
3327
+ values = array_ops.where(
3328
+ math_ops.logical_or(
3329
+ values < zero, values >= num_buckets, name='out_of_range'),
3330
+ array_ops.fill(
3331
+ dims=array_ops.shape(values),
3332
+ value=math_ops.cast(self.default_value, dtypes.int64),
3333
+ name='default_values'), values)
3334
+
3335
+ return sparse_tensor_lib.SparseTensor(
3336
+ indices=input_tensor.indices,
3337
+ values=values,
3338
+ dense_shape=input_tensor.dense_shape)
3339
+
3340
+ @property
3341
+ def _num_buckets(self):
3342
+ """Returns number of buckets in this sparse feature."""
3343
+ return self.num_buckets
3344
+
3345
+ def _get_sparse_tensors(self,
3346
+ inputs,
3347
+ weight_collections=None,
3348
+ trainable=None):
3349
+ return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
3350
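The identity mapping performed by `_transform_feature` above, mirrored in plain Python (values and bucket count invented): in-range values are used as ids directly, and out-of-range values fall back to `default_value` when one is given.

```python
num_buckets, default_value = 4, 0
values = [1, 7, 3, -2]
ids = [v if 0 <= v < num_buckets else default_value for v in values]
assert ids == [1, 0, 3, 0]  # 7 and -2 are out of range and replaced by default_value
```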
+
3351
+
3352
+ class _WeightedCategoricalColumn(
3353
+ _CategoricalColumn,
3354
+ collections.namedtuple(
3355
+ '_WeightedCategoricalColumn',
3356
+ ('categorical_column', 'weight_feature_key', 'dtype'))):
3357
+ """See `weighted_categorical_column`."""
3358
+
3359
+ @property
3360
+ def name(self):
3361
+ return '{}_weighted_by_{}'.format(self.categorical_column.name,
3362
+ self.weight_feature_key)
3363
+
3364
+ @property
3365
+ def _parse_example_spec(self):
3366
+ config = self.categorical_column._parse_example_spec # pylint: disable=protected-access
3367
+ if self.weight_feature_key in config:
3368
+ raise ValueError('Parse config {} already exists for {}.'.format(
3369
+ config[self.weight_feature_key], self.weight_feature_key))
3370
+ config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype)
3371
+ return config
3372
+
3373
+ @property
3374
+ def _num_buckets(self):
3375
+ return self.categorical_column._num_buckets # pylint: disable=protected-access
3376
+
3377
+ def _transform_feature(self, inputs):
3378
+ weight_tensor = inputs.get(self.weight_feature_key)
3379
+ if weight_tensor is None:
3380
+ raise ValueError('Missing weights {}.'.format(self.weight_feature_key))
3381
+ weight_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
3382
+ weight_tensor)
3383
+ if self.dtype != weight_tensor.dtype.base_dtype:
3384
+ raise ValueError('Bad dtype, expected {}, but got {}.'.format(
3385
+ self.dtype, weight_tensor.dtype))
3386
+ if not isinstance(weight_tensor, sparse_tensor_lib.SparseTensor):
3387
+ # The weight tensor can be a regular Tensor. In this case, sparsify it.
3388
+ weight_tensor = _to_sparse_input_and_drop_ignore_values(
3389
+ weight_tensor, ignore_value=0.0)
3390
+ if not weight_tensor.dtype.is_floating:
3391
+ weight_tensor = math_ops.cast(weight_tensor, dtypes.float32)
3392
+ return (inputs.get(self.categorical_column), weight_tensor)
3393
+
3394
+ def _get_sparse_tensors(self,
3395
+ inputs,
3396
+ weight_collections=None,
3397
+ trainable=None):
3398
+ del weight_collections
3399
+ del trainable
3400
+ tensors = inputs.get(self)
3401
+ return _CategoricalColumn.IdWeightPair(tensors[0], tensors[1])
3402
+
3403
+
3404
+ class _CrossedColumn(
3405
+ _CategoricalColumn,
3406
+ collections.namedtuple('_CrossedColumn',
3407
+ ['keys', 'hash_bucket_size', 'hash_key'])):
3408
+ """See `crossed_column`."""
3409
+
3410
+ @property
3411
+ def name(self):
3412
+ feature_names = []
3413
+ for key in _collect_leaf_level_keys(self):
3414
+ if isinstance(key, _FeatureColumn):
3415
+ feature_names.append(key.name)
3416
+ else: # key must be a string
3417
+ feature_names.append(key)
3418
+ return '_X_'.join(sorted(feature_names))
3419
+
3420
+ @property
3421
+ def _parse_example_spec(self):
3422
+ config = {}
3423
+ for key in self.keys:
3424
+ if isinstance(key, _FeatureColumn):
3425
+ config.update(key._parse_example_spec) # pylint: disable=protected-access
3426
+ else: # key must be a string
3427
+ config.update({key: parsing_ops.VarLenFeature(dtypes.string)})
3428
+ return config
3429
+
3430
+ def _transform_feature(self, inputs):
3431
+ feature_tensors = []
3432
+ for key in _collect_leaf_level_keys(self):
3433
+ if isinstance(key, six.string_types):
3434
+ feature_tensors.append(inputs.get(key))
3435
+ elif isinstance(key, _CategoricalColumn):
3436
+ ids_and_weights = key._get_sparse_tensors(inputs) # pylint: disable=protected-access
3437
+ if ids_and_weights.weight_tensor is not None:
3438
+ raise ValueError(
3439
+ 'crossed_column does not support weight_tensor, but the given '
3440
+ 'column populates weight_tensor. '
3441
+ 'Given column: {}'.format(key.name))
3442
+ feature_tensors.append(ids_and_weights.id_tensor)
3443
+ else:
3444
+ raise ValueError('Unsupported column type. Given: {}'.format(key))
3445
+ return sparse_ops.sparse_cross_hashed(
3446
+ inputs=feature_tensors,
3447
+ num_buckets=self.hash_bucket_size,
3448
+ hash_key=self.hash_key)
3449
+
3450
+ @property
3451
+ def _num_buckets(self):
3452
+ """Returns number of buckets in this sparse feature."""
3453
+ return self.hash_bucket_size
3454
+
3455
+ def _get_sparse_tensors(self,
3456
+ inputs,
3457
+ weight_collections=None,
3458
+ trainable=None):
3459
+ return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
3460
+
3461
+
3462
+ def _collect_leaf_level_keys(cross):
3463
+ """Collects base keys by expanding all nested crosses.
3464
+
3465
+ Args:
3466
+ cross: A `_CrossedColumn`.
3467
+
3468
+ Returns:
3469
+ A list of strings or `_CategoricalColumn` instances.
3470
+ """
3471
+ leaf_level_keys = []
3472
+ for k in cross.keys:
3473
+ if isinstance(k, _CrossedColumn):
3474
+ leaf_level_keys.extend(_collect_leaf_level_keys(k))
3475
+ else:
3476
+ leaf_level_keys.append(k)
3477
+ return leaf_level_keys
3478
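A small sketch of the key expansion above, building nested crosses straight from the `_CrossedColumn` namedtuple (the keys and hash parameters are placeholders):

```python
inner = _CrossedColumn(keys=('a', 'b'), hash_bucket_size=10, hash_key=None)
outer = _CrossedColumn(keys=(inner, 'c'), hash_bucket_size=10, hash_key=None)

assert _collect_leaf_level_keys(outer) == ['a', 'b', 'c']  # nested cross flattened to base keys
```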
+
3479
+
3480
+ class _IndicatorColumn(_DenseColumn, _SequenceDenseColumn,
3481
+ collections.namedtuple('_IndicatorColumn',
3482
+ ['categorical_column'])):
3483
+ """Represents a one-hot column for use in deep networks.
3484
+
3485
+ Args:
3486
+ categorical_column: A `_CategoricalColumn` which is created by
3487
+ `categorical_column_with_*` function.
3488
+ """
3489
+
3490
+ @property
3491
+ def name(self):
3492
+ return '{}_indicator'.format(self.categorical_column.name)
3493
+
3494
+ def _transform_feature(self, inputs):
3495
+ """Returns dense `Tensor` representing feature.
3496
+
3497
+ Args:
3498
+ inputs: A `_LazyBuilder` object to access inputs.
3499
+
3500
+ Returns:
3501
+ Transformed feature `Tensor`.
3502
+
3503
+ Raises:
3504
+ ValueError: if input rank is not known at graph building time.
3505
+ """
3506
+ id_weight_pair = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access
3507
+ id_tensor = id_weight_pair.id_tensor
3508
+ weight_tensor = id_weight_pair.weight_tensor
3509
+
3510
+ # If the underlying column is weighted, return the input as a dense tensor.
3511
+ if weight_tensor is not None:
3512
+ weighted_column = sparse_ops.sparse_merge(
3513
+ sp_ids=id_tensor,
3514
+ sp_values=weight_tensor,
3515
+ vocab_size=int(self._variable_shape[-1]))
3516
+ # Remove (?, -1) index.
3517
+ weighted_column = sparse_ops.sparse_slice(weighted_column, [0, 0],
3518
+ weighted_column.dense_shape)
3519
+ # Use scatter_nd to merge duplicated indices if existed,
3520
+ # instead of sparse_tensor_to_dense.
3521
+ return array_ops.scatter_nd(weighted_column.indices,
3522
+ weighted_column.values,
3523
+ weighted_column.dense_shape)
3524
+
3525
+ dense_id_tensor = sparse_ops.sparse_tensor_to_dense(
3526
+ id_tensor, default_value=-1)
3527
+
3528
+ # One hot must be float for tf.concat reasons since all other inputs to
3529
+ # input_layer are float32.
3530
+ one_hot_id_tensor = array_ops.one_hot(
3531
+ dense_id_tensor,
3532
+ depth=self._variable_shape[-1],
3533
+ on_value=1.0,
3534
+ off_value=0.0)
3535
+
3536
+ # Reduce to get a multi-hot per example.
3537
+ return math_ops.reduce_sum(one_hot_id_tensor, axis=[-2])
3538
+
3539
+ @property
3540
+ def _parse_example_spec(self):
3541
+ return self.categorical_column._parse_example_spec # pylint: disable=protected-access
3542
+
3543
+ @property
3544
+ def _variable_shape(self):
3545
+ """Returns a `TensorShape` representing the shape of the dense `Tensor`."""
3546
+ return tensor_shape.TensorShape([1, self.categorical_column._num_buckets]) # pylint: disable=protected-access
3547
+
3548
+ def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
3549
+ """Returns dense `Tensor` representing feature.
3550
+
3551
+ Args:
3552
+ inputs: A `_LazyBuilder` object to access inputs.
3553
+ weight_collections: Unused `weight_collections` since no variables are
3554
+ created in this function.
3555
+ trainable: Unused `trainable` bool since no variables are created in
3556
+ this function.
3557
+
3558
+ Returns:
3559
+ Dense `Tensor` created within `_transform_feature`.
3560
+
3561
+ Raises:
3562
+ ValueError: If `categorical_column` is a `_SequenceCategoricalColumn`.
3563
+ """
3564
+ # Do nothing with weight_collections and trainable since no variables are
3565
+ # created in this function.
3566
+ del weight_collections
3567
+ del trainable
3568
+ if isinstance(self.categorical_column, _SequenceCategoricalColumn):
3569
+ raise ValueError(
3570
+ 'In indicator_column: {}. '
3571
+ 'categorical_column must not be of type _SequenceCategoricalColumn. '
3572
+ 'Suggested fix A: If you wish to use input_layer, use a '
3573
+ 'non-sequence categorical_column_with_*. '
3574
+ 'Suggested fix B: If you wish to create sequence input, use '
3575
+ 'sequence_input_layer instead of input_layer. '
3576
+ 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
3577
+ self.categorical_column))
3578
+ # Feature has already been transformed. Return the intermediate
3579
+ # representation created by _transform_feature.
3580
+ return inputs.get(self)
3581
+
3582
+ def _get_sequence_dense_tensor(self,
3583
+ inputs,
3584
+ weight_collections=None,
3585
+ trainable=None):
3586
+ # Do nothing with weight_collections and trainable since no variables are
3587
+ # created in this function.
3588
+ del weight_collections
3589
+ del trainable
3590
+ if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
3591
+ raise ValueError(
3592
+ 'In indicator_column: {}. '
3593
+ 'categorical_column must be of type _SequenceCategoricalColumn '
3594
+ 'to use sequence_input_layer. '
3595
+ 'Suggested fix: Use one of sequence_categorical_column_with_*. '
3596
+ 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
3597
+ self.categorical_column))
3598
+ # Feature has already been transformed. Return the intermediate
3599
+ # representation created by _transform_feature.
3600
+ dense_tensor = inputs.get(self)
3601
+ sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access
3602
+ sequence_length = fc_utils.sequence_length_from_sparse_tensor(
3603
+ sparse_tensors.id_tensor)
3604
+ return _SequenceDenseColumn.TensorSequenceLengthPair(
3605
+ dense_tensor=dense_tensor, sequence_length=sequence_length)
3606
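For the unweighted path of `_transform_feature` above (one-hot followed by `reduce_sum`), a NumPy sketch with invented ids shows the resulting multi-hot counts; duplicated ids accumulate.

```python
import numpy as np

num_buckets = 4
ids_per_example = [[0, 2], [1, 1, 3]]

dense = np.zeros((len(ids_per_example), num_buckets), dtype=np.float32)
for row, ids in enumerate(ids_per_example):
    for i in ids:
        dense[row, i] += 1.0  # summing one-hot vectors counts each id

print(dense)  # [[1. 0. 1. 0.]
              #  [0. 2. 0. 1.]]
```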
+
3607
+
3608
+ def _verify_static_batch_size_equality(tensors, columns):
3609
+ """Validates that the first dim (batch size) of all tensors are equal or None.
3610
+
3611
+ Args:
3612
+ tensors: list of tensors to check.
3613
+ columns: list of feature columns matching tensors. Will be used for error
3614
+ messaging.
3615
+
3616
+ Raises:
3617
+ ValueError: if two of the tensors have different static batch sizes.
3618
+ """
3619
+ # batch_size is a tf.compat.v1.Dimension object.
3620
+ expected_batch_size = None
3621
+ for i in range(0, len(tensors)):
3622
+ if tensors[i].shape.dims[0].value is not None:
3623
+ if expected_batch_size is None:
3624
+ batch_size_column_index = i
3625
+ expected_batch_size = tensors[i].shape.dims[0]
3626
+ elif not expected_batch_size.is_compatible_with(tensors[i].shape.dims[0]):
3627
+ raise ValueError(
3628
+ 'Batch size (first dimension) of each feature must be same. '
3629
+ 'Batch size of columns ({}, {}): ({}, {})'.format(
3630
+ columns[batch_size_column_index].name, columns[i].name,
3631
+ expected_batch_size, tensors[i].shape.dims[0]))
3632
+
3633
+
3634
+ class _SequenceCategoricalColumn(_CategoricalColumn,
3635
+ collections.namedtuple(
3636
+ '_SequenceCategoricalColumn',
3637
+ ['categorical_column'])):
3638
+ """Represents sequences of categorical data."""
3639
+
3640
+ @property
3641
+ def name(self):
3642
+ return self.categorical_column.name
3643
+
3644
+ @property
3645
+ def _parse_example_spec(self):
3646
+ return self.categorical_column._parse_example_spec # pylint: disable=protected-access
3647
+
3648
+ def _transform_feature(self, inputs):
3649
+ return self.categorical_column._transform_feature(inputs) # pylint: disable=protected-access
3650
+
3651
+ @property
3652
+ def _num_buckets(self):
3653
+ return self.categorical_column._num_buckets # pylint: disable=protected-access
3654
+
3655
+ def _get_sparse_tensors(self,
3656
+ inputs,
3657
+ weight_collections=None,
3658
+ trainable=None):
3659
+ sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access
3660
+ id_tensor = sparse_tensors.id_tensor
3661
+ weight_tensor = sparse_tensors.weight_tensor
3662
+
3663
+ # Expands third dimension, if necessary so that embeddings are not
3664
+ # combined during embedding lookup. If the tensor is already 3D, leave
3665
+ # as-is.
3666
+ shape = array_ops.shape(id_tensor)
3667
+ # Compute the third dimension explicitly instead of setting it to -1, as
3668
+ # that doesn't work for dynamically shaped tensors with 0-length at runtime.
3669
+ # This happens for empty sequences.
3670
+ target_shape = [shape[0], shape[1], math_ops.reduce_prod(shape[2:])]
3671
+ id_tensor = sparse_ops.sparse_reshape(id_tensor, target_shape)
3672
+ if weight_tensor is not None:
3673
+ weight_tensor = sparse_ops.sparse_reshape(weight_tensor, target_shape)
3674
+
3675
+ return _CategoricalColumn.IdWeightPair(id_tensor, weight_tensor)
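A last sketch of the `target_shape` computed in `_get_sparse_tensors` above (dense shapes invented): trailing dimensions are collapsed so each timestep keeps its own ids during the later embedding lookup.

```python
import numpy as np

def target_shape(dense_shape):
    # mirrors [shape[0], shape[1], reduce_prod(shape[2:])] from the code above
    s = np.asarray(dense_shape)
    return [int(s[0]), int(s[1]), int(np.prod(s[2:]))]

assert target_shape([4, 6]) == [4, 6, 1]        # already [batch, seq_len] -> one id per step
assert target_shape([4, 6, 2, 3]) == [4, 6, 6]  # extra dims flattened per step
```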