PyPI - easy-cs-rec-custommodel - Versions diffs - 0.8.6__py2.py3-none-any.whl - Mend

easy-cs-rec-custommodel 0.8.6__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of easy-cs-rec-custommodel might be problematic. Click here for more details.

Files changed (336) hide show

easy_cs_rec_custommodel-0.8.6.dist-info/LICENSE +203 -0
easy_cs_rec_custommodel-0.8.6.dist-info/METADATA +48 -0
easy_cs_rec_custommodel-0.8.6.dist-info/RECORD +336 -0
easy_cs_rec_custommodel-0.8.6.dist-info/WHEEL +6 -0
easy_cs_rec_custommodel-0.8.6.dist-info/top_level.txt +2 -0
easy_rec/__init__.py +114 -0
easy_rec/python/__init__.py +0 -0
easy_rec/python/builders/__init__.py +0 -0
easy_rec/python/builders/hyperparams_builder.py +78 -0
easy_rec/python/builders/loss_builder.py +333 -0
easy_rec/python/builders/optimizer_builder.py +211 -0
easy_rec/python/builders/strategy_builder.py +44 -0
easy_rec/python/compat/__init__.py +0 -0
easy_rec/python/compat/adam_s.py +245 -0
easy_rec/python/compat/array_ops.py +229 -0
easy_rec/python/compat/dynamic_variable.py +542 -0
easy_rec/python/compat/early_stopping.py +653 -0
easy_rec/python/compat/embedding_ops.py +162 -0
easy_rec/python/compat/embedding_parallel_saver.py +316 -0
easy_rec/python/compat/estimator_train.py +116 -0
easy_rec/python/compat/exporter.py +473 -0
easy_rec/python/compat/feature_column/__init__.py +0 -0
easy_rec/python/compat/feature_column/feature_column.py +3675 -0
easy_rec/python/compat/feature_column/feature_column_v2.py +5233 -0
easy_rec/python/compat/feature_column/sequence_feature_column.py +648 -0
easy_rec/python/compat/feature_column/utils.py +154 -0
easy_rec/python/compat/layers.py +329 -0
easy_rec/python/compat/ops.py +14 -0
easy_rec/python/compat/optimizers.py +619 -0
easy_rec/python/compat/queues.py +311 -0
easy_rec/python/compat/regularizers.py +208 -0
easy_rec/python/compat/sok_optimizer.py +440 -0
easy_rec/python/compat/sync_replicas_optimizer.py +528 -0
easy_rec/python/compat/weight_decay_optimizers.py +475 -0
easy_rec/python/core/__init__.py +0 -0
easy_rec/python/core/easyrec_metrics/__init__.py +24 -0
easy_rec/python/core/easyrec_metrics/distribute_metrics_impl_pai.py +3702 -0
easy_rec/python/core/easyrec_metrics/distribute_metrics_impl_tf.py +3768 -0
easy_rec/python/core/learning_schedules.py +228 -0
easy_rec/python/core/metrics.py +402 -0
easy_rec/python/core/sampler.py +844 -0
easy_rec/python/eval.py +102 -0
easy_rec/python/export.py +150 -0
easy_rec/python/feature_column/__init__.py +0 -0
easy_rec/python/feature_column/feature_column.py +664 -0
easy_rec/python/feature_column/feature_group.py +89 -0
easy_rec/python/hpo/__init__.py +0 -0
easy_rec/python/hpo/emr_hpo.py +140 -0
easy_rec/python/hpo/generate_hpo_sql.py +71 -0
easy_rec/python/hpo/pai_hpo.py +297 -0
easy_rec/python/inference/__init__.py +0 -0
easy_rec/python/inference/csv_predictor.py +189 -0
easy_rec/python/inference/hive_parquet_predictor.py +200 -0
easy_rec/python/inference/hive_predictor.py +166 -0
easy_rec/python/inference/odps_predictor.py +70 -0
easy_rec/python/inference/parquet_predictor.py +147 -0
easy_rec/python/inference/parquet_predictor_v2.py +147 -0
easy_rec/python/inference/predictor.py +621 -0
easy_rec/python/inference/processor/__init__.py +0 -0
easy_rec/python/inference/processor/test.py +170 -0
easy_rec/python/inference/vector_retrieve.py +124 -0
easy_rec/python/input/__init__.py +0 -0
easy_rec/python/input/batch_tfrecord_input.py +117 -0
easy_rec/python/input/criteo_binary_reader.py +259 -0
easy_rec/python/input/criteo_input.py +107 -0
easy_rec/python/input/csv_input.py +175 -0
easy_rec/python/input/csv_input_ex.py +72 -0
easy_rec/python/input/csv_input_v2.py +68 -0
easy_rec/python/input/datahub_input.py +320 -0
easy_rec/python/input/dummy_input.py +58 -0
easy_rec/python/input/hive_input.py +123 -0
easy_rec/python/input/hive_parquet_input.py +140 -0
easy_rec/python/input/hive_rtp_input.py +174 -0
easy_rec/python/input/input.py +1064 -0
easy_rec/python/input/kafka_dataset.py +144 -0
easy_rec/python/input/kafka_input.py +235 -0
easy_rec/python/input/load_parquet.py +317 -0
easy_rec/python/input/odps_input.py +101 -0
easy_rec/python/input/odps_input_v2.py +110 -0
easy_rec/python/input/odps_input_v3.py +132 -0
easy_rec/python/input/odps_rtp_input.py +187 -0
easy_rec/python/input/odps_rtp_input_v2.py +104 -0
easy_rec/python/input/parquet_input.py +397 -0
easy_rec/python/input/parquet_input_v2.py +180 -0
easy_rec/python/input/parquet_input_v3.py +203 -0
easy_rec/python/input/rtp_input.py +225 -0
easy_rec/python/input/rtp_input_v2.py +145 -0
easy_rec/python/input/tfrecord_input.py +100 -0
easy_rec/python/layers/__init__.py +0 -0
easy_rec/python/layers/backbone.py +571 -0
easy_rec/python/layers/capsule_layer.py +176 -0
easy_rec/python/layers/cmbf.py +390 -0
easy_rec/python/layers/common_layers.py +192 -0
easy_rec/python/layers/dnn.py +87 -0
easy_rec/python/layers/embed_input_layer.py +25 -0
easy_rec/python/layers/fm.py +26 -0
easy_rec/python/layers/input_layer.py +396 -0
easy_rec/python/layers/keras/__init__.py +34 -0
easy_rec/python/layers/keras/activation.py +114 -0
easy_rec/python/layers/keras/attention.py +267 -0
easy_rec/python/layers/keras/auxiliary_loss.py +47 -0
easy_rec/python/layers/keras/blocks.py +262 -0
easy_rec/python/layers/keras/bst.py +119 -0
easy_rec/python/layers/keras/custom_ops.py +250 -0
easy_rec/python/layers/keras/data_augment.py +133 -0
easy_rec/python/layers/keras/din.py +67 -0
easy_rec/python/layers/keras/einsum_dense.py +598 -0
easy_rec/python/layers/keras/embedding.py +81 -0
easy_rec/python/layers/keras/fibinet.py +251 -0
easy_rec/python/layers/keras/interaction.py +416 -0
easy_rec/python/layers/keras/layer_norm.py +364 -0
easy_rec/python/layers/keras/mask_net.py +166 -0
easy_rec/python/layers/keras/multi_head_attention.py +717 -0
easy_rec/python/layers/keras/multi_task.py +125 -0
easy_rec/python/layers/keras/numerical_embedding.py +376 -0
easy_rec/python/layers/keras/ppnet.py +194 -0
easy_rec/python/layers/keras/transformer.py +192 -0
easy_rec/python/layers/layer_norm.py +51 -0
easy_rec/python/layers/mmoe.py +83 -0
easy_rec/python/layers/multihead_attention.py +162 -0
easy_rec/python/layers/multihead_cross_attention.py +749 -0
easy_rec/python/layers/senet.py +73 -0
easy_rec/python/layers/seq_input_layer.py +134 -0
easy_rec/python/layers/sequence_feature_layer.py +249 -0
easy_rec/python/layers/uniter.py +301 -0
easy_rec/python/layers/utils.py +248 -0
easy_rec/python/layers/variational_dropout_layer.py +130 -0
easy_rec/python/loss/__init__.py +0 -0
easy_rec/python/loss/circle_loss.py +82 -0
easy_rec/python/loss/contrastive_loss.py +79 -0
easy_rec/python/loss/f1_reweight_loss.py +38 -0
easy_rec/python/loss/focal_loss.py +93 -0
easy_rec/python/loss/jrc_loss.py +128 -0
easy_rec/python/loss/listwise_loss.py +161 -0
easy_rec/python/loss/multi_similarity.py +68 -0
easy_rec/python/loss/pairwise_loss.py +307 -0
easy_rec/python/loss/softmax_loss_with_negative_mining.py +110 -0
easy_rec/python/loss/zero_inflated_lognormal.py +76 -0
easy_rec/python/main.py +878 -0
easy_rec/python/model/__init__.py +0 -0
easy_rec/python/model/autoint.py +73 -0
easy_rec/python/model/cmbf.py +47 -0
easy_rec/python/model/collaborative_metric_learning.py +182 -0
easy_rec/python/model/custom_model.py +323 -0
easy_rec/python/model/dat.py +138 -0
easy_rec/python/model/dbmtl.py +116 -0
easy_rec/python/model/dcn.py +70 -0
easy_rec/python/model/deepfm.py +106 -0
easy_rec/python/model/dlrm.py +73 -0
easy_rec/python/model/dropoutnet.py +207 -0
easy_rec/python/model/dssm.py +154 -0
easy_rec/python/model/dssm_senet.py +143 -0
easy_rec/python/model/dummy_model.py +48 -0
easy_rec/python/model/easy_rec_estimator.py +739 -0
easy_rec/python/model/easy_rec_model.py +467 -0
easy_rec/python/model/esmm.py +242 -0
easy_rec/python/model/fm.py +63 -0
easy_rec/python/model/match_model.py +357 -0
easy_rec/python/model/mind.py +445 -0
easy_rec/python/model/mmoe.py +70 -0
easy_rec/python/model/multi_task_model.py +303 -0
easy_rec/python/model/multi_tower.py +62 -0
easy_rec/python/model/multi_tower_bst.py +190 -0
easy_rec/python/model/multi_tower_din.py +130 -0
easy_rec/python/model/multi_tower_recall.py +68 -0
easy_rec/python/model/pdn.py +203 -0
easy_rec/python/model/ple.py +120 -0
easy_rec/python/model/rank_model.py +485 -0
easy_rec/python/model/rocket_launching.py +203 -0
easy_rec/python/model/simple_multi_task.py +54 -0
easy_rec/python/model/uniter.py +46 -0
easy_rec/python/model/wide_and_deep.py +121 -0
easy_rec/python/ops/1.12/incr_record.so +0 -0
easy_rec/python/ops/1.12/kafka.so +0 -0
easy_rec/python/ops/1.12/libcustom_ops.so +0 -0
easy_rec/python/ops/1.12/libembed_op.so +0 -0
easy_rec/python/ops/1.12/libhiredis.so.1.0.0 +0 -0
easy_rec/python/ops/1.12/librdkafka++.so.1 +0 -0
easy_rec/python/ops/1.12/librdkafka.so.1 +0 -0
easy_rec/python/ops/1.12/libredis++.so +0 -0
easy_rec/python/ops/1.12/libredis++.so.1 +0 -0
easy_rec/python/ops/1.12/libredis++.so.1.2.3 +0 -0
easy_rec/python/ops/1.12/libstr_avx_op.so +0 -0
easy_rec/python/ops/1.12/libwrite_sparse_kv.so +0 -0
easy_rec/python/ops/1.15/incr_record.so +0 -0
easy_rec/python/ops/1.15/kafka.so +0 -0
easy_rec/python/ops/1.15/libcustom_ops.so +0 -0
easy_rec/python/ops/1.15/libembed_op.so +0 -0
easy_rec/python/ops/1.15/libhiredis.so.1.0.0 +0 -0
easy_rec/python/ops/1.15/librdkafka++.so +0 -0
easy_rec/python/ops/1.15/librdkafka++.so.1 +0 -0
easy_rec/python/ops/1.15/librdkafka.so +0 -0
easy_rec/python/ops/1.15/librdkafka.so.1 +0 -0
easy_rec/python/ops/1.15/libredis++.so.1 +0 -0
easy_rec/python/ops/1.15/libstr_avx_op.so +0 -0
easy_rec/python/ops/2.12/libcustom_ops.so +0 -0
easy_rec/python/ops/2.12/libload_embed.so +0 -0
easy_rec/python/ops/2.12/libstr_avx_op.so +0 -0
easy_rec/python/ops/__init__.py +0 -0
easy_rec/python/ops/gen_kafka_ops.py +193 -0
easy_rec/python/ops/gen_str_avx_op.py +28 -0
easy_rec/python/ops/incr_record.py +30 -0
easy_rec/python/predict.py +170 -0
easy_rec/python/protos/__init__.py +0 -0
easy_rec/python/protos/autoint_pb2.py +122 -0
easy_rec/python/protos/backbone_pb2.py +1416 -0
easy_rec/python/protos/cmbf_pb2.py +435 -0
easy_rec/python/protos/collaborative_metric_learning_pb2.py +252 -0
easy_rec/python/protos/custom_model_pb2.py +57 -0
easy_rec/python/protos/dat_pb2.py +262 -0
easy_rec/python/protos/data_source_pb2.py +422 -0
easy_rec/python/protos/dataset_pb2.py +1920 -0
easy_rec/python/protos/dbmtl_pb2.py +191 -0
easy_rec/python/protos/dcn_pb2.py +197 -0
easy_rec/python/protos/deepfm_pb2.py +163 -0
easy_rec/python/protos/dlrm_pb2.py +163 -0
easy_rec/python/protos/dnn_pb2.py +329 -0
easy_rec/python/protos/dropoutnet_pb2.py +239 -0
easy_rec/python/protos/dssm_pb2.py +262 -0
easy_rec/python/protos/dssm_senet_pb2.py +282 -0
easy_rec/python/protos/easy_rec_model_pb2.py +1672 -0
easy_rec/python/protos/esmm_pb2.py +133 -0
easy_rec/python/protos/eval_pb2.py +930 -0
easy_rec/python/protos/export_pb2.py +379 -0
easy_rec/python/protos/feature_config_pb2.py +1359 -0
easy_rec/python/protos/fm_pb2.py +90 -0
easy_rec/python/protos/hive_config_pb2.py +138 -0
easy_rec/python/protos/hyperparams_pb2.py +624 -0
easy_rec/python/protos/keras_layer_pb2.py +692 -0
easy_rec/python/protos/layer_pb2.py +1936 -0
easy_rec/python/protos/loss_pb2.py +1713 -0
easy_rec/python/protos/mind_pb2.py +497 -0
easy_rec/python/protos/mmoe_pb2.py +215 -0
easy_rec/python/protos/multi_tower_pb2.py +295 -0
easy_rec/python/protos/multi_tower_recall_pb2.py +198 -0
easy_rec/python/protos/optimizer_pb2.py +2017 -0
easy_rec/python/protos/pdn_pb2.py +293 -0
easy_rec/python/protos/pipeline_pb2.py +516 -0
easy_rec/python/protos/ple_pb2.py +231 -0
easy_rec/python/protos/predict_pb2.py +1140 -0
easy_rec/python/protos/rocket_launching_pb2.py +169 -0
easy_rec/python/protos/seq_encoder_pb2.py +1084 -0
easy_rec/python/protos/simi_pb2.py +54 -0
easy_rec/python/protos/simple_multi_task_pb2.py +97 -0
easy_rec/python/protos/tf_predict_pb2.py +630 -0
easy_rec/python/protos/tower_pb2.py +661 -0
easy_rec/python/protos/train_pb2.py +1197 -0
easy_rec/python/protos/uniter_pb2.py +307 -0
easy_rec/python/protos/variational_dropout_pb2.py +91 -0
easy_rec/python/protos/wide_and_deep_pb2.py +131 -0
easy_rec/python/test/__init__.py +0 -0
easy_rec/python/test/csv_input_test.py +340 -0
easy_rec/python/test/custom_early_stop_func.py +19 -0
easy_rec/python/test/dh_local_run.py +104 -0
easy_rec/python/test/embed_test.py +155 -0
easy_rec/python/test/emr_run.py +119 -0
easy_rec/python/test/eval_metric_test.py +107 -0
easy_rec/python/test/excel_convert_test.py +64 -0
easy_rec/python/test/export_test.py +513 -0
easy_rec/python/test/fg_test.py +70 -0
easy_rec/python/test/hive_input_test.py +311 -0
easy_rec/python/test/hpo_test.py +235 -0
easy_rec/python/test/kafka_test.py +373 -0
easy_rec/python/test/local_incr_test.py +122 -0
easy_rec/python/test/loss_test.py +110 -0
easy_rec/python/test/odps_command.py +61 -0
easy_rec/python/test/odps_local_run.py +86 -0
easy_rec/python/test/odps_run.py +254 -0
easy_rec/python/test/odps_test_cls.py +39 -0
easy_rec/python/test/odps_test_prepare.py +198 -0
easy_rec/python/test/odps_test_util.py +237 -0
easy_rec/python/test/pre_check_test.py +54 -0
easy_rec/python/test/predictor_test.py +394 -0
easy_rec/python/test/rtp_convert_test.py +133 -0
easy_rec/python/test/run.py +138 -0
easy_rec/python/test/train_eval_test.py +1299 -0
easy_rec/python/test/util_test.py +85 -0
easy_rec/python/test/zero_inflated_lognormal_test.py +53 -0
easy_rec/python/tools/__init__.py +0 -0
easy_rec/python/tools/add_boundaries_to_config.py +67 -0
easy_rec/python/tools/add_feature_info_to_config.py +145 -0
easy_rec/python/tools/convert_config_format.py +48 -0
easy_rec/python/tools/convert_rtp_data.py +79 -0
easy_rec/python/tools/convert_rtp_fg.py +106 -0
easy_rec/python/tools/create_config_from_excel.py +427 -0
easy_rec/python/tools/criteo/__init__.py +0 -0
easy_rec/python/tools/criteo/convert_data.py +157 -0
easy_rec/python/tools/edit_lookup_graph.py +134 -0
easy_rec/python/tools/faiss_index_pai.py +116 -0
easy_rec/python/tools/feature_selection.py +316 -0
easy_rec/python/tools/hit_rate_ds.py +223 -0
easy_rec/python/tools/hit_rate_pai.py +138 -0
easy_rec/python/tools/pre_check.py +120 -0
easy_rec/python/tools/predict_and_chk.py +111 -0
easy_rec/python/tools/read_kafka.py +55 -0
easy_rec/python/tools/split_model_pai.py +286 -0
easy_rec/python/tools/split_pdn_model_pai.py +272 -0
easy_rec/python/tools/test_saved_model.py +80 -0
easy_rec/python/tools/view_saved_model.py +39 -0
easy_rec/python/tools/write_kafka.py +65 -0
easy_rec/python/train_eval.py +325 -0
easy_rec/python/utils/__init__.py +15 -0
easy_rec/python/utils/activation.py +120 -0
easy_rec/python/utils/check_utils.py +87 -0
easy_rec/python/utils/compat.py +14 -0
easy_rec/python/utils/config_util.py +652 -0
easy_rec/python/utils/constant.py +43 -0
easy_rec/python/utils/convert_rtp_fg.py +616 -0
easy_rec/python/utils/dag.py +192 -0
easy_rec/python/utils/distribution_utils.py +268 -0
easy_rec/python/utils/ds_util.py +65 -0
easy_rec/python/utils/embedding_utils.py +73 -0
easy_rec/python/utils/estimator_utils.py +1036 -0
easy_rec/python/utils/export_big_model.py +630 -0
easy_rec/python/utils/expr_util.py +118 -0
easy_rec/python/utils/fg_util.py +53 -0
easy_rec/python/utils/hit_rate_utils.py +220 -0
easy_rec/python/utils/hive_utils.py +183 -0
easy_rec/python/utils/hpo_util.py +137 -0
easy_rec/python/utils/hvd_utils.py +56 -0
easy_rec/python/utils/input_utils.py +108 -0
easy_rec/python/utils/io_util.py +282 -0
easy_rec/python/utils/load_class.py +249 -0
easy_rec/python/utils/meta_graph_editor.py +941 -0
easy_rec/python/utils/multi_optimizer.py +62 -0
easy_rec/python/utils/numpy_utils.py +18 -0
easy_rec/python/utils/odps_util.py +79 -0
easy_rec/python/utils/pai_util.py +86 -0
easy_rec/python/utils/proto_util.py +90 -0
easy_rec/python/utils/restore_filter.py +89 -0
easy_rec/python/utils/shape_utils.py +432 -0
easy_rec/python/utils/static_shape.py +71 -0
easy_rec/python/utils/test_utils.py +866 -0
easy_rec/python/utils/tf_utils.py +56 -0
easy_rec/version.py +4 -0
test/__init__.py +0 -0

easy_rec/python/input/parquet_input.py ADDED Viewed

@@ -0,0 +1,397 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
+import multiprocessing
+import queue
+import time
+import tensorflow as tf
+from tensorflow.python.ops import array_ops
+from easy_rec.python.compat import queues
+from easy_rec.python.input import load_parquet
+from easy_rec.python.input.input import Input
+# Under TensorFlow 2.x rebind `tf` to the v1 compatibility API; the code below
+# uses v1-style names such as tf.gfile.Glob.
+# NOTE(review): this compares version strings lexicographically, not
+# numerically - confirm that is adequate for the supported TF versions.
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+class ParquetInput(Input):
+  def __init__(self,
+               data_config,
+               feature_config,
+               input_path,
+               task_index=0,
+               task_num=1,
+               check_mode=False,
+               pipeline_config=None,
+               **kwargs):
+    """Collect feature metadata, shard the input files and create the queues.
+    Args:
+      data_config: dataset config, `prefetch_size` bounds the data queue.
+      feature_config: forwarded unchanged to the base `Input` class.
+      input_path: comma separated glob patterns of the input files, or None,
+        in which case no file is listed and no queue is created.
+      task_index: index of this worker, it takes the files whose position
+        modulo `task_num` equals `task_index`.
+      task_num: number of workers sharing the input files.
+      check_mode: forwarded unchanged to the base `Input` class.
+      pipeline_config: forwarded unchanged to the base `Input` class.
+      **kwargs: forwarded to the base class; `reserve_fields` together with
+        `reserve_types`, and `is_predictor`, are also read here.
+    Raises:
+      AssertionError: if a feature config is neither IdFeature, TagFeature
+        nor RawFeature.
+    """
+    super(ParquetInput,
+          self).__init__(data_config, feature_config, input_path, task_index,
+                         task_num, check_mode, pipeline_config, **kwargs)
+    self._need_pack = True
+    # Everything that does not depend on input_path is initialised before the
+    # early return below, so that stop() and the serving branch of
+    # create_input() can read it on an instance built with input_path=None.
+    self._proc_arr = None
+    self._proc_start = False
+    self._proc_stop = False
+    self._reserve_fields = None
+    self._reserve_types = None
+    if 'reserve_fields' in kwargs and 'reserve_types' in kwargs:
+      self._reserve_fields = kwargs['reserve_fields']
+      self._reserve_types = kwargs['reserve_types']
+    # whether this input is created by a Predictor
+    self._is_predictor = kwargs.get('is_predictor', False)
+    self._sparse_fea_names = []
+    self._dense_fea_names = []
+    self._dense_fea_cfgs = []
+    self._total_dense_fea_dim = 0
+    for fc in self._feature_configs:
+      feature_type = fc.feature_type
+      if feature_type in [fc.IdFeature, fc.TagFeature]:
+        self._sparse_fea_names.append(fc.input_names[0])
+      elif feature_type in [fc.RawFeature]:
+        self._dense_fea_names.append(fc.input_names[0])
+        self._dense_fea_cfgs.append(fc)
+        self._total_dense_fea_dim += fc.raw_input_dim
+      else:
+        assert False, 'feature_type[%s] not supported' % str(feature_type)
+    if input_path is None:
+      return
+    self._input_files = []
+    for sub_path in input_path.strip().split(','):
+      self._input_files.extend(tf.gfile.Glob(sub_path))
+    logging.info('parquet input_path=%s file_num=%d' %
+                 (input_path, len(self._input_files)))
+    mp_ctxt = multiprocessing.get_context('spawn')
+    self._data_que = queues.Queue(
+        name='data_que', ctx=mp_ctxt, maxsize=self._data_config.prefetch_size)
+    file_num = len(self._input_files)
+    logging.info('[task_index=%d] total_file_num=%d task_num=%d' %
+                 (task_index, file_num, task_num))
+    # round-robin sharding of the files over the workers
+    self._my_files = [
+        input_file for file_id, input_file in enumerate(self._input_files)
+        if (file_id % task_num) == task_index
+    ]
+    logging.info('[task_index=%d] task_file_num=%d' %
+                 (task_index, len(self._my_files)))
+    self._file_que = queues.Queue(name='file_que', ctx=mp_ctxt)
+    # at most 8 reader processes, and never more than the total file count
+    self._num_proc = min(8, file_num)
+    self._proc_start_que = queues.Queue(name='proc_start_que', ctx=mp_ctxt)
+    self._proc_stop_que = queues.Queue(name='proc_stop_que', ctx=mp_ctxt)
+  def _rebuild_que(self):
+    """Replace the four inter-process queues with freshly created ones."""
+    spawn_ctx = multiprocessing.get_context('spawn')
+    # only the data queue is bounded, by the configured prefetch size
+    self._data_que = queues.Queue(
+        name='data_que',
+        ctx=spawn_ctx,
+        maxsize=self._data_config.prefetch_size)
+    for que_name in ('file_que', 'proc_start_que', 'proc_stop_que'):
+      setattr(self, '_' + que_name, queues.Queue(name=que_name, ctx=spawn_ctx))
+  def _sample_generator(self):
+    """Yield the batches that the reader processes put on the data queue.
+    On the first call one start signal per reader process is put on the start
+    queue. Every `None` taken from the data queue is counted as one finished
+    process. The generator ends at the first one-second timeout that happens
+    once as many `None` items were received as there are processes, or as
+    soon as an unexpected exception is raised inside the loop.
+    Yields:
+      each non-None item taken from the data queue.
+    """
+    if not self._proc_start:
+      self._proc_start = True
+      for proc in self._proc_arr:
+        self._proc_start_que.put(True)
+        logging.info('task[%s] data_proc=%s is_alive=%s' %
+                     (self._task_index, proc, proc.is_alive()))
+    done_proc_cnt = 0
+    fetch_timeout_cnt = 0
+    fetch_good_cnt = 0
+    while True:
+      try:
+        sample = self._data_que.get(timeout=1)
+        if sample is None:
+          done_proc_cnt += 1
+        else:
+          fetch_good_cnt += 1
+          yield sample
+          # Report progress only after a real batch; doing it for the None
+          # end markers as well repeated the line for an unchanged counter
+          # and emitted it while the counter was still 0.
+          if fetch_good_cnt % 200 == 0:
+            logging.info(
+                'task[%d] fetch_batch_cnt=%d, fetch_timeout_cnt=%d, qsize=%d' %
+                (self._task_index, fetch_good_cnt, fetch_timeout_cnt,
+                 self._data_que.qsize()))
+      except queue.Empty:
+        fetch_timeout_cnt += 1
+        if done_proc_cnt >= len(self._proc_arr):
+          logging.info('all sample finished, fetch_timeout_cnt=%d' %
+                       fetch_timeout_cnt)
+          break
+      except Exception as ex:
+        logging.warning('task[%d] get from data_que exception: %s' %
+                        (self._task_index, str(ex)))
+        break
+    logging.info('task[%d] sample_generator: total_batches=%d' %
+                 (self._task_index, fetch_good_cnt))
+  def stop(self):
+    """Shut down the reader processes and recreate the queues.
+    Does nothing when there is no reader process. Otherwise the file queue is
+    closed and, if the processes were started by the sample generator, one
+    stop signal per process is sent, the data queue is drained until no
+    process is alive, the processes are joined, and the queues and state
+    flags are reset.
+    NOTE(review): when the processes were created but never started
+    (`_proc_start` is False) only the file queue is closed; the processes are
+    neither signalled nor joined and `_proc_arr` is kept - confirm this is
+    intended.
+    """
+    if self._proc_arr is None or len(self._proc_arr) == 0:
+      return
+    logging.info('task[%d] will stop dataset procs, proc_num=%d' %
+                 (self._task_index, len(self._proc_arr)))
+    self._file_que.close()
+    if self._proc_start:
+      logging.info('try close data que')
+      # one stop signal for each reader process
+      for _ in range(len(self._proc_arr)):
+        self._proc_stop_que.put(1)
+      self._proc_stop_que.close()
+      def _any_alive():
+        # True while at least one reader process is still running
+        for proc in self._proc_arr:
+          if proc.is_alive():
+            return True
+        return False
+      # to ensure the sender part of the python Queue could exit
+      while _any_alive():
+        try:
+          self._data_que.get(timeout=1)
+        except Exception:
+          # best effort drain: timeouts and other errors are ignored on purpose
+          pass
+      time.sleep(1)
+      self._data_que.close()
+      logging.info('data que closed')
+      # all processes have exited at this point, join them without terminate
+      for proc in self._proc_arr:
+        proc.join()
+      logging.info('join proc done')
+      # rebuild for next run, which is necessary for evaluation
+      self._rebuild_que()
+      self._proc_arr = None
+      self._proc_start = False
+      self._proc_stop = False
+  def _to_fea_dict(self, input_dict):
+    """Regroup one generated batch into feature, label and reserve parts.
+    Args:
+      input_dict: one batch from the generator. `sparse_fea`, when sparse
+        features exist, is indexed as [0]=lengths and [1]=values; the dict may
+        also hold `dense_fea`, the label fields and `reserve`.
+    Returns:
+      dict with key 'feature', plus 'label' when at least one label field is
+      present in `input_dict`, plus 'reserve' when reserve fields are set.
+      'feature'['sparse_fea'] is the pair (values, lengths); unless
+      `self._has_ev` is true the values are taken modulo num_buckets.
+    Raises:
+      AssertionError: if feature configs declare different positive
+        num_buckets.
+      ValueError: if the values must be taken modulo num_buckets but no
+        feature config declares a positive num_buckets.
+    """
+    fea_dict = {}
+    if len(self._sparse_fea_names) > 0:
+      tmp_vals, tmp_lens = input_dict['sparse_fea'][1], input_dict[
+          'sparse_fea'][0]
+      if not self._has_ev:
+        num_buckets = -1
+        for fc in self._feature_configs:
+          if fc.num_buckets > 0:
+            if num_buckets < 0:
+              num_buckets = fc.num_buckets
+            else:
+              assert num_buckets == fc.num_buckets, 'all features must share the same buckets, but are %d and %s' % (
+                  num_buckets, str(fc))
+        # Without this check the values would be taken modulo -1, which
+        # silently turns every id into the same value.
+        if num_buckets <= 0:
+          raise ValueError(
+              'no feature config declares a positive num_buckets, '
+              'which is required to bucketize sparse_fea')
+        tmp_vals = tmp_vals % num_buckets
+      fea_dict['sparse_fea'] = (tmp_vals, tmp_lens)
+    if len(self._dense_fea_names) > 0:
+      fea_dict['dense_fea'] = input_dict['dense_fea']
+    output_dict = {'feature': fea_dict}
+    lbl_dict = {
+        lbl_name: input_dict[lbl_name]
+        for lbl_name in self._label_fields
+        if lbl_name in input_dict
+    }
+    if len(lbl_dict) > 0:
+      output_dict['label'] = lbl_dict
+    if self._reserve_fields is not None:
+      output_dict['reserve'] = input_dict['reserve']
+    return output_dict
+  def add_fea_type_and_shape(self, out_types, out_shapes):
+    """Add the dtype and shape of the feature entries to the given dicts.
+    Args:
+      out_types: dict updated in place with the dtypes of the feature entries.
+      out_shapes: dict updated in place with the shapes of the feature entries.
+    """
+    has_sparse = len(self._sparse_fea_names) > 0
+    has_dense = len(self._dense_fea_names) > 0
+    if has_sparse:
+      # every sparse feature is packed into the single tuple sparse_fea:
+      #   (field lengths, field values)
+      lens_shape = tf.TensorShape([None])
+      vals_shape = tf.TensorShape([None])
+      out_types['sparse_fea'] = (tf.int32, tf.int64)
+      out_shapes['sparse_fea'] = (lens_shape, vals_shape)
+    if has_dense:
+      dense_dims = [None, self._total_dense_fea_dim]
+      out_types['dense_fea'] = tf.float32
+      out_shapes['dense_fea'] = tf.TensorShape(dense_dims)
+  def _build(self, mode, params):
+    """Start the reader processes and build the dataset on top of them.
+    Args:
+      mode: one of tf.estimator.ModeKeys.(TRAIN, EVAL, PREDICT).
+      params: not used in this method.
+    Returns:
+      a dataset of (features, labels) for TRAIN / EVAL; for PREDICT a dataset
+      of the output of `_get_for_predictor` or `_get_features`.
+    """
+    # In TRAIN mode the file list is repeated once per epoch.
+    if mode == tf.estimator.ModeKeys.TRAIN and self._data_config.num_epochs > 1:
+      logging.info('will repeat train data for %d epochs' %
+                   self._data_config.num_epochs)
+      my_files = self._my_files * self._data_config.num_epochs
+    else:
+      my_files = self._my_files
+    # drop_remainder is only taken from the config in TRAIN mode, and no label
+    # field is requested from the readers in PREDICT mode.
+    if mode == tf.estimator.ModeKeys.TRAIN:
+      drop_remainder = self._data_config.drop_remainder
+      lbl_fields = self._label_fields
+    else:
+      lbl_fields = self._label_fields
+      if mode == tf.estimator.ModeKeys.PREDICT:
+        lbl_fields = None
+      drop_remainder = False
+    self._proc_arr = load_parquet.start_data_proc(
+        self._task_index,
+        self._task_num,
+        self._num_proc,
+        self._file_que,
+        self._data_que,
+        self._proc_start_que,
+        self._proc_stop_que,
+        self._batch_size,
+        lbl_fields,
+        # self._effective_fields,
+        self._sparse_fea_names,
+        self._dense_fea_names,
+        self._dense_fea_cfgs,
+        self._reserve_fields,
+        drop_remainder,
+        need_pack=self._need_pack)
+    for input_file in my_files:
+      self._file_que.put(input_file)
+    # add end signal
+    for proc in self._proc_arr:
+      self._file_que.put(None)
+    logging.info('add input_files to file_que, qsize=%d' %
+                 self._file_que.qsize())
+    # Declare the structure of the items produced by _sample_generator.
+    out_types = {}
+    out_shapes = {}
+    if mode != tf.estimator.ModeKeys.PREDICT:
+      # NOTE(review): every label is declared as int32 - confirm that no
+      # float label is expected with this input.
+      for k in self._label_fields:
+        out_types[k] = tf.int32
+        out_shapes[k] = tf.TensorShape([None])
+    if self._reserve_fields is not None:
+      out_types['reserve'] = {}
+      out_shapes['reserve'] = {}
+      for k, t in zip(self._reserve_fields, self._reserve_types):
+        out_types['reserve'][k] = t
+        out_shapes['reserve'][k] = tf.TensorShape([None])
+    self.add_fea_type_and_shape(out_types, out_shapes)
+    dataset = tf.data.Dataset.from_generator(
+        self._sample_generator,
+        output_types=out_types,
+        output_shapes=out_shapes)
+    num_parallel_calls = self._data_config.num_parallel_calls
+    dataset = dataset.map(
+        self._to_fea_dict, num_parallel_calls=num_parallel_calls)
+    dataset = dataset.prefetch(buffer_size=self._prefetch_size)
+    # Note: Input._preprocess is currently not supported as all features
+    #      are concatenated together
+    # dataset = dataset.map(
+    #     map_func=self._preprocess, num_parallel_calls=num_parallel_calls)
+    if mode != tf.estimator.ModeKeys.PREDICT:
+      dataset = dataset.map(lambda x:
+                            (self._get_features(x), self._get_labels(x)))
+      # initial test show that prefetch to gpu has no performance gain
+      # dataset = dataset.apply(tf.data.experimental.prefetch_to_device('/gpu:0'))
+    else:
+      if self._is_predictor:
+        dataset = dataset.map(self._get_for_predictor)
+      else:
+        dataset = dataset.map(lambda x: self._get_features(x))
+      # second prefetch, applied in PREDICT mode only
+      dataset = dataset.prefetch(buffer_size=self._prefetch_size)
+    return dataset
+  def _get_for_predictor(self, fea_dict):
+    """Rename the packed sparse feature pair for use by a predictor.
+    Args:
+      fea_dict: dict with a 'feature' entry holding 'sparse_fea', and a
+        'reserve' entry when reserve fields are set.
+    Returns:
+      dict whose 'feature' entry maps 'ragged_ids' / 'ragged_lens' to the
+      first / second element of 'sparse_fea'; 'reserve' is copied over when
+      this input belongs to a predictor and reserve fields are set.
+    """
+    sparse_pair = fea_dict['feature']['sparse_fea']
+    ragged_fea = {}
+    ragged_fea['ragged_ids'] = sparse_pair[0]
+    ragged_fea['ragged_lens'] = sparse_pair[1]
+    out_dict = {'feature': ragged_fea}
+    keep_reserve = self._is_predictor and self._reserve_fields is not None
+    if keep_reserve:
+      out_dict['reserve'] = fea_dict['reserve']
+    return out_dict
+  def create_input(self, export_config=None):
+    """Create the input_fn that is handed to the estimator.
+    Args:
+      export_config: not used by this implementation.
+    Returns:
+      the `_input_fn` closure, with this object attached to it as
+      `_input_fn.input_creator`.
+    """
+    def _input_fn(mode=None, params=None, config=None):
+      """Build input_fn for estimator.
+      Args:
+        mode: tf.estimator.ModeKeys.(TRAIN, EVAL, PREDICT)
+        params: `dict` of hyper parameters, from Estimator
+        config: tf.estimator.RunConfig instance
+      Return:
+        if mode is not None, return:
+            the dataset built by `_build`
+        else, return:
+            tf.estimator.export.ServingInputReceiver instance
+      Raises:
+        ValueError: when exporting sparse features that must be taken modulo
+          num_buckets while no feature config declares a positive num_buckets.
+      """
+      if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL,
+                  tf.estimator.ModeKeys.PREDICT):
+        # build dataset from self._config.input_path
+        self._mode = mode
+        dataset = self._build(mode, params)
+        return dataset
+      elif mode is None:  # serving_input_receiver_fn for export SavedModel
+        inputs, features = {}, {}
+        if len(self._sparse_fea_names) > 0:
+          ragged_ids = array_ops.placeholder(
+              tf.int64, [None], name='ragged_ids')
+          ragged_lens = array_ops.placeholder(
+              tf.int32, [None], name='ragged_lens')
+          inputs = {'ragged_ids': ragged_ids, 'ragged_lens': ragged_lens}
+          if self._has_ev:
+            features = {'ragged_ids': ragged_ids, 'ragged_lens': ragged_lens}
+          else:
+            # Take the first positive num_buckets, as _to_fea_dict does; the
+            # first feature config may not declare one, which would make the
+            # modulo below invalid.
+            num_buckets = next((fc.num_buckets
+                                for fc in self._feature_configs
+                                if fc.num_buckets > 0), None)
+            if num_buckets is None:
+              raise ValueError(
+                  'no feature config declares a positive num_buckets, '
+                  'which is required to bucketize ragged_ids')
+            features = {
+                'ragged_ids': ragged_ids % num_buckets,
+                'ragged_lens': ragged_lens
+            }
+        if len(self._dense_fea_names) > 0:
+          inputs['dense_fea'] = array_ops.placeholder(
+              tf.float32, [None, self._total_dense_fea_dim], name='dense_fea')
+          features['dense_fea'] = inputs['dense_fea']
+        return tf.estimator.export.ServingInputReceiver(features, inputs)
+    _input_fn.input_creator = self
+    return _input_fn

easy_rec/python/input/parquet_input_v2.py ADDED Viewed

@@ -0,0 +1,180 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# import logging
+import os
+# import numpy as np
+# import pandas as pd
+import tensorflow as tf
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+# from tensorflow.python.ops import math_ops
+# from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import string_ops
+from easy_rec.python.input.parquet_input import ParquetInput
+from easy_rec.python.utils import conditional
+# from easy_rec.python.utils.tf_utils import get_tf_type
+class ParquetInputV2(ParquetInput):
+  def __init__(self,
+               data_config,
+               feature_config,
+               input_path,
+               task_index=0,
+               task_num=1,
+               check_mode=False,
+               pipeline_config=None,
+               **kwargs):
+    super(ParquetInputV2,
+          self).__init__(data_config, feature_config, input_path, task_index,
+                         task_num, check_mode, pipeline_config, **kwargs)
+    self._need_pack = False
+  def _predictor_preprocess(self, input_dict):
+    # when the ParquetInputV2 is build from ParquetPredictorV2
+    # the feature preprocess stage will be skipped.
+    fea_dict = {}
+    for k in input_dict:
+      vals = input_dict[k]
+      if isinstance(vals, tuple) and len(vals) == 2 and k != 'reserve':
+        fea_dict[k + '/lens'] = vals[0]
+        fea_dict[k + '/ids'] = vals[1]
+      else:
+        fea_dict[k] = vals
+    return fea_dict
+  def _to_fea_dict(self, input_dict):
+    if self._is_predictor:
+      fea_dict = self._predictor_preprocess(input_dict)
+    else:
+      fea_dict = self._preprocess(input_dict)
+    output_dict = {'feature': fea_dict}
+    lbl_dict = {}
+    for lbl_name in self._label_fields:
+      if lbl_name in input_dict:
+        lbl_dict[lbl_name] = input_dict[lbl_name]
+    if len(lbl_dict) > 0:
+      output_dict['label'] = lbl_dict
+    if self._reserve_fields is not None:
+      output_dict['reserve'] = input_dict['reserve']
+    return output_dict
+  def add_fea_type_and_shape(self, out_types, out_shapes):
+    # overload ParquetInput.build_type_and_shape
+    for k in self._sparse_fea_names:
+      out_types[k] = (tf.int32, tf.int64)
+      out_shapes[k] = (tf.TensorShape([None]), tf.TensorShape([None]))
+    for fc in self._dense_fea_cfgs:
+      k = fc.input_names[0]
+      out_types[k] = tf.float32
+      out_shapes[k] = tf.TensorShape([None, fc.raw_input_dim])
+  def _preprocess(self, inputs=None):
+    features = {}
+    placeholders = {}
+    for fc in self._feature_configs:
+      feature_name = fc.feature_name if fc.feature_name != '' else fc.input_names[
+          0]
+      feature_type = fc.feature_type
+      if feature_type in [fc.IdFeature, fc.TagFeature]:
+        input_name0 = fc.input_names[0]
+        if inputs is not None:
+          input_lens, input_vals = inputs[input_name0]
+        else:
+          if input_name0 in placeholders:
+            input_lens, input_vals = placeholders[input_name0]
+          else:
+            input_vals = array_ops.placeholder(
+                dtypes.int64, [None], name=input_name0 + '/ids')
+            input_lens = array_ops.placeholder(
+                dtypes.int64, [None], name=input_name0 + '/lens')
+            placeholders[input_name0] = (input_lens, input_vals)
+        if not self._has_ev:
+          if fc.num_buckets > 0:
+            input_vals = input_vals % fc.num_buckets
+          else:
+            input_vals = string_ops.as_string(input_vals)
+        features[feature_name] = tf.RaggedTensor.from_row_lengths(
+            values=input_vals, row_lengths=input_lens)
+      elif feature_type in [fc.RawFeature]:
+        input_name0 = fc.input_names[0]
+        if inputs is not None:
+          input_vals = inputs[input_name0]
+        else:
+          if input_name0 in placeholders:
+            input_vals = placeholders[input_name0]
+          else:
+            if fc.raw_input_dim > 1:
+              input_vals = array_ops.placeholder(
+                  dtypes.float32, [None, fc.raw_input_dim], name=input_name0)
+            else:
+              input_vals = array_ops.placeholder(
+                  dtypes.float32, [None], name=input_name0)
+            placeholders[input_name0] = input_vals
+        features[feature_name] = input_vals
+      else:
+        assert False, 'feature_type[%s] not supported' % str(feature_type)
+    if inputs is not None:
+      return features
+    else:
+      inputs = {}
+      for key in placeholders:
+        vals = placeholders[key]
+        if isinstance(vals, tuple):
+          inputs[key + '/lens'] = vals[0]
+          inputs[key + '/ids'] = vals[1]
+        else:
+          inputs[key] = vals
+      return features, inputs
+  def _get_for_predictor(self, fea_dict):
+    """Return fea_dict unchanged.
+
+    Args:
+      fea_dict: dict in the format sketched in the comment below.
+    Return:
+      the same fea_dict object, without modification.
+    """
+    # called by ParquetInputV2._build, format:
+    # {
+    #   "feature": {"user_id/ids":..., "user_id/lens":..., ... },
+    #   "reserve": {"sample_id":..., ...}
+    # }
+    return fea_dict
+  def create_input(self, export_config=None):
+    def _input_fn(mode=None, params=None, config=None):
+      """Build input_fn for estimator.
+      Args:
+        mode: tf.estimator.ModeKeys.(TRAIN, EVAL, PREDICT)
+        params: `dict` of hyper parameters, from Estimator
+        config: tf.estimator.RunConfig instance
+      Return:
+        if mode is not None, return:
+            features: inputs to the model.
+            labels: groundtruth
+        else, return:
+            tf.estimator.export.ServingInputReceiver instance
+      """
+      if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL,
+                  tf.estimator.ModeKeys.PREDICT):
+        # build dataset from self._config.input_path
+        self._mode = mode
+        dataset = self._build(mode, params)
+        return dataset
+      elif mode is None:  # serving_input_receiver_fn for export SavedModel
+        place_on_cpu = os.getenv('place_embedding_on_cpu')
+        place_on_cpu = bool(place_on_cpu) if place_on_cpu else False
+        with conditional(place_on_cpu, ops.device('/CPU:0')):
+          features, inputs = self._preprocess()
+        return tf.estimator.export.ServingInputReceiver(features, inputs)
+    _input_fn.input_creator = self
+    return _input_fn