PyPI - easy-cs-rec-custommodel - Versions diffs - 0.8.6__py2.py3-none-any.whl - Mend

easy-cs-rec-custommodel 0.8.6__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of easy-cs-rec-custommodel might be problematic. Click here for more details.

Files changed (336) hide show

easy_cs_rec_custommodel-0.8.6.dist-info/LICENSE +203 -0
easy_cs_rec_custommodel-0.8.6.dist-info/METADATA +48 -0
easy_cs_rec_custommodel-0.8.6.dist-info/RECORD +336 -0
easy_cs_rec_custommodel-0.8.6.dist-info/WHEEL +6 -0
easy_cs_rec_custommodel-0.8.6.dist-info/top_level.txt +2 -0
easy_rec/__init__.py +114 -0
easy_rec/python/__init__.py +0 -0
easy_rec/python/builders/__init__.py +0 -0
easy_rec/python/builders/hyperparams_builder.py +78 -0
easy_rec/python/builders/loss_builder.py +333 -0
easy_rec/python/builders/optimizer_builder.py +211 -0
easy_rec/python/builders/strategy_builder.py +44 -0
easy_rec/python/compat/__init__.py +0 -0
easy_rec/python/compat/adam_s.py +245 -0
easy_rec/python/compat/array_ops.py +229 -0
easy_rec/python/compat/dynamic_variable.py +542 -0
easy_rec/python/compat/early_stopping.py +653 -0
easy_rec/python/compat/embedding_ops.py +162 -0
easy_rec/python/compat/embedding_parallel_saver.py +316 -0
easy_rec/python/compat/estimator_train.py +116 -0
easy_rec/python/compat/exporter.py +473 -0
easy_rec/python/compat/feature_column/__init__.py +0 -0
easy_rec/python/compat/feature_column/feature_column.py +3675 -0
easy_rec/python/compat/feature_column/feature_column_v2.py +5233 -0
easy_rec/python/compat/feature_column/sequence_feature_column.py +648 -0
easy_rec/python/compat/feature_column/utils.py +154 -0
easy_rec/python/compat/layers.py +329 -0
easy_rec/python/compat/ops.py +14 -0
easy_rec/python/compat/optimizers.py +619 -0
easy_rec/python/compat/queues.py +311 -0
easy_rec/python/compat/regularizers.py +208 -0
easy_rec/python/compat/sok_optimizer.py +440 -0
easy_rec/python/compat/sync_replicas_optimizer.py +528 -0
easy_rec/python/compat/weight_decay_optimizers.py +475 -0
easy_rec/python/core/__init__.py +0 -0
easy_rec/python/core/easyrec_metrics/__init__.py +24 -0
easy_rec/python/core/easyrec_metrics/distribute_metrics_impl_pai.py +3702 -0
easy_rec/python/core/easyrec_metrics/distribute_metrics_impl_tf.py +3768 -0
easy_rec/python/core/learning_schedules.py +228 -0
easy_rec/python/core/metrics.py +402 -0
easy_rec/python/core/sampler.py +844 -0
easy_rec/python/eval.py +102 -0
easy_rec/python/export.py +150 -0
easy_rec/python/feature_column/__init__.py +0 -0
easy_rec/python/feature_column/feature_column.py +664 -0
easy_rec/python/feature_column/feature_group.py +89 -0
easy_rec/python/hpo/__init__.py +0 -0
easy_rec/python/hpo/emr_hpo.py +140 -0
easy_rec/python/hpo/generate_hpo_sql.py +71 -0
easy_rec/python/hpo/pai_hpo.py +297 -0
easy_rec/python/inference/__init__.py +0 -0
easy_rec/python/inference/csv_predictor.py +189 -0
easy_rec/python/inference/hive_parquet_predictor.py +200 -0
easy_rec/python/inference/hive_predictor.py +166 -0
easy_rec/python/inference/odps_predictor.py +70 -0
easy_rec/python/inference/parquet_predictor.py +147 -0
easy_rec/python/inference/parquet_predictor_v2.py +147 -0
easy_rec/python/inference/predictor.py +621 -0
easy_rec/python/inference/processor/__init__.py +0 -0
easy_rec/python/inference/processor/test.py +170 -0
easy_rec/python/inference/vector_retrieve.py +124 -0
easy_rec/python/input/__init__.py +0 -0
easy_rec/python/input/batch_tfrecord_input.py +117 -0
easy_rec/python/input/criteo_binary_reader.py +259 -0
easy_rec/python/input/criteo_input.py +107 -0
easy_rec/python/input/csv_input.py +175 -0
easy_rec/python/input/csv_input_ex.py +72 -0
easy_rec/python/input/csv_input_v2.py +68 -0
easy_rec/python/input/datahub_input.py +320 -0
easy_rec/python/input/dummy_input.py +58 -0
easy_rec/python/input/hive_input.py +123 -0
easy_rec/python/input/hive_parquet_input.py +140 -0
easy_rec/python/input/hive_rtp_input.py +174 -0
easy_rec/python/input/input.py +1064 -0
easy_rec/python/input/kafka_dataset.py +144 -0
easy_rec/python/input/kafka_input.py +235 -0
easy_rec/python/input/load_parquet.py +317 -0
easy_rec/python/input/odps_input.py +101 -0
easy_rec/python/input/odps_input_v2.py +110 -0
easy_rec/python/input/odps_input_v3.py +132 -0
easy_rec/python/input/odps_rtp_input.py +187 -0
easy_rec/python/input/odps_rtp_input_v2.py +104 -0
easy_rec/python/input/parquet_input.py +397 -0
easy_rec/python/input/parquet_input_v2.py +180 -0
easy_rec/python/input/parquet_input_v3.py +203 -0
easy_rec/python/input/rtp_input.py +225 -0
easy_rec/python/input/rtp_input_v2.py +145 -0
easy_rec/python/input/tfrecord_input.py +100 -0
easy_rec/python/layers/__init__.py +0 -0
easy_rec/python/layers/backbone.py +571 -0
easy_rec/python/layers/capsule_layer.py +176 -0
easy_rec/python/layers/cmbf.py +390 -0
easy_rec/python/layers/common_layers.py +192 -0
easy_rec/python/layers/dnn.py +87 -0
easy_rec/python/layers/embed_input_layer.py +25 -0
easy_rec/python/layers/fm.py +26 -0
easy_rec/python/layers/input_layer.py +396 -0
easy_rec/python/layers/keras/__init__.py +34 -0
easy_rec/python/layers/keras/activation.py +114 -0
easy_rec/python/layers/keras/attention.py +267 -0
easy_rec/python/layers/keras/auxiliary_loss.py +47 -0
easy_rec/python/layers/keras/blocks.py +262 -0
easy_rec/python/layers/keras/bst.py +119 -0
easy_rec/python/layers/keras/custom_ops.py +250 -0
easy_rec/python/layers/keras/data_augment.py +133 -0
easy_rec/python/layers/keras/din.py +67 -0
easy_rec/python/layers/keras/einsum_dense.py +598 -0
easy_rec/python/layers/keras/embedding.py +81 -0
easy_rec/python/layers/keras/fibinet.py +251 -0
easy_rec/python/layers/keras/interaction.py +416 -0
easy_rec/python/layers/keras/layer_norm.py +364 -0
easy_rec/python/layers/keras/mask_net.py +166 -0
easy_rec/python/layers/keras/multi_head_attention.py +717 -0
easy_rec/python/layers/keras/multi_task.py +125 -0
easy_rec/python/layers/keras/numerical_embedding.py +376 -0
easy_rec/python/layers/keras/ppnet.py +194 -0
easy_rec/python/layers/keras/transformer.py +192 -0
easy_rec/python/layers/layer_norm.py +51 -0
easy_rec/python/layers/mmoe.py +83 -0
easy_rec/python/layers/multihead_attention.py +162 -0
easy_rec/python/layers/multihead_cross_attention.py +749 -0
easy_rec/python/layers/senet.py +73 -0
easy_rec/python/layers/seq_input_layer.py +134 -0
easy_rec/python/layers/sequence_feature_layer.py +249 -0
easy_rec/python/layers/uniter.py +301 -0
easy_rec/python/layers/utils.py +248 -0
easy_rec/python/layers/variational_dropout_layer.py +130 -0
easy_rec/python/loss/__init__.py +0 -0
easy_rec/python/loss/circle_loss.py +82 -0
easy_rec/python/loss/contrastive_loss.py +79 -0
easy_rec/python/loss/f1_reweight_loss.py +38 -0
easy_rec/python/loss/focal_loss.py +93 -0
easy_rec/python/loss/jrc_loss.py +128 -0
easy_rec/python/loss/listwise_loss.py +161 -0
easy_rec/python/loss/multi_similarity.py +68 -0
easy_rec/python/loss/pairwise_loss.py +307 -0
easy_rec/python/loss/softmax_loss_with_negative_mining.py +110 -0
easy_rec/python/loss/zero_inflated_lognormal.py +76 -0
easy_rec/python/main.py +878 -0
easy_rec/python/model/__init__.py +0 -0
easy_rec/python/model/autoint.py +73 -0
easy_rec/python/model/cmbf.py +47 -0
easy_rec/python/model/collaborative_metric_learning.py +182 -0
easy_rec/python/model/custom_model.py +323 -0
easy_rec/python/model/dat.py +138 -0
easy_rec/python/model/dbmtl.py +116 -0
easy_rec/python/model/dcn.py +70 -0
easy_rec/python/model/deepfm.py +106 -0
easy_rec/python/model/dlrm.py +73 -0
easy_rec/python/model/dropoutnet.py +207 -0
easy_rec/python/model/dssm.py +154 -0
easy_rec/python/model/dssm_senet.py +143 -0
easy_rec/python/model/dummy_model.py +48 -0
easy_rec/python/model/easy_rec_estimator.py +739 -0
easy_rec/python/model/easy_rec_model.py +467 -0
easy_rec/python/model/esmm.py +242 -0
easy_rec/python/model/fm.py +63 -0
easy_rec/python/model/match_model.py +357 -0
easy_rec/python/model/mind.py +445 -0
easy_rec/python/model/mmoe.py +70 -0
easy_rec/python/model/multi_task_model.py +303 -0
easy_rec/python/model/multi_tower.py +62 -0
easy_rec/python/model/multi_tower_bst.py +190 -0
easy_rec/python/model/multi_tower_din.py +130 -0
easy_rec/python/model/multi_tower_recall.py +68 -0
easy_rec/python/model/pdn.py +203 -0
easy_rec/python/model/ple.py +120 -0
easy_rec/python/model/rank_model.py +485 -0
easy_rec/python/model/rocket_launching.py +203 -0
easy_rec/python/model/simple_multi_task.py +54 -0
easy_rec/python/model/uniter.py +46 -0
easy_rec/python/model/wide_and_deep.py +121 -0
easy_rec/python/ops/1.12/incr_record.so +0 -0
easy_rec/python/ops/1.12/kafka.so +0 -0
easy_rec/python/ops/1.12/libcustom_ops.so +0 -0
easy_rec/python/ops/1.12/libembed_op.so +0 -0
easy_rec/python/ops/1.12/libhiredis.so.1.0.0 +0 -0
easy_rec/python/ops/1.12/librdkafka++.so.1 +0 -0
easy_rec/python/ops/1.12/librdkafka.so.1 +0 -0
easy_rec/python/ops/1.12/libredis++.so +0 -0
easy_rec/python/ops/1.12/libredis++.so.1 +0 -0
easy_rec/python/ops/1.12/libredis++.so.1.2.3 +0 -0
easy_rec/python/ops/1.12/libstr_avx_op.so +0 -0
easy_rec/python/ops/1.12/libwrite_sparse_kv.so +0 -0
easy_rec/python/ops/1.15/incr_record.so +0 -0
easy_rec/python/ops/1.15/kafka.so +0 -0
easy_rec/python/ops/1.15/libcustom_ops.so +0 -0
easy_rec/python/ops/1.15/libembed_op.so +0 -0
easy_rec/python/ops/1.15/libhiredis.so.1.0.0 +0 -0
easy_rec/python/ops/1.15/librdkafka++.so +0 -0
easy_rec/python/ops/1.15/librdkafka++.so.1 +0 -0
easy_rec/python/ops/1.15/librdkafka.so +0 -0
easy_rec/python/ops/1.15/librdkafka.so.1 +0 -0
easy_rec/python/ops/1.15/libredis++.so.1 +0 -0
easy_rec/python/ops/1.15/libstr_avx_op.so +0 -0
easy_rec/python/ops/2.12/libcustom_ops.so +0 -0
easy_rec/python/ops/2.12/libload_embed.so +0 -0
easy_rec/python/ops/2.12/libstr_avx_op.so +0 -0
easy_rec/python/ops/__init__.py +0 -0
easy_rec/python/ops/gen_kafka_ops.py +193 -0
easy_rec/python/ops/gen_str_avx_op.py +28 -0
easy_rec/python/ops/incr_record.py +30 -0
easy_rec/python/predict.py +170 -0
easy_rec/python/protos/__init__.py +0 -0
easy_rec/python/protos/autoint_pb2.py +122 -0
easy_rec/python/protos/backbone_pb2.py +1416 -0
easy_rec/python/protos/cmbf_pb2.py +435 -0
easy_rec/python/protos/collaborative_metric_learning_pb2.py +252 -0
easy_rec/python/protos/custom_model_pb2.py +57 -0
easy_rec/python/protos/dat_pb2.py +262 -0
easy_rec/python/protos/data_source_pb2.py +422 -0
easy_rec/python/protos/dataset_pb2.py +1920 -0
easy_rec/python/protos/dbmtl_pb2.py +191 -0
easy_rec/python/protos/dcn_pb2.py +197 -0
easy_rec/python/protos/deepfm_pb2.py +163 -0
easy_rec/python/protos/dlrm_pb2.py +163 -0
easy_rec/python/protos/dnn_pb2.py +329 -0
easy_rec/python/protos/dropoutnet_pb2.py +239 -0
easy_rec/python/protos/dssm_pb2.py +262 -0
easy_rec/python/protos/dssm_senet_pb2.py +282 -0
easy_rec/python/protos/easy_rec_model_pb2.py +1672 -0
easy_rec/python/protos/esmm_pb2.py +133 -0
easy_rec/python/protos/eval_pb2.py +930 -0
easy_rec/python/protos/export_pb2.py +379 -0
easy_rec/python/protos/feature_config_pb2.py +1359 -0
easy_rec/python/protos/fm_pb2.py +90 -0
easy_rec/python/protos/hive_config_pb2.py +138 -0
easy_rec/python/protos/hyperparams_pb2.py +624 -0
easy_rec/python/protos/keras_layer_pb2.py +692 -0
easy_rec/python/protos/layer_pb2.py +1936 -0
easy_rec/python/protos/loss_pb2.py +1713 -0
easy_rec/python/protos/mind_pb2.py +497 -0
easy_rec/python/protos/mmoe_pb2.py +215 -0
easy_rec/python/protos/multi_tower_pb2.py +295 -0
easy_rec/python/protos/multi_tower_recall_pb2.py +198 -0
easy_rec/python/protos/optimizer_pb2.py +2017 -0
easy_rec/python/protos/pdn_pb2.py +293 -0
easy_rec/python/protos/pipeline_pb2.py +516 -0
easy_rec/python/protos/ple_pb2.py +231 -0
easy_rec/python/protos/predict_pb2.py +1140 -0
easy_rec/python/protos/rocket_launching_pb2.py +169 -0
easy_rec/python/protos/seq_encoder_pb2.py +1084 -0
easy_rec/python/protos/simi_pb2.py +54 -0
easy_rec/python/protos/simple_multi_task_pb2.py +97 -0
easy_rec/python/protos/tf_predict_pb2.py +630 -0
easy_rec/python/protos/tower_pb2.py +661 -0
easy_rec/python/protos/train_pb2.py +1197 -0
easy_rec/python/protos/uniter_pb2.py +307 -0
easy_rec/python/protos/variational_dropout_pb2.py +91 -0
easy_rec/python/protos/wide_and_deep_pb2.py +131 -0
easy_rec/python/test/__init__.py +0 -0
easy_rec/python/test/csv_input_test.py +340 -0
easy_rec/python/test/custom_early_stop_func.py +19 -0
easy_rec/python/test/dh_local_run.py +104 -0
easy_rec/python/test/embed_test.py +155 -0
easy_rec/python/test/emr_run.py +119 -0
easy_rec/python/test/eval_metric_test.py +107 -0
easy_rec/python/test/excel_convert_test.py +64 -0
easy_rec/python/test/export_test.py +513 -0
easy_rec/python/test/fg_test.py +70 -0
easy_rec/python/test/hive_input_test.py +311 -0
easy_rec/python/test/hpo_test.py +235 -0
easy_rec/python/test/kafka_test.py +373 -0
easy_rec/python/test/local_incr_test.py +122 -0
easy_rec/python/test/loss_test.py +110 -0
easy_rec/python/test/odps_command.py +61 -0
easy_rec/python/test/odps_local_run.py +86 -0
easy_rec/python/test/odps_run.py +254 -0
easy_rec/python/test/odps_test_cls.py +39 -0
easy_rec/python/test/odps_test_prepare.py +198 -0
easy_rec/python/test/odps_test_util.py +237 -0
easy_rec/python/test/pre_check_test.py +54 -0
easy_rec/python/test/predictor_test.py +394 -0
easy_rec/python/test/rtp_convert_test.py +133 -0
easy_rec/python/test/run.py +138 -0
easy_rec/python/test/train_eval_test.py +1299 -0
easy_rec/python/test/util_test.py +85 -0
easy_rec/python/test/zero_inflated_lognormal_test.py +53 -0
easy_rec/python/tools/__init__.py +0 -0
easy_rec/python/tools/add_boundaries_to_config.py +67 -0
easy_rec/python/tools/add_feature_info_to_config.py +145 -0
easy_rec/python/tools/convert_config_format.py +48 -0
easy_rec/python/tools/convert_rtp_data.py +79 -0
easy_rec/python/tools/convert_rtp_fg.py +106 -0
easy_rec/python/tools/create_config_from_excel.py +427 -0
easy_rec/python/tools/criteo/__init__.py +0 -0
easy_rec/python/tools/criteo/convert_data.py +157 -0
easy_rec/python/tools/edit_lookup_graph.py +134 -0
easy_rec/python/tools/faiss_index_pai.py +116 -0
easy_rec/python/tools/feature_selection.py +316 -0
easy_rec/python/tools/hit_rate_ds.py +223 -0
easy_rec/python/tools/hit_rate_pai.py +138 -0
easy_rec/python/tools/pre_check.py +120 -0
easy_rec/python/tools/predict_and_chk.py +111 -0
easy_rec/python/tools/read_kafka.py +55 -0
easy_rec/python/tools/split_model_pai.py +286 -0
easy_rec/python/tools/split_pdn_model_pai.py +272 -0
easy_rec/python/tools/test_saved_model.py +80 -0
easy_rec/python/tools/view_saved_model.py +39 -0
easy_rec/python/tools/write_kafka.py +65 -0
easy_rec/python/train_eval.py +325 -0
easy_rec/python/utils/__init__.py +15 -0
easy_rec/python/utils/activation.py +120 -0
easy_rec/python/utils/check_utils.py +87 -0
easy_rec/python/utils/compat.py +14 -0
easy_rec/python/utils/config_util.py +652 -0
easy_rec/python/utils/constant.py +43 -0
easy_rec/python/utils/convert_rtp_fg.py +616 -0
easy_rec/python/utils/dag.py +192 -0
easy_rec/python/utils/distribution_utils.py +268 -0
easy_rec/python/utils/ds_util.py +65 -0
easy_rec/python/utils/embedding_utils.py +73 -0
easy_rec/python/utils/estimator_utils.py +1036 -0
easy_rec/python/utils/export_big_model.py +630 -0
easy_rec/python/utils/expr_util.py +118 -0
easy_rec/python/utils/fg_util.py +53 -0
easy_rec/python/utils/hit_rate_utils.py +220 -0
easy_rec/python/utils/hive_utils.py +183 -0
easy_rec/python/utils/hpo_util.py +137 -0
easy_rec/python/utils/hvd_utils.py +56 -0
easy_rec/python/utils/input_utils.py +108 -0
easy_rec/python/utils/io_util.py +282 -0
easy_rec/python/utils/load_class.py +249 -0
easy_rec/python/utils/meta_graph_editor.py +941 -0
easy_rec/python/utils/multi_optimizer.py +62 -0
easy_rec/python/utils/numpy_utils.py +18 -0
easy_rec/python/utils/odps_util.py +79 -0
easy_rec/python/utils/pai_util.py +86 -0
easy_rec/python/utils/proto_util.py +90 -0
easy_rec/python/utils/restore_filter.py +89 -0
easy_rec/python/utils/shape_utils.py +432 -0
easy_rec/python/utils/static_shape.py +71 -0
easy_rec/python/utils/test_utils.py +866 -0
easy_rec/python/utils/tf_utils.py +56 -0
easy_rec/version.py +4 -0
test/__init__.py +0 -0

easy_rec/python/input/parquet_input_v3.py ADDED Viewed

@@ -0,0 +1,203 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
+import tensorflow as tf
+from easy_rec.python.input.input import Input
+from easy_rec.python.utils.input_utils import get_type_defaults
+try:
+  from tensorflow.python.data.experimental.ops import parquet_dataset_ops
+  from tensorflow.python.data.experimental.ops import parquet_pybind
+  from tensorflow.python.data.experimental.ops import dataframe
+  from tensorflow.python.ops import gen_ragged_conversion_ops
+  from tensorflow.python.ops.work_queue import WorkQueue
+  _has_deep_rec = True
+except Exception:
+  _has_deep_rec = False
+  pass
+if tf.__version__ >= '2.0':  # NOTE(review): lexicographic string comparison of the version
+  tf = tf.compat.v1  # from here on the name tf refers to the tf.compat.v1 namespace
+class ParquetInputV3(Input):
+  def __init__(self,
+               data_config,
+               feature_config,
+               input_path,
+               task_index=0,
+               task_num=1,
+               check_mode=False,
+               pipeline_config=None,
+               **kwargs):
+    if not _has_deep_rec:
+      raise RuntimeError('You should install DeepRec first.')
+    super(ParquetInputV3,
+          self).__init__(data_config, feature_config, input_path, task_index,
+                         task_num, check_mode, pipeline_config)
+    self._ignore_val_dict = {}
+    for f in data_config.input_fields:
+      if f.HasField('ignore_val'):
+        self._ignore_val_dict[f.input_name] = get_type_defaults(
+            f.input_type, f.ignore_val)
+    self._true_type_dict = {}
+    for fc in self._feature_configs:
+      if fc.feature_type in [fc.IdFeature, fc.TagFeature, fc.SequenceFeature]:
+        if fc.hash_bucket_size > 0 or len(
+            fc.vocab_list) > 0 or fc.HasField('vocab_file'):
+          self._true_type_dict[fc.input_names[0]] = tf.string
+        else:
+          self._true_type_dict[fc.input_names[0]] = tf.int64
+        if len(fc.input_names) > 1:
+          self._true_type_dict[fc.input_names[1]] = tf.float32
+      if fc.feature_type == fc.RawFeature:
+        self._true_type_dict[fc.input_names[0]] = tf.float32
+    self._reserve_fields = None
+    self._reserve_types = None
+    if 'reserve_fields' in kwargs and 'reserve_types' in kwargs:
+      self._reserve_fields = kwargs['reserve_fields']
+      self._reserve_types = kwargs['reserve_types']
+    # In ParquetDataset multi_value use input type
+    self._multi_value_types = {}
+  def _ignore_and_cast(self, name, value):
+    ignore_value = self._ignore_val_dict.get(name, None)
+    if ignore_value:
+      if isinstance(value, tf.SparseTensor):
+        indices = tf.where(tf.equal(value.values, ignore_value))
+        value = tf.SparseTensor(
+            tf.gather_nd(value.indices, indices),
+            tf.gather_nd(value.values, indices), value.dense_shape)
+      elif isinstance(value, tf.Tensor):
+        indices = tf.where(tf.not_equal(value, ignore_value), name='indices')
+        value = tf.SparseTensor(
+            indices=indices,
+            values=tf.gather_nd(value, indices),
+            dense_shape=tf.shape(value, out_type=tf.int64))
+    dtype = self._true_type_dict.get(name, None)
+    if dtype:
+      value = tf.cast(value, dtype)
+    return value
+  def _parse_dataframe_value(self, value):
+    if len(value.nested_row_splits) == 0:
+      return value.values
+    value.values.set_shape([None])
+    sparse_value = gen_ragged_conversion_ops.ragged_tensor_to_sparse(
+        value.nested_row_splits, value.values)
+    return tf.SparseTensor(sparse_value.sparse_indices,
+                           sparse_value.sparse_values,
+                           sparse_value.sparse_dense_shape)
+  def _parse_dataframe(self, df):
+    inputs = {}
+    for k, v in df.items():
+      if k in self._effective_fields:
+        if isinstance(v, dataframe.DataFrame.Value):
+          v = self._parse_dataframe_value(v)
+      elif k in self._label_fields:
+        if isinstance(v, dataframe.DataFrame.Value):
+          v = v.values
+      elif k in self._reserve_fields:
+        if isinstance(v, dataframe.DataFrame.Value):
+          v = v.values
+      else:
+        continue
+      inputs[k] = v
+    return inputs
+  def _build(self, mode, params):
+    input_files = []
+    for sub_path in self._input_path.strip().split(','):
+      input_files.extend(tf.gfile.Glob(sub_path))
+    file_num = len(input_files)
+    logging.info('[task_index=%d] total_file_num=%d task_num=%d' %
+                 (self._task_index, file_num, self._task_num))
+    task_index = self._task_index
+    task_num = self._task_num
+    if self._data_config.chief_redundant:
+      task_index = max(self._task_index - 1, 0)
+      task_num = max(self._task_num - 1, 1)
+    if self._data_config.pai_worker_queue and \
+        mode == tf.estimator.ModeKeys.TRAIN:
+      work_queue = WorkQueue(
+          input_files,
+          num_epochs=self.num_epochs,
+          shuffle=self._data_config.shuffle)
+      my_files = work_queue.input_dataset()
+    else:
+      my_files = []
+      for file_id in range(file_num):
+        if (file_id % task_num) == task_index:
+          my_files.append(input_files[file_id])
+    parquet_fields = parquet_pybind.parquet_fields(input_files[0])
+    parquet_input_fields = []
+    for f in parquet_fields:
+      if f.name in self._input_fields:
+        parquet_input_fields.append(f)
+    all_fields = set(self._effective_fields)
+    if mode != tf.estimator.ModeKeys.PREDICT:
+      all_fields |= set(self._label_fields)
+    if self._reserve_fields:
+      all_fields |= set(self._reserve_fields)
+    selected_fields = []
+    for f in parquet_input_fields:
+      if f.name in all_fields:
+        selected_fields.append(f)
+    num_parallel_reads = min(self._data_config.num_parallel_calls,
+                             len(input_files) // task_num)
+    dataset = parquet_dataset_ops.ParquetDataset(
+        my_files,
+        batch_size=self._batch_size,
+        fields=selected_fields,
+        drop_remainder=self._data_config.drop_remainder,
+        num_parallel_reads=num_parallel_reads)
+    # partition_count=task_num,
+    # partition_index=task_index)
+    if mode == tf.estimator.ModeKeys.TRAIN:
+      if self._data_config.shuffle:
+        dataset = dataset.shuffle(
+            self._data_config.shuffle_buffer_size,
+            seed=2020,
+            reshuffle_each_iteration=True)
+      dataset = dataset.repeat(self.num_epochs)
+    else:
+      dataset = dataset.repeat(1)
+    dataset = dataset.map(
+        self._parse_dataframe,
+        num_parallel_calls=self._data_config.num_parallel_calls)
+    # preprocess is necessary to transform data
+    # so that they could be feed into FeatureColumns
+    dataset = dataset.map(
+        map_func=self._preprocess,
+        num_parallel_calls=self._data_config.num_parallel_calls)
+    dataset = dataset.prefetch(buffer_size=self._prefetch_size)
+    if mode != tf.estimator.ModeKeys.PREDICT:
+      dataset = dataset.map(lambda x:
+                            (self._get_features(x), self._get_labels(x)))
+    else:
+      dataset = dataset.map(lambda x: (self._get_features(x)))
+    return dataset
+  def _preprocess(self, field_dict):
+    for k, v in field_dict.items():
+      field_dict[k] = self._ignore_and_cast(k, v)
+    return super(ParquetInputV3, self)._preprocess(field_dict)

easy_rec/python/input/rtp_input.py ADDED Viewed

@@ -0,0 +1,225 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
+import tensorflow as tf
+from easy_rec.python.input.input import Input
+from easy_rec.python.ops.gen_str_avx_op import str_split_by_chr
+from easy_rec.python.utils.check_utils import check_split
+from easy_rec.python.utils.check_utils import check_string_to_number
+from easy_rec.python.utils.input_utils import string_to_number
+from easy_rec.python.utils.tf_utils import get_tf_type
+if tf.__version__ >= '2.0':  # NOTE(review): lexicographic string comparison of the version
+  tf = tf.compat.v1  # from here on the name tf refers to the tf.compat.v1 namespace
+class RTPInput(Input):
+  """RTPInput for parsing rtp fg new input format.
+  Our new format(csv in csv) of rtp output:
+     label0, item_id, ..., user_id, features
+  here the separator(,) could be specified by data_config.rtp_separator
+  For the feature column, features are separated by ,
+     multiple values of one feature are separated by , such as:
+     ...20beautysmartParis...
+  The features column and labels are specified by data_config.selected_cols,
+     columns are selected by indices as our csv file has no header,
+     such as: 0,1,4, means the 4th column is features, the 1st and 2nd
+     columns are labels
+  """
+  def __init__(self,
+               data_config,
+               feature_config,
+               input_path,
+               task_index=0,
+               task_num=1,
+               check_mode=False,
+               pipeline_config=None):
+    super(RTPInput,
+          self).__init__(data_config, feature_config, input_path, task_index,
+                         task_num, check_mode, pipeline_config)
+    logging.info('input_fields: %s label_fields: %s' %
+                 (','.join(self._input_fields), ','.join(self._label_fields)))
+    self._rtp_separator = self._data_config.rtp_separator
+    if not isinstance(self._rtp_separator, str):
+      self._rtp_separator = self._rtp_separator.encode('utf-8')
+    self._selected_cols = [
+        int(x) for x in self._data_config.selected_cols.split(',')
+    ]
+    self._num_cols = -1
+    self._feature_col_id = self._selected_cols[-1]
+    logging.info('rtp separator = %s' % self._rtp_separator)
+  def _parse_csv(self, line):
+    record_defaults = ['' for i in range(self._num_cols)]
+    # the actual features are in one single column
+    record_defaults[self._feature_col_id] = self._data_config.separator.join([
+        str(self.get_type_defaults(t, v))
+        for x, t, v in zip(self._input_fields, self._input_field_types,
+                           self._input_field_defaults)
+        if x not in self._label_fields
+    ])
+    check_list = [
+        tf.py_func(
+            check_split, [line, self._rtp_separator,
+                          len(record_defaults)],
+            Tout=tf.bool)
+    ] if self._check_mode else []
+    with tf.control_dependencies(check_list):
+      fields = tf.string_split(line, self._rtp_separator, skip_empty=False)
+    fields = tf.reshape(fields.values, [-1, len(record_defaults)])
+    labels = []
+    for idx, x in enumerate(self._selected_cols[:-1]):
+      field = fields[:, x]
+      fname = self._input_fields[idx]
+      ftype = self._input_field_types[idx]
+      tf_type = get_tf_type(ftype)
+      if field.dtype in [tf.string]:
+        check_list = [
+            tf.py_func(check_string_to_number, [field, fname], Tout=tf.bool)
+        ] if self._check_mode else []
+        with tf.control_dependencies(check_list):
+          field = tf.string_to_number(field, tf_type)
+      labels.append(field)
+    # only for features, labels excluded
+    record_types = [
+        t for x, t in zip(self._input_fields, self._input_field_types)
+        if x not in self._label_fields
+    ]
+    # assume that the last field is the generated feature column
+    print('field_delim = %s' % self._data_config.separator)
+    feature_str = fields[:, self._feature_col_id]
+    check_list = [
+        tf.py_func(
+            check_split,
+            [feature_str, self._data_config.separator,
+             len(record_types)],
+            Tout=tf.bool)
+    ] if self._check_mode else []
+    with tf.control_dependencies(check_list):
+      fields = str_split_by_chr(
+          feature_str, self._data_config.separator, skip_empty=False)
+    tmp_fields = tf.reshape(fields.values, [-1, len(record_types)])
+    rtp_record_defaults = [
+        str(self.get_type_defaults(t, v))
+        for x, t, v in zip(self._input_fields, self._input_field_types,
+                           self._input_field_defaults)
+        if x not in self._label_fields
+    ]
+    fields = []
+    for i in range(len(record_types)):
+      field = string_to_number(tmp_fields[:, i], record_types[i],
+                               rtp_record_defaults[i], i)
+      fields.append(field)
+    field_keys = [x for x in self._input_fields if x not in self._label_fields]
+    effective_fids = [field_keys.index(x) for x in self._effective_fields]
+    inputs = {field_keys[x]: fields[x] for x in effective_fids}
+    for x in range(len(self._label_fields)):
+      inputs[self._label_fields[x]] = labels[x]
+    return inputs
+  def _build(self, mode, params):
+    if type(self._input_path) != list:
+      self._input_path = self._input_path.split(',')
+    file_paths = []
+    for x in self._input_path:
+      file_paths.extend(tf.gfile.Glob(x))
+    assert len(file_paths) > 0, 'match no files with %s' % self._input_path
+    # try to figure out number of fields from one file
+    with tf.gfile.GFile(file_paths[0], 'r') as fin:
+      num_lines = 0
+      for line_str in fin:
+        line_tok = line_str.strip().split(self._rtp_separator)
+        if self._num_cols != -1:
+          assert self._num_cols == len(line_tok), \
+              'num selected cols is %d, not equal to %d, current line is: %s, please check rtp_separator and data.' % \
+              (self._num_cols, len(line_tok), line_str)
+        self._num_cols = len(line_tok)
+        num_lines += 1
+        if num_lines > 10:
+          break
+    logging.info('num selected cols = %d' % self._num_cols)
+    record_defaults = [
+        self.get_type_defaults(t, v)
+        for x, t, v in zip(self._input_fields, self._input_field_types,
+                           self._input_field_defaults)
+        if x in self._label_fields
+    ]
+    # the features are in one single column
+    record_defaults.append(
+        self._data_config.separator.join([
+            str(self.get_type_defaults(t, v))
+            for x, t, v in zip(self._input_fields, self._input_field_types,
+                               self._input_field_defaults)
+            if x not in self._label_fields
+        ]))
+    num_parallel_calls = self._data_config.num_parallel_calls
+    if mode == tf.estimator.ModeKeys.TRAIN:
+      logging.info('train files[%d]: %s' %
+                   (len(file_paths), ','.join(file_paths)))
+      dataset = tf.data.Dataset.from_tensor_slices(file_paths)
+      if self._data_config.file_shard:
+        dataset = self._safe_shard(dataset)
+      if self._data_config.shuffle:
+        # shuffle input files
+        dataset = dataset.shuffle(len(file_paths))
+      # too many readers read the same file will cause performance issues
+      # as the same data will be read multiple times
+      parallel_num = min(num_parallel_calls, len(file_paths))
+      dataset = dataset.interleave(
+          tf.data.TextLineDataset,
+          cycle_length=parallel_num,
+          num_parallel_calls=parallel_num)
+      if not self._data_config.file_shard:
+        dataset = self._safe_shard(dataset)
+      if self._data_config.shuffle:
+        dataset = dataset.shuffle(
+            self._data_config.shuffle_buffer_size,
+            seed=2020,
+            reshuffle_each_iteration=True)
+      dataset = dataset.repeat(self.num_epochs)
+    else:
+      logging.info('eval files[%d]: %s' %
+                   (len(file_paths), ','.join(file_paths)))
+      dataset = tf.data.TextLineDataset(file_paths)
+      dataset = dataset.repeat(1)
+    dataset = dataset.batch(batch_size=self._data_config.batch_size)
+    dataset = dataset.map(
+        self._parse_csv,
+        num_parallel_calls=self._data_config.num_parallel_calls)
+    # preprocess is necessary to transform data
+    # so that they could be feed into FeatureColumns
+    dataset = dataset.map(
+        map_func=self._preprocess,
+        num_parallel_calls=self._data_config.num_parallel_calls)
+    dataset = dataset.prefetch(buffer_size=self._prefetch_size)
+    if mode != tf.estimator.ModeKeys.PREDICT:
+      dataset = dataset.map(lambda x:
+                            (self._get_features(x), self._get_labels(x)))
+    else:
+      dataset = dataset.map(lambda x: (self._get_features(x)))
+    return dataset

easy_rec/python/input/rtp_input_v2.py ADDED Viewed

@@ -0,0 +1,145 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
+import tensorflow as tf
+from easy_rec.python.input.input import Input
+from easy_rec.python.protos.dataset_pb2 import DatasetConfig
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+class RTPInputV2(Input):
+  """RTPInput for parsing rtp fg input format.
+  the original rtp format, it is not efficient for training, the performance have to be tuned.
+  """
+  def __init__(self,
+               data_config,
+               feature_config,
+               input_path,
+               task_index=0,
+               task_num=1,
+               check_mode=False,
+               pipeline_config=None):
+    super(RTPInputV2,
+          self).__init__(data_config, feature_config, input_path, task_index,
+                         task_num, check_mode, pipeline_config)
+  def _parse_rtp(self, lines):
+    tf_types = [tf.string for x in self._input_field_types]
+    def _parse_one_line_tf(line):
+      line = tf.expand_dims(line, axis=0)
+      field_toks = tf.string_split(line, '\002').values
+      field_vals = tf.string_split(field_toks, '\003').values
+      field_vals = tf.reshape(field_vals, [-1, 2])
+      keys = field_vals[:, 0]
+      vals = field_vals[:, 1]
+      temp_vals = [
+          str(
+              self.get_type_defaults(self._input_field_types[i],
+                                     self._input_field_defaults[i]))
+          for i in range(len(self._input_fields))
+      ]
+      for i, key in enumerate(self._input_fields):
+        msk = tf.equal(key, keys)
+        val = tf.boolean_mask(vals, msk)
+        def_val = self.get_type_defaults(self._input_field_types[i],
+                                         self._input_field_defaults[i])
+        temp_vals[i] = tf.cond(
+            tf.reduce_any(msk), lambda: tf.reduce_join(val, separator=','),
+            lambda: tf.constant(str(def_val)))
+      return temp_vals
+    fields = tf.map_fn(
+        _parse_one_line_tf,
+        lines,
+        tf_types,
+        parallel_iterations=64,
+        name='parse_one_line_tf_map_fn')
+    def _convert(x, target_type, name):
+      if target_type in [DatasetConfig.FLOAT, DatasetConfig.DOUBLE]:
+        return tf.string_to_number(
+            x, tf.float32, name='convert_input_flt32/%s' % name)
+      elif target_type == DatasetConfig.INT32:
+        return tf.string_to_number(
+            x, tf.int32, name='convert_input_int32/%s' % name)
+      elif target_type == DatasetConfig.INT64:
+        return tf.string_to_number(
+            x, tf.int64, name='convert_input_int64/%s' % name)
+      return x
+    inputs = {
+        self._input_fields[x]: _convert(fields[x], self._input_field_types[x],
+                                        self._input_fields[x])
+        for x in self._effective_fids
+    }
+    for x in self._label_fids:
+      inputs[self._input_fields[x]] = fields[x]
+    return inputs
+  def _build(self, mode, params):
+    if type(self._input_path) != list:
+      self._input_path = self._input_path.split(',')
+    file_paths = []
+    for x in self._input_path:
+      file_paths.extend(tf.gfile.Glob(x))
+    assert len(file_paths) > 0, 'match no files with %s' % self._input_path
+    num_parallel_calls = self._data_config.num_parallel_calls
+    if mode == tf.estimator.ModeKeys.TRAIN:
+      logging.info('train files[%d]: %s' %
+                   (len(file_paths), ','.join(file_paths)))
+      dataset = tf.data.Dataset.from_tensor_slices(file_paths)
+      if self._data_config.file_shard:
+        dataset = self._safe_shard(dataset)
+      if self._data_config.shuffle:
+        # shuffle input files
+        dataset = dataset.shuffle(len(file_paths))
+      # too many readers read the same file will cause performance issues
+      # as the same data will be read multiple times
+      parallel_num = min(num_parallel_calls, len(file_paths))
+      dataset = dataset.interleave(
+          tf.data.TextLineDataset,
+          cycle_length=parallel_num,
+          num_parallel_calls=parallel_num)
+      if not self._data_config.file_shard:
+        dataset = self._safe_shard(dataset)
+      if self._data_config.shuffle:
+        dataset = dataset.shuffle(
+            self._data_config.shuffle_buffer_size,
+            seed=2020,
+            reshuffle_each_iteration=True)
+      dataset = dataset.repeat(self.num_epochs)
+    else:
+      logging.info('eval files[%d]: %s' %
+                   (len(file_paths), ','.join(file_paths)))
+      dataset = tf.data.TextLineDataset(file_paths)
+      dataset = dataset.repeat(1)
+    dataset = dataset.batch(self._data_config.batch_size)
+    dataset = dataset.map(
+        self._parse_rtp, num_parallel_calls=num_parallel_calls)
+    dataset = dataset.prefetch(buffer_size=self._prefetch_size)
+    dataset = dataset.map(
+        map_func=self._preprocess, num_parallel_calls=num_parallel_calls)
+    dataset = dataset.prefetch(buffer_size=self._prefetch_size)
+    if mode != tf.estimator.ModeKeys.PREDICT:
+      dataset = dataset.map(lambda x:
+                            (self._get_features(x), self._get_labels(x)))
+    else:
+      dataset = dataset.map(lambda x: (self._get_features(x)))
+    return dataset

easy_rec/python/input/tfrecord_input.py ADDED Viewed

@@ -0,0 +1,100 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
+import tensorflow as tf
+from easy_rec.python.input.input import Input
+from easy_rec.python.utils.tf_utils import get_tf_type
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+class TFRecordInput(Input):
+  def __init__(self,
+               data_config,
+               feature_config,
+               input_path,
+               task_index=0,
+               task_num=1,
+               check_mode=False,
+               pipeline_config=None):
+    super(TFRecordInput,
+          self).__init__(data_config, feature_config, input_path, task_index,
+                         task_num, check_mode, pipeline_config)
+    self.feature_desc = {}
+    for x, t, d, s in zip(self._input_fields, self._input_field_types,
+                          self._input_field_defaults, self._input_dims):
+      d = self.get_type_defaults(t, d)
+      t = get_tf_type(t)
+      if s == 1:
+        self.feature_desc[x] = tf.FixedLenFeature(
+            dtype=t, shape=[s], default_value=d)
+      else:
+        self.feature_desc[x] = tf.FixedLenFeature(
+            dtype=t, shape=[s], default_value=[d] * s)
+  def _parse_tfrecord(self, example):
+    try:
+      inputs = tf.parse_single_example(example, features=self.feature_desc)
+    except AttributeError:
+      inputs = tf.io.parse_single_example(example, features=self.feature_desc)
+    return inputs
+  def _build(self, mode, params):
+    if type(self._input_path) != list:
+      self._input_path = self._input_path.split(',')
+    file_paths = []
+    for x in self._input_path:
+      file_paths.extend(tf.gfile.Glob(x))
+    assert len(file_paths) > 0, 'match no files with %s' % self._input_path
+    num_parallel_calls = self._data_config.num_parallel_calls
+    data_compression_type = self._data_config.data_compression_type
+    if mode == tf.estimator.ModeKeys.TRAIN:
+      logging.info('train files[%d]: %s' %
+                   (len(file_paths), ','.join(file_paths)))
+      dataset = tf.data.Dataset.from_tensor_slices(file_paths)
+      if self._data_config.shuffle:
+        # shuffle input files
+        dataset = dataset.shuffle(len(file_paths))
+      # too many readers read the same file will cause performance issues
+      # as the same data will be read multiple times
+      parallel_num = min(num_parallel_calls, len(file_paths))
+      dataset = dataset.interleave(
+          lambda x: tf.data.TFRecordDataset(
+              x, compression_type=data_compression_type),
+          cycle_length=parallel_num,
+          num_parallel_calls=parallel_num)
+      dataset = dataset.shard(self._task_num, self._task_index)
+      if self._data_config.shuffle:
+        dataset = dataset.shuffle(
+            self._data_config.shuffle_buffer_size,
+            seed=2020,
+            reshuffle_each_iteration=True)
+      dataset = dataset.repeat(self.num_epochs)
+    else:
+      logging.info('eval files[%d]: %s' %
+                   (len(file_paths), ','.join(file_paths)))
+      dataset = tf.data.TFRecordDataset(
+          file_paths, compression_type=data_compression_type)
+      dataset = dataset.repeat(1)
+    dataset = dataset.map(
+        self._parse_tfrecord, num_parallel_calls=num_parallel_calls)
+    dataset = dataset.batch(self._data_config.batch_size)
+    dataset = dataset.prefetch(buffer_size=self._prefetch_size)
+    dataset = dataset.map(
+        map_func=self._preprocess, num_parallel_calls=num_parallel_calls)
+    dataset = dataset.prefetch(buffer_size=self._prefetch_size)
+    if mode != tf.estimator.ModeKeys.PREDICT:
+      dataset = dataset.map(lambda x:
+                            (self._get_features(x), self._get_labels(x)))
+    else:
+      dataset = dataset.map(lambda x: (self._get_features(x)))
+    return dataset

easy_rec/python/layers/__init__.py ADDED Viewed

File without changes