PyPI - easy-cs-rec-custommodel - Versions diffs - 0.8.6__py2.py3-none-any.whl - Mend

easy-cs-rec-custommodel 0.8.6__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of easy-cs-rec-custommodel might be problematic. Click here for more details.

Files changed (336) hide show

easy_cs_rec_custommodel-0.8.6.dist-info/LICENSE +203 -0
easy_cs_rec_custommodel-0.8.6.dist-info/METADATA +48 -0
easy_cs_rec_custommodel-0.8.6.dist-info/RECORD +336 -0
easy_cs_rec_custommodel-0.8.6.dist-info/WHEEL +6 -0
easy_cs_rec_custommodel-0.8.6.dist-info/top_level.txt +2 -0
easy_rec/__init__.py +114 -0
easy_rec/python/__init__.py +0 -0
easy_rec/python/builders/__init__.py +0 -0
easy_rec/python/builders/hyperparams_builder.py +78 -0
easy_rec/python/builders/loss_builder.py +333 -0
easy_rec/python/builders/optimizer_builder.py +211 -0
easy_rec/python/builders/strategy_builder.py +44 -0
easy_rec/python/compat/__init__.py +0 -0
easy_rec/python/compat/adam_s.py +245 -0
easy_rec/python/compat/array_ops.py +229 -0
easy_rec/python/compat/dynamic_variable.py +542 -0
easy_rec/python/compat/early_stopping.py +653 -0
easy_rec/python/compat/embedding_ops.py +162 -0
easy_rec/python/compat/embedding_parallel_saver.py +316 -0
easy_rec/python/compat/estimator_train.py +116 -0
easy_rec/python/compat/exporter.py +473 -0
easy_rec/python/compat/feature_column/__init__.py +0 -0
easy_rec/python/compat/feature_column/feature_column.py +3675 -0
easy_rec/python/compat/feature_column/feature_column_v2.py +5233 -0
easy_rec/python/compat/feature_column/sequence_feature_column.py +648 -0
easy_rec/python/compat/feature_column/utils.py +154 -0
easy_rec/python/compat/layers.py +329 -0
easy_rec/python/compat/ops.py +14 -0
easy_rec/python/compat/optimizers.py +619 -0
easy_rec/python/compat/queues.py +311 -0
easy_rec/python/compat/regularizers.py +208 -0
easy_rec/python/compat/sok_optimizer.py +440 -0
easy_rec/python/compat/sync_replicas_optimizer.py +528 -0
easy_rec/python/compat/weight_decay_optimizers.py +475 -0
easy_rec/python/core/__init__.py +0 -0
easy_rec/python/core/easyrec_metrics/__init__.py +24 -0
easy_rec/python/core/easyrec_metrics/distribute_metrics_impl_pai.py +3702 -0
easy_rec/python/core/easyrec_metrics/distribute_metrics_impl_tf.py +3768 -0
easy_rec/python/core/learning_schedules.py +228 -0
easy_rec/python/core/metrics.py +402 -0
easy_rec/python/core/sampler.py +844 -0
easy_rec/python/eval.py +102 -0
easy_rec/python/export.py +150 -0
easy_rec/python/feature_column/__init__.py +0 -0
easy_rec/python/feature_column/feature_column.py +664 -0
easy_rec/python/feature_column/feature_group.py +89 -0
easy_rec/python/hpo/__init__.py +0 -0
easy_rec/python/hpo/emr_hpo.py +140 -0
easy_rec/python/hpo/generate_hpo_sql.py +71 -0
easy_rec/python/hpo/pai_hpo.py +297 -0
easy_rec/python/inference/__init__.py +0 -0
easy_rec/python/inference/csv_predictor.py +189 -0
easy_rec/python/inference/hive_parquet_predictor.py +200 -0
easy_rec/python/inference/hive_predictor.py +166 -0
easy_rec/python/inference/odps_predictor.py +70 -0
easy_rec/python/inference/parquet_predictor.py +147 -0
easy_rec/python/inference/parquet_predictor_v2.py +147 -0
easy_rec/python/inference/predictor.py +621 -0
easy_rec/python/inference/processor/__init__.py +0 -0
easy_rec/python/inference/processor/test.py +170 -0
easy_rec/python/inference/vector_retrieve.py +124 -0
easy_rec/python/input/__init__.py +0 -0
easy_rec/python/input/batch_tfrecord_input.py +117 -0
easy_rec/python/input/criteo_binary_reader.py +259 -0
easy_rec/python/input/criteo_input.py +107 -0
easy_rec/python/input/csv_input.py +175 -0
easy_rec/python/input/csv_input_ex.py +72 -0
easy_rec/python/input/csv_input_v2.py +68 -0
easy_rec/python/input/datahub_input.py +320 -0
easy_rec/python/input/dummy_input.py +58 -0
easy_rec/python/input/hive_input.py +123 -0
easy_rec/python/input/hive_parquet_input.py +140 -0
easy_rec/python/input/hive_rtp_input.py +174 -0
easy_rec/python/input/input.py +1064 -0
easy_rec/python/input/kafka_dataset.py +144 -0
easy_rec/python/input/kafka_input.py +235 -0
easy_rec/python/input/load_parquet.py +317 -0
easy_rec/python/input/odps_input.py +101 -0
easy_rec/python/input/odps_input_v2.py +110 -0
easy_rec/python/input/odps_input_v3.py +132 -0
easy_rec/python/input/odps_rtp_input.py +187 -0
easy_rec/python/input/odps_rtp_input_v2.py +104 -0
easy_rec/python/input/parquet_input.py +397 -0
easy_rec/python/input/parquet_input_v2.py +180 -0
easy_rec/python/input/parquet_input_v3.py +203 -0
easy_rec/python/input/rtp_input.py +225 -0
easy_rec/python/input/rtp_input_v2.py +145 -0
easy_rec/python/input/tfrecord_input.py +100 -0
easy_rec/python/layers/__init__.py +0 -0
easy_rec/python/layers/backbone.py +571 -0
easy_rec/python/layers/capsule_layer.py +176 -0
easy_rec/python/layers/cmbf.py +390 -0
easy_rec/python/layers/common_layers.py +192 -0
easy_rec/python/layers/dnn.py +87 -0
easy_rec/python/layers/embed_input_layer.py +25 -0
easy_rec/python/layers/fm.py +26 -0
easy_rec/python/layers/input_layer.py +396 -0
easy_rec/python/layers/keras/__init__.py +34 -0
easy_rec/python/layers/keras/activation.py +114 -0
easy_rec/python/layers/keras/attention.py +267 -0
easy_rec/python/layers/keras/auxiliary_loss.py +47 -0
easy_rec/python/layers/keras/blocks.py +262 -0
easy_rec/python/layers/keras/bst.py +119 -0
easy_rec/python/layers/keras/custom_ops.py +250 -0
easy_rec/python/layers/keras/data_augment.py +133 -0
easy_rec/python/layers/keras/din.py +67 -0
easy_rec/python/layers/keras/einsum_dense.py +598 -0
easy_rec/python/layers/keras/embedding.py +81 -0
easy_rec/python/layers/keras/fibinet.py +251 -0
easy_rec/python/layers/keras/interaction.py +416 -0
easy_rec/python/layers/keras/layer_norm.py +364 -0
easy_rec/python/layers/keras/mask_net.py +166 -0
easy_rec/python/layers/keras/multi_head_attention.py +717 -0
easy_rec/python/layers/keras/multi_task.py +125 -0
easy_rec/python/layers/keras/numerical_embedding.py +376 -0
easy_rec/python/layers/keras/ppnet.py +194 -0
easy_rec/python/layers/keras/transformer.py +192 -0
easy_rec/python/layers/layer_norm.py +51 -0
easy_rec/python/layers/mmoe.py +83 -0
easy_rec/python/layers/multihead_attention.py +162 -0
easy_rec/python/layers/multihead_cross_attention.py +749 -0
easy_rec/python/layers/senet.py +73 -0
easy_rec/python/layers/seq_input_layer.py +134 -0
easy_rec/python/layers/sequence_feature_layer.py +249 -0
easy_rec/python/layers/uniter.py +301 -0
easy_rec/python/layers/utils.py +248 -0
easy_rec/python/layers/variational_dropout_layer.py +130 -0
easy_rec/python/loss/__init__.py +0 -0
easy_rec/python/loss/circle_loss.py +82 -0
easy_rec/python/loss/contrastive_loss.py +79 -0
easy_rec/python/loss/f1_reweight_loss.py +38 -0
easy_rec/python/loss/focal_loss.py +93 -0
easy_rec/python/loss/jrc_loss.py +128 -0
easy_rec/python/loss/listwise_loss.py +161 -0
easy_rec/python/loss/multi_similarity.py +68 -0
easy_rec/python/loss/pairwise_loss.py +307 -0
easy_rec/python/loss/softmax_loss_with_negative_mining.py +110 -0
easy_rec/python/loss/zero_inflated_lognormal.py +76 -0
easy_rec/python/main.py +878 -0
easy_rec/python/model/__init__.py +0 -0
easy_rec/python/model/autoint.py +73 -0
easy_rec/python/model/cmbf.py +47 -0
easy_rec/python/model/collaborative_metric_learning.py +182 -0
easy_rec/python/model/custom_model.py +323 -0
easy_rec/python/model/dat.py +138 -0
easy_rec/python/model/dbmtl.py +116 -0
easy_rec/python/model/dcn.py +70 -0
easy_rec/python/model/deepfm.py +106 -0
easy_rec/python/model/dlrm.py +73 -0
easy_rec/python/model/dropoutnet.py +207 -0
easy_rec/python/model/dssm.py +154 -0
easy_rec/python/model/dssm_senet.py +143 -0
easy_rec/python/model/dummy_model.py +48 -0
easy_rec/python/model/easy_rec_estimator.py +739 -0
easy_rec/python/model/easy_rec_model.py +467 -0
easy_rec/python/model/esmm.py +242 -0
easy_rec/python/model/fm.py +63 -0
easy_rec/python/model/match_model.py +357 -0
easy_rec/python/model/mind.py +445 -0
easy_rec/python/model/mmoe.py +70 -0
easy_rec/python/model/multi_task_model.py +303 -0
easy_rec/python/model/multi_tower.py +62 -0
easy_rec/python/model/multi_tower_bst.py +190 -0
easy_rec/python/model/multi_tower_din.py +130 -0
easy_rec/python/model/multi_tower_recall.py +68 -0
easy_rec/python/model/pdn.py +203 -0
easy_rec/python/model/ple.py +120 -0
easy_rec/python/model/rank_model.py +485 -0
easy_rec/python/model/rocket_launching.py +203 -0
easy_rec/python/model/simple_multi_task.py +54 -0
easy_rec/python/model/uniter.py +46 -0
easy_rec/python/model/wide_and_deep.py +121 -0
easy_rec/python/ops/1.12/incr_record.so +0 -0
easy_rec/python/ops/1.12/kafka.so +0 -0
easy_rec/python/ops/1.12/libcustom_ops.so +0 -0
easy_rec/python/ops/1.12/libembed_op.so +0 -0
easy_rec/python/ops/1.12/libhiredis.so.1.0.0 +0 -0
easy_rec/python/ops/1.12/librdkafka++.so.1 +0 -0
easy_rec/python/ops/1.12/librdkafka.so.1 +0 -0
easy_rec/python/ops/1.12/libredis++.so +0 -0
easy_rec/python/ops/1.12/libredis++.so.1 +0 -0
easy_rec/python/ops/1.12/libredis++.so.1.2.3 +0 -0
easy_rec/python/ops/1.12/libstr_avx_op.so +0 -0
easy_rec/python/ops/1.12/libwrite_sparse_kv.so +0 -0
easy_rec/python/ops/1.15/incr_record.so +0 -0
easy_rec/python/ops/1.15/kafka.so +0 -0
easy_rec/python/ops/1.15/libcustom_ops.so +0 -0
easy_rec/python/ops/1.15/libembed_op.so +0 -0
easy_rec/python/ops/1.15/libhiredis.so.1.0.0 +0 -0
easy_rec/python/ops/1.15/librdkafka++.so +0 -0
easy_rec/python/ops/1.15/librdkafka++.so.1 +0 -0
easy_rec/python/ops/1.15/librdkafka.so +0 -0
easy_rec/python/ops/1.15/librdkafka.so.1 +0 -0
easy_rec/python/ops/1.15/libredis++.so.1 +0 -0
easy_rec/python/ops/1.15/libstr_avx_op.so +0 -0
easy_rec/python/ops/2.12/libcustom_ops.so +0 -0
easy_rec/python/ops/2.12/libload_embed.so +0 -0
easy_rec/python/ops/2.12/libstr_avx_op.so +0 -0
easy_rec/python/ops/__init__.py +0 -0
easy_rec/python/ops/gen_kafka_ops.py +193 -0
easy_rec/python/ops/gen_str_avx_op.py +28 -0
easy_rec/python/ops/incr_record.py +30 -0
easy_rec/python/predict.py +170 -0
easy_rec/python/protos/__init__.py +0 -0
easy_rec/python/protos/autoint_pb2.py +122 -0
easy_rec/python/protos/backbone_pb2.py +1416 -0
easy_rec/python/protos/cmbf_pb2.py +435 -0
easy_rec/python/protos/collaborative_metric_learning_pb2.py +252 -0
easy_rec/python/protos/custom_model_pb2.py +57 -0
easy_rec/python/protos/dat_pb2.py +262 -0
easy_rec/python/protos/data_source_pb2.py +422 -0
easy_rec/python/protos/dataset_pb2.py +1920 -0
easy_rec/python/protos/dbmtl_pb2.py +191 -0
easy_rec/python/protos/dcn_pb2.py +197 -0
easy_rec/python/protos/deepfm_pb2.py +163 -0
easy_rec/python/protos/dlrm_pb2.py +163 -0
easy_rec/python/protos/dnn_pb2.py +329 -0
easy_rec/python/protos/dropoutnet_pb2.py +239 -0
easy_rec/python/protos/dssm_pb2.py +262 -0
easy_rec/python/protos/dssm_senet_pb2.py +282 -0
easy_rec/python/protos/easy_rec_model_pb2.py +1672 -0
easy_rec/python/protos/esmm_pb2.py +133 -0
easy_rec/python/protos/eval_pb2.py +930 -0
easy_rec/python/protos/export_pb2.py +379 -0
easy_rec/python/protos/feature_config_pb2.py +1359 -0
easy_rec/python/protos/fm_pb2.py +90 -0
easy_rec/python/protos/hive_config_pb2.py +138 -0
easy_rec/python/protos/hyperparams_pb2.py +624 -0
easy_rec/python/protos/keras_layer_pb2.py +692 -0
easy_rec/python/protos/layer_pb2.py +1936 -0
easy_rec/python/protos/loss_pb2.py +1713 -0
easy_rec/python/protos/mind_pb2.py +497 -0
easy_rec/python/protos/mmoe_pb2.py +215 -0
easy_rec/python/protos/multi_tower_pb2.py +295 -0
easy_rec/python/protos/multi_tower_recall_pb2.py +198 -0
easy_rec/python/protos/optimizer_pb2.py +2017 -0
easy_rec/python/protos/pdn_pb2.py +293 -0
easy_rec/python/protos/pipeline_pb2.py +516 -0
easy_rec/python/protos/ple_pb2.py +231 -0
easy_rec/python/protos/predict_pb2.py +1140 -0
easy_rec/python/protos/rocket_launching_pb2.py +169 -0
easy_rec/python/protos/seq_encoder_pb2.py +1084 -0
easy_rec/python/protos/simi_pb2.py +54 -0
easy_rec/python/protos/simple_multi_task_pb2.py +97 -0
easy_rec/python/protos/tf_predict_pb2.py +630 -0
easy_rec/python/protos/tower_pb2.py +661 -0
easy_rec/python/protos/train_pb2.py +1197 -0
easy_rec/python/protos/uniter_pb2.py +307 -0
easy_rec/python/protos/variational_dropout_pb2.py +91 -0
easy_rec/python/protos/wide_and_deep_pb2.py +131 -0
easy_rec/python/test/__init__.py +0 -0
easy_rec/python/test/csv_input_test.py +340 -0
easy_rec/python/test/custom_early_stop_func.py +19 -0
easy_rec/python/test/dh_local_run.py +104 -0
easy_rec/python/test/embed_test.py +155 -0
easy_rec/python/test/emr_run.py +119 -0
easy_rec/python/test/eval_metric_test.py +107 -0
easy_rec/python/test/excel_convert_test.py +64 -0
easy_rec/python/test/export_test.py +513 -0
easy_rec/python/test/fg_test.py +70 -0
easy_rec/python/test/hive_input_test.py +311 -0
easy_rec/python/test/hpo_test.py +235 -0
easy_rec/python/test/kafka_test.py +373 -0
easy_rec/python/test/local_incr_test.py +122 -0
easy_rec/python/test/loss_test.py +110 -0
easy_rec/python/test/odps_command.py +61 -0
easy_rec/python/test/odps_local_run.py +86 -0
easy_rec/python/test/odps_run.py +254 -0
easy_rec/python/test/odps_test_cls.py +39 -0
easy_rec/python/test/odps_test_prepare.py +198 -0
easy_rec/python/test/odps_test_util.py +237 -0
easy_rec/python/test/pre_check_test.py +54 -0
easy_rec/python/test/predictor_test.py +394 -0
easy_rec/python/test/rtp_convert_test.py +133 -0
easy_rec/python/test/run.py +138 -0
easy_rec/python/test/train_eval_test.py +1299 -0
easy_rec/python/test/util_test.py +85 -0
easy_rec/python/test/zero_inflated_lognormal_test.py +53 -0
easy_rec/python/tools/__init__.py +0 -0
easy_rec/python/tools/add_boundaries_to_config.py +67 -0
easy_rec/python/tools/add_feature_info_to_config.py +145 -0
easy_rec/python/tools/convert_config_format.py +48 -0
easy_rec/python/tools/convert_rtp_data.py +79 -0
easy_rec/python/tools/convert_rtp_fg.py +106 -0
easy_rec/python/tools/create_config_from_excel.py +427 -0
easy_rec/python/tools/criteo/__init__.py +0 -0
easy_rec/python/tools/criteo/convert_data.py +157 -0
easy_rec/python/tools/edit_lookup_graph.py +134 -0
easy_rec/python/tools/faiss_index_pai.py +116 -0
easy_rec/python/tools/feature_selection.py +316 -0
easy_rec/python/tools/hit_rate_ds.py +223 -0
easy_rec/python/tools/hit_rate_pai.py +138 -0
easy_rec/python/tools/pre_check.py +120 -0
easy_rec/python/tools/predict_and_chk.py +111 -0
easy_rec/python/tools/read_kafka.py +55 -0
easy_rec/python/tools/split_model_pai.py +286 -0
easy_rec/python/tools/split_pdn_model_pai.py +272 -0
easy_rec/python/tools/test_saved_model.py +80 -0
easy_rec/python/tools/view_saved_model.py +39 -0
easy_rec/python/tools/write_kafka.py +65 -0
easy_rec/python/train_eval.py +325 -0
easy_rec/python/utils/__init__.py +15 -0
easy_rec/python/utils/activation.py +120 -0
easy_rec/python/utils/check_utils.py +87 -0
easy_rec/python/utils/compat.py +14 -0
easy_rec/python/utils/config_util.py +652 -0
easy_rec/python/utils/constant.py +43 -0
easy_rec/python/utils/convert_rtp_fg.py +616 -0
easy_rec/python/utils/dag.py +192 -0
easy_rec/python/utils/distribution_utils.py +268 -0
easy_rec/python/utils/ds_util.py +65 -0
easy_rec/python/utils/embedding_utils.py +73 -0
easy_rec/python/utils/estimator_utils.py +1036 -0
easy_rec/python/utils/export_big_model.py +630 -0
easy_rec/python/utils/expr_util.py +118 -0
easy_rec/python/utils/fg_util.py +53 -0
easy_rec/python/utils/hit_rate_utils.py +220 -0
easy_rec/python/utils/hive_utils.py +183 -0
easy_rec/python/utils/hpo_util.py +137 -0
easy_rec/python/utils/hvd_utils.py +56 -0
easy_rec/python/utils/input_utils.py +108 -0
easy_rec/python/utils/io_util.py +282 -0
easy_rec/python/utils/load_class.py +249 -0
easy_rec/python/utils/meta_graph_editor.py +941 -0
easy_rec/python/utils/multi_optimizer.py +62 -0
easy_rec/python/utils/numpy_utils.py +18 -0
easy_rec/python/utils/odps_util.py +79 -0
easy_rec/python/utils/pai_util.py +86 -0
easy_rec/python/utils/proto_util.py +90 -0
easy_rec/python/utils/restore_filter.py +89 -0
easy_rec/python/utils/shape_utils.py +432 -0
easy_rec/python/utils/static_shape.py +71 -0
easy_rec/python/utils/test_utils.py +866 -0
easy_rec/python/utils/tf_utils.py +56 -0
easy_rec/version.py +4 -0
test/__init__.py +0 -0

easy_rec/python/input/datahub_input.py ADDED Viewed

@@ -0,0 +1,320 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import json
+import logging
+import traceback
+import tensorflow as tf
+from tensorflow.python.framework import dtypes
+from easy_rec.python.input.input import Input
+from easy_rec.python.utils import odps_util
+from easy_rec.python.utils.config_util import parse_time
+if tf.__version__.startswith('1.'):
+  from tensorflow.python.platform import gfile
+else:
+  import tensorflow.io.gfile as gfile
+try:
+  import common_io
+except Exception:
+  common_io = None
+try:
+  from datahub import DataHub
+  from datahub.exceptions import DatahubException
+  from datahub.models import RecordType
+  from datahub.models import CursorType
+  import urllib3
+  urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+  logging.getLogger('datahub.account').setLevel(logging.INFO)
+except Exception:
+  logging.warning(
+      'DataHub is not installed[%s]. You can install it by: pip install pydatahub'
+      % traceback.format_exc())
+  DataHub = None
+class DataHubInput(Input):
+  """DataHubInput is used for online train."""
+  def __init__(self,
+               data_config,
+               feature_config,
+               datahub_config,
+               task_index=0,
+               task_num=1,
+               check_mode=False,
+               pipeline_config=None):
+    super(DataHubInput,
+          self).__init__(data_config, feature_config, '', task_index, task_num,
+                         check_mode, pipeline_config)
+    if DataHub is None:
+      logging.error('please install datahub: ',
+                    'pip install pydatahub ;Python 3.6 recommended')
+    try:
+      self._num_epoch = 0
+      self._datahub_config = datahub_config
+      if self._datahub_config is not None:
+        akId = self._datahub_config.akId
+        akSecret = self._datahub_config.akSecret
+        endpoint = self._datahub_config.endpoint
+        if not isinstance(akId, str):
+          akId = akId.encode('utf-8')
+          akSecret = akSecret.encode('utf-8')
+          endpoint = endpoint.encode('utf-8')
+        self._datahub = DataHub(akId, akSecret, endpoint)
+      else:
+        self._datahub = None
+    except Exception as ex:
+      logging.info('exception in init datahub: %s' % str(ex))
+      pass
+    self._offset_dict = {}
+    if datahub_config:
+      shard_result = self._datahub.list_shard(self._datahub_config.project,
+                                              self._datahub_config.topic)
+      shards = shard_result.shards
+      self._all_shards = shards
+      self._shards = [
+          shards[i] for i in range(len(shards)) if (i % task_num) == task_index
+      ]
+      logging.info('all shards: %s' % str(self._shards))
+      offset_type = datahub_config.WhichOneof('offset')
+      if offset_type == 'offset_time':
+        ts = parse_time(datahub_config.offset_time) * 1000
+        for x in self._all_shards:
+          ks = str(x.shard_id)
+          cursor_result = self._datahub.get_cursor(self._datahub_config.project,
+                                                   self._datahub_config.topic,
+                                                   ks, CursorType.SYSTEM_TIME,
+                                                   ts)
+          logging.info('shard[%s] cursor = %s' % (ks, cursor_result))
+          self._offset_dict[ks] = cursor_result.cursor
+      elif offset_type == 'offset_info':
+        self._offset_dict = json.loads(self._datahub_config.offset_info)
+      else:
+        self._offset_dict = {}
+      self._dh_field_names = []
+      self._dh_field_types = []
+      topic_info = self._datahub.get_topic(
+          project_name=self._datahub_config.project,
+          topic_name=self._datahub_config.topic)
+      for field in topic_info.record_schema.field_list:
+        self._dh_field_names.append(field.name)
+        self._dh_field_types.append(field.type.value)
+      assert len(
+          self._feature_fields) > 0, 'data_config.feature_fields are not set.'
+      for x in self._feature_fields:
+        assert x in self._dh_field_names, 'feature_field[%s] is not in datahub' % x
+      # feature column ids in datahub schema
+      self._dh_fea_ids = [
+          self._dh_field_names.index(x) for x in self._feature_fields
+      ]
+      for x in self._label_fields:
+        assert x in self._dh_field_names, 'label_field[%s] is not in datahub' % x
+      if self._data_config.HasField('sample_weight'):
+        x = self._data_config.sample_weight
+        assert x in self._dh_field_names, 'sample_weight[%s] is not in datahub' % x
+      self._read_cnt = 32
+      if len(self._dh_fea_ids) > 1:
+        self._filter_fea_func = lambda record: ''.join(
+            [record.values[x]
+             for x in self._dh_fea_ids]).split(chr(2))[1] == '-1024'
+      else:
+        dh_fea_id = self._dh_fea_ids[0]
+        self._filter_fea_func = lambda record: record.values[dh_fea_id].split(
+            self._data_config.separator)[1] == '-1024'
+  def _parse_record(self, *fields):
+    field_dict = {}
+    fields = list(fields)
+    def _dump_offsets():
+      all_offsets = {
+          x.shard_id: self._offset_dict[x.shard_id]
+          for x in self._shards
+          if x.shard_id in self._offset_dict
+      }
+      return json.dumps(all_offsets)
+    field_dict[Input.DATA_OFFSET] = tf.py_func(_dump_offsets, [], dtypes.string)
+    for x in self._label_fields:
+      dh_id = self._dh_field_names.index(x)
+      field_dict[x] = fields[dh_id]
+    feature_inputs = self.get_feature_input_fields()
+    # only for features, labels and sample_weight excluded
+    record_types = [
+        t for x, t in zip(self._input_fields, self._input_field_types)
+        if x in feature_inputs
+    ]
+    feature_num = len(record_types)
+    feature_fields = [
+        fields[self._dh_field_names.index(x)] for x in self._feature_fields
+    ]
+    feature = feature_fields[0]
+    for fea_id in range(1, len(feature_fields)):
+      feature = feature + self._data_config.separator + feature_fields[fea_id]
+    feature = tf.string_split(
+        feature, self._data_config.separator, skip_empty=False)
+    fields = tf.reshape(feature.values, [-1, feature_num])
+    for fid in range(feature_num):
+      field_dict[feature_inputs[fid]] = fields[:, fid]
+    return field_dict
+  def _preprocess(self, field_dict):
+    output_dict = super(DataHubInput, self)._preprocess(field_dict)
+    # append offset fields
+    if Input.DATA_OFFSET in field_dict:
+      output_dict[Input.DATA_OFFSET] = field_dict[Input.DATA_OFFSET]
+    # for _get_features to include DATA_OFFSET
+    if Input.DATA_OFFSET not in self._appended_fields:
+      self._appended_fields.append(Input.DATA_OFFSET)
+    return output_dict
+  def restore(self, checkpoint_path):
+    if checkpoint_path is None:
+      return
+    offset_path = checkpoint_path + '.offset'
+    if not gfile.Exists(offset_path):
+      return
+    logging.info('will restore datahub offset from  %s' % offset_path)
+    with gfile.GFile(offset_path, 'r') as fin:
+      offset_dict = json.load(fin)
+      for k in offset_dict:
+        v = offset_dict[k]
+        ks = str(k)
+        if ks not in self._offset_dict or v > self._offset_dict[ks]:
+          self._offset_dict[ks] = v
+  def _is_data_empty(self, record):
+    is_empty = True
+    for fid in self._dh_fea_ids:
+      if record.values[fid] is not None and len(record.values[fid]) > 0:
+        is_empty = False
+        break
+    return is_empty
+  def _dump_record(self, record):
+    feas = []
+    for fid in range(len(record.values)):
+      if fid not in self._dh_fea_ids:
+        feas.append(self._dh_field_names[fid] + ':' + str(record.values[fid]))
+    return ';'.join(feas)
+  def _datahub_generator(self):
+    logging.info('start epoch[%d]' % self._num_epoch)
+    self._num_epoch += 1
+    try:
+      self._datahub.wait_shards_ready(self._datahub_config.project,
+                                      self._datahub_config.topic)
+      topic_result = self._datahub.get_topic(self._datahub_config.project,
+                                             self._datahub_config.topic)
+      if topic_result.record_type != RecordType.TUPLE:
+        logging.error('datahub topic type(%s) illegal' %
+                      str(topic_result.record_type))
+      record_schema = topic_result.record_schema
+      tid = 0
+      while True:
+        shard_id = self._shards[tid].shard_id
+        tid += 1
+        if tid >= len(self._shards):
+          tid = 0
+        if shard_id not in self._offset_dict:
+          cursor_result = self._datahub.get_cursor(self._datahub_config.project,
+                                                   self._datahub_config.topic,
+                                                   shard_id, CursorType.OLDEST)
+          cursor = cursor_result.cursor
+        else:
+          cursor = self._offset_dict[shard_id]
+        get_result = self._datahub.get_tuple_records(
+            self._datahub_config.project, self._datahub_config.topic, shard_id,
+            record_schema, cursor, self._read_cnt)
+        count = get_result.record_count
+        if count == 0:
+          continue
+        for row_id, record in enumerate(get_result.records):
+          if self._is_data_empty(record):
+            logging.warning('skip empty data record: %s' %
+                            self._dump_record(record))
+            continue
+          if self._filter_fea_func is not None:
+            if self._filter_fea_func(record):
+              logging.warning('filter data record: %s' %
+                              self._dump_record(record))
+              continue
+          yield tuple(list(record.values))
+        if shard_id not in self._offset_dict or get_result.next_cursor > self._offset_dict[
+            shard_id]:
+          self._offset_dict[shard_id] = get_result.next_cursor
+    except DatahubException as ex:
+      logging.error('DatahubException: %s' % str(ex))
+  def _build(self, mode, params):
+    if mode == tf.estimator.ModeKeys.TRAIN:
+      assert self._datahub is not None, 'datahub_train_input is not set'
+    elif mode == tf.estimator.ModeKeys.EVAL:
+      assert self._datahub is not None, 'datahub_eval_input is not set'
+    # get input types
+    list_types = [
+        odps_util.odps_type_2_tf_type(x) for x in self._dh_field_types
+    ]
+    list_types = tuple(list_types)
+    list_shapes = [
+        tf.TensorShape([]) for x in range(0, len(self._dh_field_types))
+    ]
+    list_shapes = tuple(list_shapes)
+    # read datahub
+    dataset = tf.data.Dataset.from_generator(
+        self._datahub_generator,
+        output_types=list_types,
+        output_shapes=list_shapes)
+    if mode == tf.estimator.ModeKeys.TRAIN:
+      if self._data_config.shuffle:
+        dataset = dataset.shuffle(
+            self._data_config.shuffle_buffer_size,
+            seed=2020,
+            reshuffle_each_iteration=True)
+    dataset = dataset.batch(self._data_config.batch_size)
+    dataset = dataset.map(
+        self._parse_record,
+        num_parallel_calls=self._data_config.num_parallel_calls)
+    # preprocess is necessary to transform data
+    # so that they could be feed into FeatureColumns
+    dataset = dataset.map(
+        map_func=self._preprocess,
+        num_parallel_calls=self._data_config.num_parallel_calls)
+    dataset = dataset.prefetch(buffer_size=self._prefetch_size)
+    if mode != tf.estimator.ModeKeys.PREDICT:
+      dataset = dataset.map(lambda x:
+                            (self._get_features(x), self._get_labels(x)))
+    else:
+      dataset = dataset.map(lambda x: (self._get_features(x)))
+    return dataset

easy_rec/python/input/dummy_input.py ADDED Viewed

@@ -0,0 +1,58 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import tensorflow as tf
+from easy_rec.python.input.input import Input
+from easy_rec.python.utils.tf_utils import get_tf_type
+if tf.__version__ >= '2.0':
+  tf = tf.compat.v1
+class DummyInput(Input):
+  """Dummy memory input.
+  Dummy Input is used to debug the performance bottleneck of data pipeline.
+  """
+  def __init__(self,
+               data_config,
+               feature_config,
+               input_path,
+               task_index=0,
+               task_num=1,
+               check_mode=False,
+               pipeline_config=None,
+               input_vals={}):
+    super(DummyInput,
+          self).__init__(data_config, feature_config, input_path, task_index,
+                         task_num, check_mode, pipeline_config)
+    self._input_vals = input_vals
+  def _build(self, mode, params):
+    """Build fake constant input.
+    Args:
+      mode: tf.estimator.ModeKeys.TRAIN / tf.estimator.ModeKeys.EVAL / tf.estimator.ModeKeys.PREDICT
+      params: parameters passed by estimator, currently not used
+    Returns:
+      features tensor dict
+      label tensor dict
+    """
+    features = {}
+    for field, field_type, def_val in zip(self._input_fields,
+                                          self._input_field_types,
+                                          self._input_field_defaults):
+      tf_type = get_tf_type(field_type)
+      def_val = self.get_type_defaults(field_type, default_val=def_val)
+      if field in self._input_vals:
+        tensor = self._input_vals[field]
+      else:
+        tensor = tf.constant([def_val] * self._batch_size, dtype=tf_type)
+      features[field] = tensor
+    parse_dict = self._preprocess(features)
+    return self._get_features(parse_dict), self._get_labels(parse_dict)

easy_rec/python/input/hive_input.py ADDED Viewed

@@ -0,0 +1,123 @@
+# -*- coding: utf-8 -*-
+import logging
+import os
+import tensorflow as tf
+from easy_rec.python.input.input import Input
+from easy_rec.python.utils.hive_utils import HiveUtils
+class HiveInput(Input):
+  """Common IO based interface, could run at local or on data science."""
+  def __init__(self,
+               data_config,
+               feature_config,
+               input_path,
+               task_index=0,
+               task_num=1,
+               check_mode=False,
+               pipeline_config=None):
+    super(HiveInput,
+          self).__init__(data_config, feature_config, input_path, task_index,
+                         task_num, check_mode, pipeline_config)
+    if input_path is None:
+      return
+    self._data_config = data_config
+    self._feature_config = feature_config
+    self._hive_config = input_path
+    hive_util = HiveUtils(
+        data_config=self._data_config, hive_config=self._hive_config)
+    self._input_hdfs_path = hive_util.get_table_location(
+        self._hive_config.table_name)
+    self._input_table_col_names, self._input_table_col_types = hive_util.get_all_cols(
+        self._hive_config.table_name)
+  def _parse_csv(self, line):
+    record_defaults = []
+    for field_name in self._input_table_col_names:
+      if field_name in self._input_fields:
+        tid = self._input_fields.index(field_name)
+        record_defaults.append(
+            self.get_type_defaults(self._input_field_types[tid],
+                                   self._input_field_defaults[tid]))
+      else:
+        record_defaults.append('')
+    tmp_fields = tf.decode_csv(
+        line,
+        field_delim=self._data_config.separator,
+        record_defaults=record_defaults,
+        name='decode_csv')
+    fields = []
+    for x in self._input_fields:
+      assert x in self._input_table_col_names, 'Column %s not in Table %s.' % (
+          x, self._hive_config.table_name)
+      fields.append(tmp_fields[self._input_table_col_names.index(x)])
+    # filter only valid fields
+    inputs = {self._input_fields[x]: fields[x] for x in self._effective_fids}
+    for x in self._label_fids:
+      inputs[self._input_fields[x]] = fields[x]
+    return inputs
+  def _build(self, mode, params):
+    file_paths = tf.gfile.Glob(os.path.join(self._input_hdfs_path, '*'))
+    assert len(
+        file_paths) > 0, 'match no files with %s' % self._hive_config.table_name
+    num_parallel_calls = self._data_config.num_parallel_calls
+    if mode == tf.estimator.ModeKeys.TRAIN:
+      logging.info('train files[%d]: %s' %
+                   (len(file_paths), ','.join(file_paths)))
+      dataset = tf.data.Dataset.from_tensor_slices(file_paths)
+      if self._data_config.file_shard:
+        dataset = self._safe_shard(dataset)
+      if self._data_config.shuffle:
+        # shuffle input files
+        dataset = dataset.shuffle(len(file_paths))
+      # too many readers read the same file will cause performance issues
+      # as the same data will be read multiple times
+      parallel_num = min(num_parallel_calls, len(file_paths))
+      dataset = dataset.interleave(
+          lambda x: tf.data.TextLineDataset(x),
+          cycle_length=parallel_num,
+          num_parallel_calls=parallel_num)
+      if not self._data_config.file_shard:
+        dataset = self._safe_shard(dataset)
+      if self._data_config.shuffle:
+        dataset = dataset.shuffle(
+            self._data_config.shuffle_buffer_size,
+            seed=2020,
+            reshuffle_each_iteration=True)
+      dataset = dataset.repeat(self.num_epochs)
+    else:
+      logging.info('eval files[%d]: %s' %
+                   (len(file_paths), ','.join(file_paths)))
+      dataset = tf.data.TextLineDataset(file_paths)
+      dataset = dataset.repeat(1)
+    dataset = dataset.batch(self._data_config.batch_size)
+    dataset = dataset.map(
+        self._parse_csv, num_parallel_calls=num_parallel_calls)
+    dataset = dataset.prefetch(buffer_size=self._prefetch_size)
+    dataset = dataset.map(
+        map_func=self._preprocess, num_parallel_calls=num_parallel_calls)
+    dataset = dataset.prefetch(buffer_size=self._prefetch_size)
+    if mode != tf.estimator.ModeKeys.PREDICT:
+      dataset = dataset.map(lambda x:
+                            (self._get_features(x), self._get_labels(x)))
+    else:
+      dataset = dataset.map(lambda x: (self._get_features(x)))
+    return dataset

easy_rec/python/input/hive_parquet_input.py ADDED Viewed

@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+import logging
+import os
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+from easy_rec.python.input.input import Input
+from easy_rec.python.utils.hive_utils import HiveUtils
+from easy_rec.python.utils.tf_utils import get_tf_type
+class HiveParquetInput(Input):
+  """Common IO based interface, could run at local or on data science."""
+  def __init__(self,
+               data_config,
+               feature_config,
+               input_path,
+               task_index=0,
+               task_num=1,
+               check_mode=False,
+               pipeline_config=None):
+    super(HiveParquetInput,
+          self).__init__(data_config, feature_config, input_path, task_index,
+                         task_num, check_mode, pipeline_config)
+    if input_path is None:
+      return
+    self._data_config = data_config
+    self._feature_config = feature_config
+    self._hive_config = input_path
+    hive_util = HiveUtils(
+        data_config=self._data_config, hive_config=self._hive_config)
+    input_hdfs_path = hive_util.get_table_location(self._hive_config.table_name)
+    self._input_table_col_names, self._input_table_col_types = hive_util.get_all_cols(
+        self._hive_config.table_name)
+    self._all_hdfs_path = tf.gfile.Glob(os.path.join(input_hdfs_path, '*'))
+    for x in self._input_fields:
+      assert x in self._input_table_col_names, 'Column %s not in Table %s.' % (
+          x, self._hive_config.table_name)
+    self._record_defaults = [
+        self.get_type_defaults(t, v)
+        for t, v in zip(self._input_field_types, self._input_field_defaults)
+    ]
+  def _file_shard(self, file_paths, task_num, task_index):
+    if self._data_config.chief_redundant:
+      task_num = max(task_num - 1, 1)
+      task_index = max(task_index - 1, 0)
+    task_file_paths = []
+    for idx in range(task_index, len(file_paths), task_num):
+      task_file_paths.append(file_paths[idx])
+    return task_file_paths
+  def _parquet_read(self):
+    for input_path in self._input_hdfs_path:
+      if input_path.endswith('SUCCESS'):
+        continue
+      df = pd.read_parquet(input_path, engine='pyarrow')
+      df = df[self._input_fields]
+      df.replace('', np.nan, inplace=True)
+      df.replace('NULL', np.nan, inplace=True)
+      total_records_num = len(df)
+      for k, v in zip(self._input_fields, self._record_defaults):
+        df[k].fillna(v, inplace=True)
+      for start_idx in range(0, total_records_num,
+                             self._data_config.batch_size):
+        end_idx = min(total_records_num,
+                      start_idx + self._data_config.batch_size)
+        batch_data = df[start_idx:end_idx]
+        inputs = []
+        for k in self._input_fields:
+          inputs.append(batch_data[k].to_numpy())
+        yield tuple(inputs)
+  def _parse_csv(self, *fields):
+    # filter only valid fields
+    inputs = {self._input_fields[x]: fields[x] for x in self._effective_fids}
+    # filter only valid labels
+    for x in self._label_fids:
+      inputs[self._input_fields[x]] = fields[x]
+    return inputs
+  def _build(self, mode, params):
+    # get input type
+    list_type = [get_tf_type(x) for x in self._input_field_types]
+    list_type = tuple(list_type)
+    list_shapes = [tf.TensorShape([None]) for x in range(0, len(list_type))]
+    list_shapes = tuple(list_shapes)
+    if len(self._all_hdfs_path) >= 2 * self._task_num:
+      file_shard = True
+      self._input_hdfs_path = self._file_shard(self._all_hdfs_path,
+                                               self._task_num, self._task_index)
+    else:
+      file_shard = False
+      self._input_hdfs_path = self._all_hdfs_path
+    logging.info('input path: %s' % self._input_hdfs_path)
+    assert len(self._input_hdfs_path
+               ) > 0, 'match no files with %s' % self._hive_config.table_name
+    dataset = tf.data.Dataset.from_generator(
+        self._parquet_read, output_types=list_type, output_shapes=list_shapes)
+    if not file_shard:
+      dataset = self._safe_shard(dataset)
+    if mode == tf.estimator.ModeKeys.TRAIN:
+      dataset = dataset.shuffle(
+          self._data_config.shuffle_buffer_size,
+          seed=2020,
+          reshuffle_each_iteration=True)
+      dataset = dataset.repeat(self.num_epochs)
+    else:
+      dataset = dataset.repeat(1)
+    dataset = dataset.map(
+        self._parse_csv,
+        num_parallel_calls=self._data_config.num_parallel_calls)
+    # preprocess is necessary to transform data
+    # so that they could be feed into FeatureColumns
+    dataset = dataset.map(
+        map_func=self._preprocess,
+        num_parallel_calls=self._data_config.num_parallel_calls)
+    dataset = dataset.prefetch(buffer_size=self._prefetch_size)
+    if mode != tf.estimator.ModeKeys.PREDICT:
+      dataset = dataset.map(lambda x:
+                            (self._get_features(x), self._get_labels(x)))
+    else:
+      dataset = dataset.map(lambda x: (self._get_features(x)))
+    return dataset