PyPI - easy-cs-rec-custommodel - Versions diffs - 0.8.6__py2.py3-none-any.whl - Mend

easy-cs-rec-custommodel 0.8.6__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of easy-cs-rec-custommodel might be problematic. Click here for more details.

Files changed (336) hide show

easy_cs_rec_custommodel-0.8.6.dist-info/LICENSE +203 -0
easy_cs_rec_custommodel-0.8.6.dist-info/METADATA +48 -0
easy_cs_rec_custommodel-0.8.6.dist-info/RECORD +336 -0
easy_cs_rec_custommodel-0.8.6.dist-info/WHEEL +6 -0
easy_cs_rec_custommodel-0.8.6.dist-info/top_level.txt +2 -0
easy_rec/__init__.py +114 -0
easy_rec/python/__init__.py +0 -0
easy_rec/python/builders/__init__.py +0 -0
easy_rec/python/builders/hyperparams_builder.py +78 -0
easy_rec/python/builders/loss_builder.py +333 -0
easy_rec/python/builders/optimizer_builder.py +211 -0
easy_rec/python/builders/strategy_builder.py +44 -0
easy_rec/python/compat/__init__.py +0 -0
easy_rec/python/compat/adam_s.py +245 -0
easy_rec/python/compat/array_ops.py +229 -0
easy_rec/python/compat/dynamic_variable.py +542 -0
easy_rec/python/compat/early_stopping.py +653 -0
easy_rec/python/compat/embedding_ops.py +162 -0
easy_rec/python/compat/embedding_parallel_saver.py +316 -0
easy_rec/python/compat/estimator_train.py +116 -0
easy_rec/python/compat/exporter.py +473 -0
easy_rec/python/compat/feature_column/__init__.py +0 -0
easy_rec/python/compat/feature_column/feature_column.py +3675 -0
easy_rec/python/compat/feature_column/feature_column_v2.py +5233 -0
easy_rec/python/compat/feature_column/sequence_feature_column.py +648 -0
easy_rec/python/compat/feature_column/utils.py +154 -0
easy_rec/python/compat/layers.py +329 -0
easy_rec/python/compat/ops.py +14 -0
easy_rec/python/compat/optimizers.py +619 -0
easy_rec/python/compat/queues.py +311 -0
easy_rec/python/compat/regularizers.py +208 -0
easy_rec/python/compat/sok_optimizer.py +440 -0
easy_rec/python/compat/sync_replicas_optimizer.py +528 -0
easy_rec/python/compat/weight_decay_optimizers.py +475 -0
easy_rec/python/core/__init__.py +0 -0
easy_rec/python/core/easyrec_metrics/__init__.py +24 -0
easy_rec/python/core/easyrec_metrics/distribute_metrics_impl_pai.py +3702 -0
easy_rec/python/core/easyrec_metrics/distribute_metrics_impl_tf.py +3768 -0
easy_rec/python/core/learning_schedules.py +228 -0
easy_rec/python/core/metrics.py +402 -0
easy_rec/python/core/sampler.py +844 -0
easy_rec/python/eval.py +102 -0
easy_rec/python/export.py +150 -0
easy_rec/python/feature_column/__init__.py +0 -0
easy_rec/python/feature_column/feature_column.py +664 -0
easy_rec/python/feature_column/feature_group.py +89 -0
easy_rec/python/hpo/__init__.py +0 -0
easy_rec/python/hpo/emr_hpo.py +140 -0
easy_rec/python/hpo/generate_hpo_sql.py +71 -0
easy_rec/python/hpo/pai_hpo.py +297 -0
easy_rec/python/inference/__init__.py +0 -0
easy_rec/python/inference/csv_predictor.py +189 -0
easy_rec/python/inference/hive_parquet_predictor.py +200 -0
easy_rec/python/inference/hive_predictor.py +166 -0
easy_rec/python/inference/odps_predictor.py +70 -0
easy_rec/python/inference/parquet_predictor.py +147 -0
easy_rec/python/inference/parquet_predictor_v2.py +147 -0
easy_rec/python/inference/predictor.py +621 -0
easy_rec/python/inference/processor/__init__.py +0 -0
easy_rec/python/inference/processor/test.py +170 -0
easy_rec/python/inference/vector_retrieve.py +124 -0
easy_rec/python/input/__init__.py +0 -0
easy_rec/python/input/batch_tfrecord_input.py +117 -0
easy_rec/python/input/criteo_binary_reader.py +259 -0
easy_rec/python/input/criteo_input.py +107 -0
easy_rec/python/input/csv_input.py +175 -0
easy_rec/python/input/csv_input_ex.py +72 -0
easy_rec/python/input/csv_input_v2.py +68 -0
easy_rec/python/input/datahub_input.py +320 -0
easy_rec/python/input/dummy_input.py +58 -0
easy_rec/python/input/hive_input.py +123 -0
easy_rec/python/input/hive_parquet_input.py +140 -0
easy_rec/python/input/hive_rtp_input.py +174 -0
easy_rec/python/input/input.py +1064 -0
easy_rec/python/input/kafka_dataset.py +144 -0
easy_rec/python/input/kafka_input.py +235 -0
easy_rec/python/input/load_parquet.py +317 -0
easy_rec/python/input/odps_input.py +101 -0
easy_rec/python/input/odps_input_v2.py +110 -0
easy_rec/python/input/odps_input_v3.py +132 -0
easy_rec/python/input/odps_rtp_input.py +187 -0
easy_rec/python/input/odps_rtp_input_v2.py +104 -0
easy_rec/python/input/parquet_input.py +397 -0
easy_rec/python/input/parquet_input_v2.py +180 -0
easy_rec/python/input/parquet_input_v3.py +203 -0
easy_rec/python/input/rtp_input.py +225 -0
easy_rec/python/input/rtp_input_v2.py +145 -0
easy_rec/python/input/tfrecord_input.py +100 -0
easy_rec/python/layers/__init__.py +0 -0
easy_rec/python/layers/backbone.py +571 -0
easy_rec/python/layers/capsule_layer.py +176 -0
easy_rec/python/layers/cmbf.py +390 -0
easy_rec/python/layers/common_layers.py +192 -0
easy_rec/python/layers/dnn.py +87 -0
easy_rec/python/layers/embed_input_layer.py +25 -0
easy_rec/python/layers/fm.py +26 -0
easy_rec/python/layers/input_layer.py +396 -0
easy_rec/python/layers/keras/__init__.py +34 -0
easy_rec/python/layers/keras/activation.py +114 -0
easy_rec/python/layers/keras/attention.py +267 -0
easy_rec/python/layers/keras/auxiliary_loss.py +47 -0
easy_rec/python/layers/keras/blocks.py +262 -0
easy_rec/python/layers/keras/bst.py +119 -0
easy_rec/python/layers/keras/custom_ops.py +250 -0
easy_rec/python/layers/keras/data_augment.py +133 -0
easy_rec/python/layers/keras/din.py +67 -0
easy_rec/python/layers/keras/einsum_dense.py +598 -0
easy_rec/python/layers/keras/embedding.py +81 -0
easy_rec/python/layers/keras/fibinet.py +251 -0
easy_rec/python/layers/keras/interaction.py +416 -0
easy_rec/python/layers/keras/layer_norm.py +364 -0
easy_rec/python/layers/keras/mask_net.py +166 -0
easy_rec/python/layers/keras/multi_head_attention.py +717 -0
easy_rec/python/layers/keras/multi_task.py +125 -0
easy_rec/python/layers/keras/numerical_embedding.py +376 -0
easy_rec/python/layers/keras/ppnet.py +194 -0
easy_rec/python/layers/keras/transformer.py +192 -0
easy_rec/python/layers/layer_norm.py +51 -0
easy_rec/python/layers/mmoe.py +83 -0
easy_rec/python/layers/multihead_attention.py +162 -0
easy_rec/python/layers/multihead_cross_attention.py +749 -0
easy_rec/python/layers/senet.py +73 -0
easy_rec/python/layers/seq_input_layer.py +134 -0
easy_rec/python/layers/sequence_feature_layer.py +249 -0
easy_rec/python/layers/uniter.py +301 -0
easy_rec/python/layers/utils.py +248 -0
easy_rec/python/layers/variational_dropout_layer.py +130 -0
easy_rec/python/loss/__init__.py +0 -0
easy_rec/python/loss/circle_loss.py +82 -0
easy_rec/python/loss/contrastive_loss.py +79 -0
easy_rec/python/loss/f1_reweight_loss.py +38 -0
easy_rec/python/loss/focal_loss.py +93 -0
easy_rec/python/loss/jrc_loss.py +128 -0
easy_rec/python/loss/listwise_loss.py +161 -0
easy_rec/python/loss/multi_similarity.py +68 -0
easy_rec/python/loss/pairwise_loss.py +307 -0
easy_rec/python/loss/softmax_loss_with_negative_mining.py +110 -0
easy_rec/python/loss/zero_inflated_lognormal.py +76 -0
easy_rec/python/main.py +878 -0
easy_rec/python/model/__init__.py +0 -0
easy_rec/python/model/autoint.py +73 -0
easy_rec/python/model/cmbf.py +47 -0
easy_rec/python/model/collaborative_metric_learning.py +182 -0
easy_rec/python/model/custom_model.py +323 -0
easy_rec/python/model/dat.py +138 -0
easy_rec/python/model/dbmtl.py +116 -0
easy_rec/python/model/dcn.py +70 -0
easy_rec/python/model/deepfm.py +106 -0
easy_rec/python/model/dlrm.py +73 -0
easy_rec/python/model/dropoutnet.py +207 -0
easy_rec/python/model/dssm.py +154 -0
easy_rec/python/model/dssm_senet.py +143 -0
easy_rec/python/model/dummy_model.py +48 -0
easy_rec/python/model/easy_rec_estimator.py +739 -0
easy_rec/python/model/easy_rec_model.py +467 -0
easy_rec/python/model/esmm.py +242 -0
easy_rec/python/model/fm.py +63 -0
easy_rec/python/model/match_model.py +357 -0
easy_rec/python/model/mind.py +445 -0
easy_rec/python/model/mmoe.py +70 -0
easy_rec/python/model/multi_task_model.py +303 -0
easy_rec/python/model/multi_tower.py +62 -0
easy_rec/python/model/multi_tower_bst.py +190 -0
easy_rec/python/model/multi_tower_din.py +130 -0
easy_rec/python/model/multi_tower_recall.py +68 -0
easy_rec/python/model/pdn.py +203 -0
easy_rec/python/model/ple.py +120 -0
easy_rec/python/model/rank_model.py +485 -0
easy_rec/python/model/rocket_launching.py +203 -0
easy_rec/python/model/simple_multi_task.py +54 -0
easy_rec/python/model/uniter.py +46 -0
easy_rec/python/model/wide_and_deep.py +121 -0
easy_rec/python/ops/1.12/incr_record.so +0 -0
easy_rec/python/ops/1.12/kafka.so +0 -0
easy_rec/python/ops/1.12/libcustom_ops.so +0 -0
easy_rec/python/ops/1.12/libembed_op.so +0 -0
easy_rec/python/ops/1.12/libhiredis.so.1.0.0 +0 -0
easy_rec/python/ops/1.12/librdkafka++.so.1 +0 -0
easy_rec/python/ops/1.12/librdkafka.so.1 +0 -0
easy_rec/python/ops/1.12/libredis++.so +0 -0
easy_rec/python/ops/1.12/libredis++.so.1 +0 -0
easy_rec/python/ops/1.12/libredis++.so.1.2.3 +0 -0
easy_rec/python/ops/1.12/libstr_avx_op.so +0 -0
easy_rec/python/ops/1.12/libwrite_sparse_kv.so +0 -0
easy_rec/python/ops/1.15/incr_record.so +0 -0
easy_rec/python/ops/1.15/kafka.so +0 -0
easy_rec/python/ops/1.15/libcustom_ops.so +0 -0
easy_rec/python/ops/1.15/libembed_op.so +0 -0
easy_rec/python/ops/1.15/libhiredis.so.1.0.0 +0 -0
easy_rec/python/ops/1.15/librdkafka++.so +0 -0
easy_rec/python/ops/1.15/librdkafka++.so.1 +0 -0
easy_rec/python/ops/1.15/librdkafka.so +0 -0
easy_rec/python/ops/1.15/librdkafka.so.1 +0 -0
easy_rec/python/ops/1.15/libredis++.so.1 +0 -0
easy_rec/python/ops/1.15/libstr_avx_op.so +0 -0
easy_rec/python/ops/2.12/libcustom_ops.so +0 -0
easy_rec/python/ops/2.12/libload_embed.so +0 -0
easy_rec/python/ops/2.12/libstr_avx_op.so +0 -0
easy_rec/python/ops/__init__.py +0 -0
easy_rec/python/ops/gen_kafka_ops.py +193 -0
easy_rec/python/ops/gen_str_avx_op.py +28 -0
easy_rec/python/ops/incr_record.py +30 -0
easy_rec/python/predict.py +170 -0
easy_rec/python/protos/__init__.py +0 -0
easy_rec/python/protos/autoint_pb2.py +122 -0
easy_rec/python/protos/backbone_pb2.py +1416 -0
easy_rec/python/protos/cmbf_pb2.py +435 -0
easy_rec/python/protos/collaborative_metric_learning_pb2.py +252 -0
easy_rec/python/protos/custom_model_pb2.py +57 -0
easy_rec/python/protos/dat_pb2.py +262 -0
easy_rec/python/protos/data_source_pb2.py +422 -0
easy_rec/python/protos/dataset_pb2.py +1920 -0
easy_rec/python/protos/dbmtl_pb2.py +191 -0
easy_rec/python/protos/dcn_pb2.py +197 -0
easy_rec/python/protos/deepfm_pb2.py +163 -0
easy_rec/python/protos/dlrm_pb2.py +163 -0
easy_rec/python/protos/dnn_pb2.py +329 -0
easy_rec/python/protos/dropoutnet_pb2.py +239 -0
easy_rec/python/protos/dssm_pb2.py +262 -0
easy_rec/python/protos/dssm_senet_pb2.py +282 -0
easy_rec/python/protos/easy_rec_model_pb2.py +1672 -0
easy_rec/python/protos/esmm_pb2.py +133 -0
easy_rec/python/protos/eval_pb2.py +930 -0
easy_rec/python/protos/export_pb2.py +379 -0
easy_rec/python/protos/feature_config_pb2.py +1359 -0
easy_rec/python/protos/fm_pb2.py +90 -0
easy_rec/python/protos/hive_config_pb2.py +138 -0
easy_rec/python/protos/hyperparams_pb2.py +624 -0
easy_rec/python/protos/keras_layer_pb2.py +692 -0
easy_rec/python/protos/layer_pb2.py +1936 -0
easy_rec/python/protos/loss_pb2.py +1713 -0
easy_rec/python/protos/mind_pb2.py +497 -0
easy_rec/python/protos/mmoe_pb2.py +215 -0
easy_rec/python/protos/multi_tower_pb2.py +295 -0
easy_rec/python/protos/multi_tower_recall_pb2.py +198 -0
easy_rec/python/protos/optimizer_pb2.py +2017 -0
easy_rec/python/protos/pdn_pb2.py +293 -0
easy_rec/python/protos/pipeline_pb2.py +516 -0
easy_rec/python/protos/ple_pb2.py +231 -0
easy_rec/python/protos/predict_pb2.py +1140 -0
easy_rec/python/protos/rocket_launching_pb2.py +169 -0
easy_rec/python/protos/seq_encoder_pb2.py +1084 -0
easy_rec/python/protos/simi_pb2.py +54 -0
easy_rec/python/protos/simple_multi_task_pb2.py +97 -0
easy_rec/python/protos/tf_predict_pb2.py +630 -0
easy_rec/python/protos/tower_pb2.py +661 -0
easy_rec/python/protos/train_pb2.py +1197 -0
easy_rec/python/protos/uniter_pb2.py +307 -0
easy_rec/python/protos/variational_dropout_pb2.py +91 -0
easy_rec/python/protos/wide_and_deep_pb2.py +131 -0
easy_rec/python/test/__init__.py +0 -0
easy_rec/python/test/csv_input_test.py +340 -0
easy_rec/python/test/custom_early_stop_func.py +19 -0
easy_rec/python/test/dh_local_run.py +104 -0
easy_rec/python/test/embed_test.py +155 -0
easy_rec/python/test/emr_run.py +119 -0
easy_rec/python/test/eval_metric_test.py +107 -0
easy_rec/python/test/excel_convert_test.py +64 -0
easy_rec/python/test/export_test.py +513 -0
easy_rec/python/test/fg_test.py +70 -0
easy_rec/python/test/hive_input_test.py +311 -0
easy_rec/python/test/hpo_test.py +235 -0
easy_rec/python/test/kafka_test.py +373 -0
easy_rec/python/test/local_incr_test.py +122 -0
easy_rec/python/test/loss_test.py +110 -0
easy_rec/python/test/odps_command.py +61 -0
easy_rec/python/test/odps_local_run.py +86 -0
easy_rec/python/test/odps_run.py +254 -0
easy_rec/python/test/odps_test_cls.py +39 -0
easy_rec/python/test/odps_test_prepare.py +198 -0
easy_rec/python/test/odps_test_util.py +237 -0
easy_rec/python/test/pre_check_test.py +54 -0
easy_rec/python/test/predictor_test.py +394 -0
easy_rec/python/test/rtp_convert_test.py +133 -0
easy_rec/python/test/run.py +138 -0
easy_rec/python/test/train_eval_test.py +1299 -0
easy_rec/python/test/util_test.py +85 -0
easy_rec/python/test/zero_inflated_lognormal_test.py +53 -0
easy_rec/python/tools/__init__.py +0 -0
easy_rec/python/tools/add_boundaries_to_config.py +67 -0
easy_rec/python/tools/add_feature_info_to_config.py +145 -0
easy_rec/python/tools/convert_config_format.py +48 -0
easy_rec/python/tools/convert_rtp_data.py +79 -0
easy_rec/python/tools/convert_rtp_fg.py +106 -0
easy_rec/python/tools/create_config_from_excel.py +427 -0
easy_rec/python/tools/criteo/__init__.py +0 -0
easy_rec/python/tools/criteo/convert_data.py +157 -0
easy_rec/python/tools/edit_lookup_graph.py +134 -0
easy_rec/python/tools/faiss_index_pai.py +116 -0
easy_rec/python/tools/feature_selection.py +316 -0
easy_rec/python/tools/hit_rate_ds.py +223 -0
easy_rec/python/tools/hit_rate_pai.py +138 -0
easy_rec/python/tools/pre_check.py +120 -0
easy_rec/python/tools/predict_and_chk.py +111 -0
easy_rec/python/tools/read_kafka.py +55 -0
easy_rec/python/tools/split_model_pai.py +286 -0
easy_rec/python/tools/split_pdn_model_pai.py +272 -0
easy_rec/python/tools/test_saved_model.py +80 -0
easy_rec/python/tools/view_saved_model.py +39 -0
easy_rec/python/tools/write_kafka.py +65 -0
easy_rec/python/train_eval.py +325 -0
easy_rec/python/utils/__init__.py +15 -0
easy_rec/python/utils/activation.py +120 -0
easy_rec/python/utils/check_utils.py +87 -0
easy_rec/python/utils/compat.py +14 -0
easy_rec/python/utils/config_util.py +652 -0
easy_rec/python/utils/constant.py +43 -0
easy_rec/python/utils/convert_rtp_fg.py +616 -0
easy_rec/python/utils/dag.py +192 -0
easy_rec/python/utils/distribution_utils.py +268 -0
easy_rec/python/utils/ds_util.py +65 -0
easy_rec/python/utils/embedding_utils.py +73 -0
easy_rec/python/utils/estimator_utils.py +1036 -0
easy_rec/python/utils/export_big_model.py +630 -0
easy_rec/python/utils/expr_util.py +118 -0
easy_rec/python/utils/fg_util.py +53 -0
easy_rec/python/utils/hit_rate_utils.py +220 -0
easy_rec/python/utils/hive_utils.py +183 -0
easy_rec/python/utils/hpo_util.py +137 -0
easy_rec/python/utils/hvd_utils.py +56 -0
easy_rec/python/utils/input_utils.py +108 -0
easy_rec/python/utils/io_util.py +282 -0
easy_rec/python/utils/load_class.py +249 -0
easy_rec/python/utils/meta_graph_editor.py +941 -0
easy_rec/python/utils/multi_optimizer.py +62 -0
easy_rec/python/utils/numpy_utils.py +18 -0
easy_rec/python/utils/odps_util.py +79 -0
easy_rec/python/utils/pai_util.py +86 -0
easy_rec/python/utils/proto_util.py +90 -0
easy_rec/python/utils/restore_filter.py +89 -0
easy_rec/python/utils/shape_utils.py +432 -0
easy_rec/python/utils/static_shape.py +71 -0
easy_rec/python/utils/test_utils.py +866 -0
easy_rec/python/utils/tf_utils.py +56 -0
easy_rec/version.py +4 -0
test/__init__.py +0 -0

easy_rec/python/tools/create_config_from_excel.py ADDED Viewed

@@ -0,0 +1,427 @@
+# -*-encoding:utf-8-*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
+import math
+import sys
+import numpy as np
+import pandas as pd
+from easy_rec.python.utils import config_util
+logging.basicConfig(
+    level=logging.INFO, format='[%(asctime)s][%(levelname)s] %(message)s')
+class ModelConfigConverter:
+  def __init__(self, excel_path, output_path, model_type, column_separator,
+               incol_separator, train_input_path, eval_input_path, model_dir):
+    self._excel_path = excel_path
+    self._output_path = output_path
+    self._model_type = model_type
+    self._column_separator = column_separator
+    self._incol_separator = incol_separator
+    self._dict_global = self._parse_global()
+    self._tower_dicts = {}
+    self._feature_names = []
+    self._feature_details = {}
+    self._label = ''
+    self._train_input_path = train_input_path
+    self._eval_input_path = eval_input_path
+    self._model_dir = model_dir
+    if not self._model_dir:
+      self._model_dir = 'experiments/demo'
+      logging.warning('model_dir is not specified, set to %s' % self._model_dir)
+  def _get_type_name(self, input_name):
+    type_dict = {
+        'bigint': 'INT64',
+        'double': 'DOUBLE',
+        'float': 'FLOAT',
+        'string': 'STRING',
+        'bool': 'BOOL'
+    }
+    return type_dict[input_name]
+  def _get_type_default(self, input_name):
+    type_dict = {
+        'bigint': '0',
+        'double': '0.0',
+        'float': '0.0',
+        'string': '',
+        'bool': 'false'
+    }
+    return type_dict[input_name]
+  def _parse_global(self):
+    df = pd.read_excel(self._excel_path, sheet_name='global')
+    dict_global = {}
+    for i, row in df.iterrows():
+      field = {}
+      name = field['name'] = row['name'].strip()
+      field['type_name'] = row['type']
+      field['hash_bucket_size'] = row['hash_bucket_size']
+      field['embedding_dim'] = row['embedding_dim']
+      field['default_value'] = row['default_value']
+      dict_global[name] = field
+    return dict_global
+  def _add_to_tower(self, tower_name, field):
+    if tower_name.lower() == 'nan':
+      return
+    if tower_name != 'label':
+      if self._model_type == 'deepfm':
+        if tower_name == 'deep':
+          tower_names = ['deep']
+        elif tower_name == 'wide':
+          tower_names = ['wide']
+        elif tower_name == 'wide_and_deep':
+          tower_names = ['wide', 'deep']
+        else:
+          raise ValueError(
+              'invalid tower_name[%s] for deepfm model, '
+              'only[label, deep, wide, wide_and_deep are supported]' %
+              tower_name)
+        for tower_name in tower_names:
+          if tower_name in self._tower_dicts:
+            self._tower_dicts[tower_name].append(field)
+          else:
+            self._tower_dicts[tower_name] = [field]
+      else:
+        if tower_name in self._tower_dicts:
+          self._tower_dicts[tower_name].append(field)
+        else:
+          self._tower_dicts[tower_name] = [field]
+  def _is_str(self, v):
+    if isinstance(v, str):
+      return True
+    try:
+      if isinstance(v, unicode):  # noqa: F821
+        return True
+    except NameError:
+      return False
+    return False
+  def _parse_features(self):
+    df = pd.read_excel(self._excel_path, sheet_name='features')
+    for i, row in df.iterrows():
+      field = {}
+      name = field['name'] = row['name'].strip()
+      self._feature_names.append(name)
+      field['data_type'] = row['data_type'].strip()
+      field['type'] = row['type'].strip()
+      g = str(row['global']).strip()
+      if g and g != 'nan':
+        field['global'] = g
+      field['field_name'] = name
+      if row['type'].strip() == 'label':
+        self._label = name
+      if 'global' in field and field['global'] in self._dict_global:
+        # 如果是global 有值，就跳过
+        def _is_good(v):
+          return str(v) not in ['nan', '']
+        if _is_good(self._dict_global[field['global']]['default_value']):
+          field['default_value'] = self._dict_global[
+              field['global']]['default_value']
+        if _is_good(self._dict_global[field['global']]['hash_bucket_size']):
+          field['hash_bucket_size'] = self._dict_global[
+              field['global']]['hash_bucket_size']
+        if _is_good(self._dict_global[field['global']]['embedding_dim']):
+          field['embedding_dim'] = self._dict_global[
+              field['global']]['embedding_dim']
+        field['embedding_name'] = field['global']
+      for t in [
+          'type', 'global', 'hash_bucket_size', 'embedding_dim',
+          'default_value', 'weights', 'boundaries'
+      ]:
+        if t not in row:
+          continue
+        v = row[t]
+        if v not in ['', ' ', 'NaN', np.NaN, np.NAN, 'nan']:
+          if self._is_str(v):
+            field[t] = v.strip()
+          elif not math.isnan(v):
+            field[t] = int(v)
+        if t == 'default_value' and t not in field:
+          field[t] = ''
+          if field['type'] == 'dense':
+            field[t] = 0.0
+      if field['type'] == 'weights':
+        field['default_value'] = '1'
+      tower_name = row['group']
+      if name in self._dict_global:
+        field['type'] = 'category'
+        field['hash_bucket_size'] = self._dict_global[name]['hash_bucket_size']
+        field['embedding_dim'] = self._dict_global[name]['embedding_dim']
+        field['default_value'] = self._dict_global[name]['default_value']
+      if field['data_type'] == 'bigint':
+        field['default_value'] = 0
+      elif field['data_type'] == 'double':
+        field['default_value'] = 0.0
+      if field['type'] not in ['notneed', 'not_need', 'not_needed']:
+        tower_name = str(tower_name).strip()
+        self._add_to_tower(tower_name, field)
+      self._feature_details[name] = field
+    # check that tag features weights are one of the fields
+    for name, config in self._feature_details.items():
+      if config['type'] == 'tags':
+        if 'weights' in config and config[
+            'weights'] not in self._feature_details:
+          raise ValueError(config['weights'] + ' not in field names')
+  def _write_train_eval_config(self, fout):
+    fout.write('train_input_path: "%s"\n' % self._train_input_path)
+    fout.write('eval_input_path: "%s"\n' % self._eval_input_path)
+    fout.write("""
+    model_dir: "%s"
+    train_config {
+      log_step_count_steps: 200
+      # fine_tune_checkpoint: ""
+      optimizer_config: {
+        adam_optimizer: {
+          learning_rate: {
+            exponential_decay_learning_rate {
+              initial_learning_rate: 0.0001
+              decay_steps: 10000
+              decay_factor: 0.5
+              min_learning_rate: 0.0000001
+            }
+          }
+        }
+      }
+      num_steps: 2000
+      sync_replicas: true
+    }
+    eval_config {
+      metrics_set: {
+           auc {}
+      }
+    }""" % self._model_dir)
+  def _write_deepfm_config(self, fout):
+    # write model_config
+    fout.write('model_config:{\n')
+    fout.write('  model_class: "DeepFM"\n')
+    # write feature group configs
+    tower_names = list(self._tower_dicts.keys())
+    tower_names.sort()
+    for tower_name in tower_names:
+      fout.write('  feature_groups: {\n')
+      fout.write('    group_name: "%s"\n' % tower_name)
+      curr_feas = self._tower_dicts[tower_name]
+      for fea in curr_feas:
+        if fea['type'] == 'weights':
+          continue
+        fout.write('    feature_names: "%s"\n' % fea['name'])
+      fout.write('    wide_deep:%s\n' % tower_name.upper())
+      fout.write('  }\n')
+    # write deepfm configs
+    fout.write("""
+      deepfm {
+        dnn {
+          hidden_units: [128, 64, 32]
+        }
+        final_dnn {
+          hidden_units: [128, 64]
+        }
+        wide_output_dim: 16
+        l2_regularization: 1e-5
+      }
+      embedding_regularization: 1e-5
+    }
+    """)
+  def _write_multi_tower_config(self, fout):
+    # write model_config
+    fout.write('model_config:{\n')
+    fout.write('  model_class: "MultiTower"\n')
+    # write each tower features
+    tower_names = list(self._tower_dicts.keys())
+    tower_names.sort()
+    for tower_name in tower_names:
+      fout.write('  feature_groups: {\n')
+      fout.write('    group_name: "%s"\n' % tower_name)
+      curr_feas = self._tower_dicts[tower_name]
+      for fea in curr_feas:
+        if fea['type'] == 'weights':
+          continue
+        fout.write('    feature_names: "%s"\n' % fea['name'])
+      fout.write('    wide_deep:DEEP\n')
+      fout.write('  }\n')
+    # write each tower dnn configs
+    fout.write('multi_tower { \n')
+    for tower_name in tower_names:
+      fout.write("""
+      towers {
+        input: "%s"
+        dnn {
+          hidden_units: [256, 192, 128]
+        }
+      }""" % tower_name)
+    fout.write("""
+        final_dnn {
+          hidden_units: [192, 128, 64]
+        }
+        l2_regularization: 1e-5
+      }
+      embedding_regularization: 1e-5
+    }""")
+  def _write_data_config(self, fout):
+    fout.write('data_config {\n')
+    fout.write('  separator: "%s"\n' % self._column_separator)
+    for name in self._feature_names:
+      fout.write('  input_fields: {\n')
+      fout.write('    input_name: "%s"\n' % name)
+      fout.write('    input_type: %s\n' %
+                 self._get_type_name(self._feature_details[name]['data_type']))
+      if 'default_value' in self._feature_details[name]:
+        fout.write('    default_val:"%s"\n' %
+                   self._feature_details[name]['default_value'])
+      fout.write('  }\n')
+    fout.write('  label_fields: "%s"\n' % self._label)
+    fout.write("""
+      batch_size: 1024
+      prefetch_size: 32
+      input_type: CSVInput
+    }""")
+  def _write_feature_config(self, fout):
+    for name in self._feature_names:
+      feature = self._feature_details[name]
+      if feature['type'] in ['weights', 'notneed', 'label']:
+        continue
+      if name == self._label:
+        continue
+      fout.write('feature_configs: {\n')
+      fout.write('  input_names: "%s"\n' % name)
+      if feature['type'] == 'category':
+        fout.write('  feature_type: IdFeature\n')
+        fout.write('  embedding_dim: %d\n' % feature['embedding_dim'])
+        fout.write('  hash_bucket_size: %d\n' % feature['hash_bucket_size'])
+        if 'embedding_name' in feature:
+          fout.write('  embedding_name: "%s"\n' % feature['embedding_name'])
+      elif feature['type'] == 'dense':
+        fout.write('  feature_type: RawFeature\n')
+        if self._model_type == 'deepfm':
+          assert feature[
+              'boundaries'] != '', 'raw features must be discretized by specifying boundaries'
+        if 'boundaries' in feature and feature['boundaries'] != '':
+          fout.write('  boundaries: [%s]\n' %
+                     str(feature['boundaries']).strip())
+          fout.write('  embedding_dim: %d\n' % int(feature['embedding_dim']))
+      elif feature['type'] == 'tags':
+        if 'weights' in feature:
+          fout.write('  input_names: "%s"\n' % feature['weights'])
+        fout.write('  feature_type: TagFeature\n')
+        fout.write('  hash_bucket_size: %d\n' % feature['hash_bucket_size'])
+        fout.write('  embedding_dim: %d\n' % feature['embedding_dim'])
+        if 'embedding_name' in feature:
+          fout.write('  embedding_name: "%s"\n' % feature['embedding_name'])
+        fout.write('  separator: "%s"\n' % self._incol_separator)
+      elif feature['type'] == 'indexes':
+        fout.write('  feature_type: TagFeature\n')
+        assert 'hash_bucket_size' in feature
+        fout.write('  num_buckets: %d\n' % feature['hash_bucket_size'])
+        if 'embedding_dim' in feature:
+          fout.write('  embedding_dim: %d\n' % feature['embedding_dim'])
+        if 'embedding_name' in feature:
+          fout.write('  embedding_name: "%s"\n' % feature['embedding_name'])
+        fout.write('  separator: "%s"\n' % self._incol_separator)
+      else:
+        assert False, 'invalid feature types: %s' % feature['type']
+      fout.write('}\n')
+  def convert(self):
+    self._parse_features()
+    logging.info(
+        'TOWERS[%d]: %s' %
+        (len(self._tower_dicts), ','.join(list(self._tower_dicts.keys()))))
+    with open(self._output_path, 'w') as fout:
+      self._write_train_eval_config(fout)
+      self._write_data_config(fout)
+      self._write_feature_config(fout)
+      if self._model_type == 'deepfm':
+        self._write_deepfm_config(fout)
+      elif self._model_type == 'multi_tower':
+        self._write_multi_tower_config(fout)
+      else:
+        logging.warning(
+            'the model_config could not be generated automatically, you have to write the model_config manually.'
+        )
+    # reformat the config
+    pipeline_config = config_util.get_configs_from_pipeline_file(
+        self._output_path)
+    config_util.save_message(pipeline_config, self._output_path)
+model_types = ['deepfm', 'multi_tower']
+if __name__ == '__main__':
+  import argparse
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      '--model_type',
+      type=str,
+      choices=model_types,
+      help='model type, currently support: %s' % ','.join(model_types))
+  parser.add_argument('--excel_path', type=str, help='excel config path')
+  parser.add_argument('--output_path', type=str, help='generated config path')
+  parser.add_argument(
+      '--column_separator',
+      type=str,
+      default=',',
+      help='column separator, separator betwen features')
+  parser.add_argument(
+      '--incol_separator',
+      type=str,
+      default='|',
+      help='separator within features, such as tag features')
+  parser.add_argument(
+      '--train_input_path', type=str, default='', help='train input path')
+  parser.add_argument(
+      '--eval_input_path', type=str, default='', help='eval input path')
+  parser.add_argument('--model_dir', type=str, default='', help='model dir')
+  args = parser.parse_args()
+  if not args.excel_path or not args.output_path:
+    parser.print_usage()
+    sys.exit(1)
+  logging.info('column_separator = %s in_column_separator = %s' %
+               (args.column_separator, args.incol_separator))
+  converter = ModelConfigConverter(args.excel_path, args.output_path,
+                                   args.model_type, args.column_separator,
+                                   args.incol_separator, args.train_input_path,
+                                   args.eval_input_path, args.model_dir)
+  converter.convert()
+  logging.info('Conversion done')
+  logging.info('Tips:')
+  if args.train_input_path == '' or args.eval_input_path == '':
+    logging.info('*.you have to update train_input_path,  eval_input_path')
+  logging.info('*.you may need to adjust dnn config or final_dnn config')

easy_rec/python/tools/criteo/__init__.py ADDED Viewed

File without changes

easy_rec/python/tools/criteo/convert_data.py ADDED Viewed

@@ -0,0 +1,157 @@
+# -*- encoding:utf-8 -*-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import argparse
+import gzip
+import logging
+import multiprocessing
+import os
+import traceback
+import numpy as np
+import pandas as pd
+import six
+from tensorflow.python.platform import gfile
+logging.basicConfig(
+    level=logging.INFO, format='[%(asctime)s][%(levelname)s] %(message)s')
+def save_np_bin(labels, dense_arr, cate_arr, prefix):
+  with gfile.GFile(prefix + '_label.bin', 'wb') as fout:
+    fout.write(np.array(labels, dtype=np.int32).tobytes())
+  with gfile.GFile(prefix + '_dense.bin', 'wb') as fout:
+    fout.write(np.array(dense_arr, dtype=np.float32).tobytes())
+  with gfile.GFile(prefix + '_category.bin', 'wb') as fout:
+    fout.write(np.array(cate_arr, dtype=np.float32).tobytes())
+def save_parquet(labels, dense_arr, cate_arr, prefix):
+  df = {'is_click': labels}
+  for i in range(1, 14):
+    df['f' + str(i)] = dense_arr[:, i - 1]
+  for i in range(1, 27):
+    df['c' + str(i)] = cate_arr[:, i - 1]
+  df = pd.DataFrame(df)
+  save_path = prefix + '.parquet'
+  logging.info('save to %s' % save_path)
+  df.to_parquet(save_path)
+def convert(input_path, prefix, part_record_num, save_format):
+  logging.info('start to convert %s, part_record_num=%d, save_format=%s' %
+               (input_path, part_record_num, save_format))
+  save_func = save_np_bin
+  if save_format == 'parquet':
+    save_func = save_parquet
+  batch_size = part_record_num
+  labels = np.zeros([batch_size], dtype=np.int32)
+  dense_arr = np.zeros([batch_size, 13], dtype=np.float32)
+  cate_arr = np.zeros([batch_size, 26], dtype=np.uint32)
+  part_id = 0
+  total_line = 0
+  try:
+    sid = 0
+    with gfile.GFile(input_path, 'rb') as gz_fin:
+      for line_str in gzip.GzipFile(fileobj=gz_fin, mode='rb'):
+        if six.PY3:
+          line_str = str(line_str, 'utf-8')
+        line_str = line_str.strip()
+        line_toks = line_str.split('\t')
+        labels[sid] = int(line_toks[0])
+        for j in range(1, 14):
+          x = line_toks[j]
+          dense_arr[sid, j - 1] = float(x) if x != '' else 0.0
+        for j in range(14, 40):
+          x = line_toks[j]
+          cate_arr[sid, j - 14] = int(x, 16) if x != '' else 0
+        sid += 1
+        if sid == batch_size:
+          save_func(labels, dense_arr, cate_arr, prefix + '_' + str(part_id))
+          logging.info('\t%s write part: %d' % (input_path, part_id))
+          part_id += 1
+          total_line += sid
+          sid = 0
+    if sid > 0:
+      save_func(labels[:sid], dense_arr[:sid], cate_arr[:sid],
+                prefix + '_' + str(part_id))
+      logging.info('\t%s write final part: %d' % (input_path, part_id))
+      part_id += 1
+      total_line += sid
+  except Exception as ex:
+    logging.error('convert %s failed: %s' % (input_path, str(ex)))
+    logging.error(traceback.format_exc())
+    return
+  logging.info('done convert %s, total_line=%d, part_num=%d' %
+               (input_path, total_line, part_id))
+if __name__ == '__main__':
+  """Convert criteo 1T data to binary format.
+  The outputs are stored in multiple parts, each with at most part_record_num samples.
+  Each part consists of 3 files:
+      xxx_yyy_label.bin,
+      xxx_yyy_dense.bin,
+      xxx_yyy_category.bin,
+  xxx is in range [0-23], range of yyy is determined by part_record_num,
+  If part_record_num is set to the default value 8M, there will be 535 parts. We convert
+  the data on machine with 64GB memory, if you memory is limited, you can convert the .gz
+  files one by one, or you can set a small part_record_num.
+  """
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      '--input_dir', type=str, default=None, help='criteo 1t data dir')
+  parser.add_argument(
+      '--save_dir',
+      type=str,
+      default=None,
+      help='criteo binary data output dir ')
+  parser.add_argument(
+      '--save_format',
+      type=str,
+      default='npy',
+      help='save format, choices: npy|parquet')
+  parser.add_argument(
+      '--part_record_num',
+      type=int,
+      default=1024 * 1024 * 8,
+      help='the maximal number of samples in each binary file')
+  parser.add_argument(
+      '--dt',
+      nargs='*',
+      type=int,
+      help='select days to convert, default to select all: 0-23')
+  args = parser.parse_args()
+  assert args.input_dir, 'input_dir is not set'
+  assert args.save_dir, 'save_dir is not set'
+  save_dir = args.save_dir
+  if not save_dir.endswith('/'):
+    save_dir = save_dir + '/'
+  if not gfile.IsDirectory(save_dir):
+    gfile.MakeDirs(save_dir)
+  if args.dt is None or len(args.dt) == 0:
+    days = list(range(0, 24))
+  else:
+    days = list(args.dt)
+  proc_arr = []
+  for d in days:
+    input_path = os.path.join(args.input_dir, 'day_%d.gz' % d)
+    prefix = os.path.join(args.save_dir, str(d))
+    proc = multiprocessing.Process(
+        target=convert,
+        args=(input_path, prefix, args.part_record_num, args.save_format))
+    convert(input_path, prefix, args.part_record_num, args.save_format)
+    proc.start()
+    proc_arr.append(proc)
+  for proc in proc_arr:
+    proc.join()