reproto 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- .git/COMMIT_EDITMSG +1 -1
- .git/index +0 -0
- .git/logs/HEAD +3 -0
- .git/logs/refs/heads/iyue +3 -0
- .git/logs/refs/remotes/gitlab/iyue +3 -0
- .git/logs/refs/remotes/origin/iyue +3 -0
- .git/objects/15/eb3f02479e633439ec83c143e703f8448043a1 +0 -0
- .git/objects/20/cf56ec106bcd66420dd000279f983571b918b6 +0 -0
- .git/objects/21/55b64d52922c88527c102d62f23e5c2abbae79 +0 -0
- .git/objects/26/1f67f3b731b32f6d77de9dd7be2d61e2a14ace +0 -0
- .git/objects/2e/2c1c42f5ac5d665cc672d3792078b756d9ab0e +0 -0
- .git/objects/33/52dfa8f5d9eb46cc98ea7ccecf02e4d9df95f7 +0 -0
- .git/objects/35/8bace20b731ff1bbb256d2a0158dfc84720978 +0 -0
- .git/objects/3c/6f0120229cc2cd8123efbeb7f186eb0a485f29 +0 -0
- .git/objects/4d/6d457bfabc4af842e5ddc2d56eb059d5dfdc9d +0 -0
- .git/objects/55/6723fdd4f525eed41c52fa80defca3f0c81c47 +0 -0
- .git/objects/65/a4f0ada7519f8b1e6a7c7e287541b8effde9fd +0 -0
- .git/objects/76/311aa8e59d780763e0d66787067cc5d9613a67 +0 -0
- .git/objects/8c/809c42c7ae13007fd885ee7bcffae7acf2c520 +0 -0
- .git/objects/8d/44142ae2d6dbb59d4ebed8587bccd051e5766b +0 -0
- .git/objects/8d/4a5767bef0c342f1660526f9671c0944922c40 +0 -0
- .git/objects/95/295a15779ebefd563ec777c3d3cced7e8d0209 +0 -0
- .git/objects/97/56fe0931216a7c40cbf250e1ab8a6dfd589f13 +0 -0
- .git/objects/9a/e313cdf64cd82416c1238eb493e6396f799f12 +0 -0
- .git/objects/cd/2d6c229438c6b1c694b9392a85888d89ef49c1 +0 -0
- .git/objects/db/beedb30613f79ae3ff67df1428cf8ade223711 +0 -0
- .git/objects/e8/1433b6ad92206cdadbee1f474b4f99383314cb +0 -0
- .git/objects/e9/a15996cb55ac72aeb6611d26e8d22246589943 +0 -0
- .git/objects/f7/25a430eb3364460ba854dbc8809edc21dc6c70 +0 -0
- .git/objects/fc/e15b9dbffd9f37b1f2d46944ee2d0394df6565 +2 -0
- .git/refs/heads/iyue +1 -1
- .git/refs/remotes/gitlab/iyue +1 -1
- .git/refs/remotes/origin/iyue +1 -1
- README.md +36 -116
- core/info_decoder.py +512 -105
- core/reconstructor.py +645 -84
- generation/proto_generator.py +38 -12
- main.py +36 -5
- parsing/java_parser.py +81 -1
- pyproject.toml +13 -2
- {reproto-0.0.6.dist-info → reproto-0.0.8.dist-info}/METADATA +46 -119
- {reproto-0.0.6.dist-info → reproto-0.0.8.dist-info}/RECORD +46 -20
- utils/file_cache.py +165 -0
- utils/type_index.py +341 -0
- {reproto-0.0.6.dist-info → reproto-0.0.8.dist-info}/WHEEL +0 -0
- {reproto-0.0.6.dist-info → reproto-0.0.8.dist-info}/entry_points.txt +0 -0
core/info_decoder.py
CHANGED
@@ -29,12 +29,17 @@ class InfoDecoder:
|
|
29
29
|
|
30
30
|
def __init__(self, java_source_analyzer=None):
|
31
31
|
"""
|
32
|
-
|
32
|
+
初始化信息解码器
|
33
33
|
|
34
34
|
Args:
|
35
|
-
java_source_analyzer: Java
|
35
|
+
java_source_analyzer: Java源码分析器实例(可选)
|
36
36
|
"""
|
37
37
|
self.logger = get_logger("info_decoder")
|
38
|
+
self.java_source_analyzer = java_source_analyzer
|
39
|
+
|
40
|
+
# 导入JavaParser
|
41
|
+
from parsing.java_parser import JavaParser
|
42
|
+
self.java_parser = JavaParser()
|
38
43
|
|
39
44
|
# Protobuf字段类型映射表
|
40
45
|
# 键:字节码中的类型值,值:对应的protobuf字段类型
|
@@ -44,51 +49,62 @@ class InfoDecoder:
|
|
44
49
|
2: 'int64', # INT64
|
45
50
|
3: 'int32', # INT32
|
46
51
|
4: 'int32', # INT32 (修正:4对应int32,不是bool)
|
52
|
+
5: 'int64', # INT64 - 基于Models$Onboarded.userId_和phoneNumber_的分析
|
53
|
+
6: 'int32', # INT32 - 基于Assistant$Payload.action_的分析
|
47
54
|
7: 'bool', # BOOL (修正:7对应bool)
|
48
55
|
9: 'message', # MESSAGE (嵌套消息)
|
49
56
|
12: 'enum', # ENUM (枚举类型)
|
50
|
-
27: '
|
51
|
-
39: '
|
52
|
-
44: '
|
57
|
+
27: 'repeated_message', # REPEATED MESSAGE (修正:27表示repeated message)
|
58
|
+
39: 'repeated_int32', # REPEATED INT32 (packed)
|
59
|
+
44: 'repeated_enum', # PACKED ENUM (修正:44表示repeated enum)
|
53
60
|
50: 'map', # Map字段 - 基于BulkSearchResult.contacts的分析
|
61
|
+
92: 'string', # STRING - 基于Assistant$Payload.title_的分析
|
54
62
|
520: 'string', # UTF-8字符串
|
55
|
-
538: '
|
63
|
+
538: 'repeated_string', # REPEATED STRING (Ț = 538)
|
64
|
+
4100: 'int32', # INT32 - 基于Assistant$Payload.action_的分析
|
65
|
+
4108: 'enum', # ENUM - 基于Assistant$Payload.payloadType_的分析
|
66
|
+
4616: 'string', # STRING - 基于Assistant$Payload.summary_的分析
|
56
67
|
}
|
57
68
|
|
58
|
-
#
|
59
|
-
self.
|
60
|
-
|
61
|
-
# 统计未知类型(用于持续改进)
|
62
|
-
self.unknown_types_stats = {} # {byte_code: count}
|
69
|
+
# 统计未知字节码类型
|
70
|
+
self.unknown_types_stats = {}
|
63
71
|
|
64
|
-
def decode_message_info(self, class_name: str, info_string: str, objects: List[str]) -> Optional[MessageDefinition]:
|
72
|
+
def decode_message_info(self, class_name: str, info_string: str, objects: List[str], java_file_path=None) -> Optional[MessageDefinition]:
|
65
73
|
"""
|
66
|
-
|
74
|
+
解码Protobuf消息信息
|
67
75
|
|
68
76
|
Args:
|
69
77
|
class_name: 完整的Java类名
|
70
78
|
info_string: newMessageInfo中的字节码字符串
|
71
79
|
objects: newMessageInfo中的对象数组
|
80
|
+
java_file_path: Java文件路径(用于提取字段标签)
|
72
81
|
|
73
82
|
Returns:
|
74
83
|
MessageDefinition对象 或 None(如果解码失败)
|
75
84
|
"""
|
76
85
|
try:
|
77
|
-
#
|
86
|
+
# 解码字节码字符串
|
78
87
|
bytes_data = self._decode_info_string(info_string)
|
79
|
-
if
|
88
|
+
if bytes_data is None:
|
80
89
|
return None
|
81
90
|
|
82
|
-
#
|
91
|
+
# 创建消息定义
|
83
92
|
message_def = self._create_message_definition(class_name)
|
84
93
|
|
85
|
-
#
|
86
|
-
|
94
|
+
# 提取字段标签(如果有Java文件路径)
|
95
|
+
field_tags = None
|
96
|
+
if java_file_path:
|
97
|
+
field_tags = self.java_parser.extract_field_tags(java_file_path)
|
98
|
+
if field_tags:
|
99
|
+
self.logger.info(f" 🏷️ 从Java源码提取到 {len(field_tags)} 个字段标签")
|
100
|
+
|
101
|
+
# 解析字段信息
|
102
|
+
self._parse_fields(message_def, bytes_data, objects, field_tags)
|
87
103
|
|
88
104
|
return message_def
|
89
105
|
|
90
106
|
except Exception as e:
|
91
|
-
self.logger.error(f"❌
|
107
|
+
self.logger.error(f"❌ 解码消息信息失败: {e}")
|
92
108
|
return None
|
93
109
|
|
94
110
|
def _decode_info_string(self, info_string: str) -> Optional[List[int]]:
|
@@ -102,12 +118,27 @@ class InfoDecoder:
|
|
102
118
|
字节数组 或 None(如果解码失败)
|
103
119
|
"""
|
104
120
|
try:
|
105
|
-
#
|
106
|
-
|
107
|
-
|
121
|
+
# 首先解码Unicode转义序列(如\u0000)但保持Unicode字符的原始值
|
122
|
+
# 使用raw_unicode_escape来避免将Unicode字符编码为UTF-8
|
123
|
+
decoded_string = info_string.encode('raw_unicode_escape').decode('raw_unicode_escape')
|
124
|
+
return [ord(c) for c in decoded_string]
|
108
125
|
except Exception as e:
|
109
|
-
|
110
|
-
|
126
|
+
try:
|
127
|
+
# 如果包含转义序列,手动处理
|
128
|
+
import re
|
129
|
+
def replace_unicode_escape(match):
|
130
|
+
return chr(int(match.group(1), 16))
|
131
|
+
|
132
|
+
# 替换\uXXXX格式的转义序列
|
133
|
+
decoded_string = re.sub(r'\\u([0-9a-fA-F]{4})', replace_unicode_escape, info_string)
|
134
|
+
return [ord(c) for c in decoded_string]
|
135
|
+
except Exception as e2:
|
136
|
+
try:
|
137
|
+
# 最后的备用方法:直接使用ord值
|
138
|
+
return [ord(c) for c in info_string]
|
139
|
+
except Exception as e3:
|
140
|
+
self.logger.error(f"❌ 解码字节码字符串失败: {e}, 方法2: {e2}, 方法3: {e3}")
|
141
|
+
return None
|
111
142
|
|
112
143
|
def _create_message_definition(self, class_name: str) -> MessageDefinition:
|
113
144
|
"""
|
@@ -130,7 +161,7 @@ class InfoDecoder:
|
|
130
161
|
full_name=class_name
|
131
162
|
)
|
132
163
|
|
133
|
-
def _parse_fields(self, message_def: MessageDefinition, bytes_data: List[int], objects: List[str]) -> None:
|
164
|
+
def _parse_fields(self, message_def: MessageDefinition, bytes_data: List[int], objects: List[str], field_tags: Optional[dict] = None) -> None:
|
134
165
|
"""
|
135
166
|
解析字段信息的主调度方法
|
136
167
|
|
@@ -138,6 +169,7 @@ class InfoDecoder:
|
|
138
169
|
message_def: 消息定义对象
|
139
170
|
bytes_data: 解码后的字节数组
|
140
171
|
objects: 对象数组
|
172
|
+
field_tags: 字段标签映射 {field_name: tag}
|
141
173
|
"""
|
142
174
|
try:
|
143
175
|
# 检查是否包含oneof字段(通过查找'<'字符,ord=60)
|
@@ -146,12 +178,12 @@ class InfoDecoder:
|
|
146
178
|
if oneof_positions:
|
147
179
|
self._parse_oneof_fields(message_def, bytes_data, objects, oneof_positions)
|
148
180
|
else:
|
149
|
-
self._parse_regular_fields(message_def, bytes_data, objects)
|
181
|
+
self._parse_regular_fields(message_def, bytes_data, objects, field_tags)
|
150
182
|
|
151
183
|
except Exception as e:
|
152
184
|
self.logger.error(f"❌ 解析字段失败: {e}")
|
153
185
|
|
154
|
-
def _parse_regular_fields(self, message_def: MessageDefinition, bytes_data: List[int], objects: List[str]) -> None:
|
186
|
+
def _parse_regular_fields(self, message_def: MessageDefinition, bytes_data: List[int], objects: List[str], field_tags: Optional[dict] = None) -> None:
|
155
187
|
"""
|
156
188
|
解析常规字段(非oneof字段)
|
157
189
|
|
@@ -159,16 +191,267 @@ class InfoDecoder:
|
|
159
191
|
message_def: 消息定义对象
|
160
192
|
bytes_data: 字节码数据
|
161
193
|
objects: 对象数组
|
194
|
+
field_tags: 字段标签映射 {field_name: tag}
|
162
195
|
"""
|
163
196
|
# 跳过前10个字节的元数据
|
164
197
|
field_start = 10
|
165
198
|
object_index = 0
|
166
199
|
|
200
|
+
self.logger.info(f" 📊 开始解析字段,字节码长度: {len(bytes_data)}, objects数组长度: {len(objects)}")
|
201
|
+
self.logger.info(f" 📊 完整字节码数据: {[f'{b:02x}' for b in bytes_data]}")
|
202
|
+
self.logger.info(f" 📊 Objects数组: {objects}")
|
203
|
+
|
204
|
+
# 如果有字段标签,优先使用Java源码信息
|
205
|
+
if field_tags:
|
206
|
+
self.logger.info(f" 🏷️ 使用Java源码字段标签: {field_tags}")
|
207
|
+
self._parse_fields_with_java_tags(message_def, bytes_data, objects, field_tags)
|
208
|
+
else:
|
209
|
+
# 回退到字节码解析
|
210
|
+
self.logger.info(f" 🔍 回退到字节码解析")
|
211
|
+
self._parse_fields_from_bytecode(message_def, bytes_data, objects, field_start)
|
212
|
+
|
213
|
+
self.logger.info(f" 📊 字段解析完成,共解析 {len(message_def.fields)} 个字段")
|
214
|
+
|
215
|
+
def _parse_fields_with_java_tags(self, message_def: MessageDefinition, bytes_data: List[int], objects: List[str], field_tags: dict) -> None:
|
216
|
+
"""
|
217
|
+
使用Java源码提取的字段标签解析字段
|
218
|
+
|
219
|
+
Args:
|
220
|
+
message_def: 消息定义对象
|
221
|
+
bytes_data: 字节码数据
|
222
|
+
objects: 对象数组
|
223
|
+
field_tags: Java源码提取的字段标签映射
|
224
|
+
"""
|
225
|
+
for field_name_raw, field_tag in field_tags.items():
|
226
|
+
# 清理字段名
|
227
|
+
field_name = self._clean_field_name(field_name_raw)
|
228
|
+
|
229
|
+
# 从Java源码获取字段类型
|
230
|
+
# 首先尝试作为枚举类型获取
|
231
|
+
java_type = self._get_real_field_type_from_source(field_name_raw, 'enum')
|
232
|
+
if not java_type:
|
233
|
+
# 如果枚举类型获取失败,再尝试作为消息类型获取
|
234
|
+
java_type = self._get_real_field_type_from_source(field_name_raw, 'message')
|
235
|
+
if java_type:
|
236
|
+
# 使用Java源码类型,直接处理原始Java类型
|
237
|
+
if java_type.startswith('Internal.ProtobufList<') and java_type.endswith('>'):
|
238
|
+
# Internal.ProtobufList<Contact> -> Contact (repeated)
|
239
|
+
element_type = java_type[len('Internal.ProtobufList<'):-1]
|
240
|
+
field_type_name = self._convert_java_to_proto_type(element_type)
|
241
|
+
rule = 'repeated'
|
242
|
+
elif java_type.startswith('MapFieldLite<') and java_type.endswith('>'):
|
243
|
+
# MapFieldLite<String, Contact> -> map<string, Contact>
|
244
|
+
field_type_name = self._convert_java_to_proto_type(java_type)
|
245
|
+
rule = 'optional'
|
246
|
+
elif java_type == 'Internal.IntList':
|
247
|
+
# Internal.IntList -> 需要从setter方法获取真正的枚举类型
|
248
|
+
if self.java_source_analyzer:
|
249
|
+
enum_type = self.java_source_analyzer._get_enum_type_from_list_setter(field_name_raw.rstrip('_'))
|
250
|
+
if enum_type:
|
251
|
+
# 获取到枚举类型,转换为简单类名
|
252
|
+
field_type_name = self._convert_java_to_proto_type(enum_type)
|
253
|
+
rule = 'repeated'
|
254
|
+
else:
|
255
|
+
# 如果获取不到,回退到默认处理
|
256
|
+
field_type_name = 'int32'
|
257
|
+
rule = 'repeated'
|
258
|
+
else:
|
259
|
+
field_type_name = 'int32'
|
260
|
+
rule = 'repeated'
|
261
|
+
else:
|
262
|
+
# 普通类型 - 但需要检查是否为枚举类型
|
263
|
+
if java_type in ['int', 'long', 'short', 'byte'] and self.java_source_analyzer:
|
264
|
+
# 对于基础整数类型,检查是否有对应的枚举setter方法
|
265
|
+
enum_type = self.java_source_analyzer._get_type_from_setter(field_name_raw.rstrip('_'))
|
266
|
+
if enum_type:
|
267
|
+
# 找到枚举setter,使用枚举类型
|
268
|
+
field_type_name = self._convert_java_to_proto_type(enum_type)
|
269
|
+
rule = 'optional'
|
270
|
+
else:
|
271
|
+
# 没有枚举setter,使用基础类型
|
272
|
+
field_type_name = self._convert_java_to_proto_type(java_type)
|
273
|
+
rule = 'optional'
|
274
|
+
else:
|
275
|
+
# 非基础整数类型,正常处理
|
276
|
+
field_type_name = self._convert_java_to_proto_type(java_type)
|
277
|
+
# 判断是否为repeated类型
|
278
|
+
if (java_type.startswith('Internal.ProtobufList<') or
|
279
|
+
java_type.startswith('List<') or
|
280
|
+
java_type.startswith('ArrayList<')):
|
281
|
+
rule = 'repeated'
|
282
|
+
else:
|
283
|
+
rule = 'optional'
|
284
|
+
|
285
|
+
self.logger.info(f" 🔍 从Java源码获取类型: {field_name_raw} -> {java_type} -> {field_type_name} (rule: {rule})")
|
286
|
+
else:
|
287
|
+
# 使用默认类型
|
288
|
+
field_type_name = 'string'
|
289
|
+
rule = 'optional'
|
290
|
+
self.logger.info(f" 🔍 使用默认类型: {field_name_raw} -> {field_type_name}")
|
291
|
+
|
292
|
+
# 记录字段信息
|
293
|
+
self.logger.info(f" 📝 字段信息: name={field_name}, type={field_type_name}, tag={field_tag}")
|
294
|
+
|
295
|
+
# 特殊情况处理:根据字段名修正类型
|
296
|
+
field_type_name = self._refine_field_type(field_name, field_type_name, 0) # 使用0作为占位符
|
297
|
+
|
298
|
+
# 确定字段规则(基于Java类型判断是否为repeated)
|
299
|
+
# 已经在上面确定了rule,这里不需要重复处理
|
300
|
+
|
301
|
+
# 创建字段定义
|
302
|
+
field_def = FieldDefinition(
|
303
|
+
name=field_name,
|
304
|
+
type_name=field_type_name,
|
305
|
+
tag=field_tag,
|
306
|
+
rule=rule
|
307
|
+
)
|
308
|
+
|
309
|
+
message_def.fields.append(field_def)
|
310
|
+
self.logger.info(f" ✅ 添加字段: {field_name} = {field_tag} ({rule} {field_type_name})")
|
311
|
+
|
312
|
+
def _determine_field_rule(self, field_type_byte: int, field_type_name: str = None, java_type: str = None) -> str:
|
313
|
+
"""
|
314
|
+
根据字节码、字段类型和Java类型确定字段规则
|
315
|
+
|
316
|
+
Args:
|
317
|
+
field_type_byte: 字段类型字节
|
318
|
+
field_type_name: 字段类型名(可选)
|
319
|
+
java_type: Java源码中的类型(可选)
|
320
|
+
|
321
|
+
Returns:
|
322
|
+
字段规则:'optional' 或 'repeated'
|
323
|
+
"""
|
324
|
+
# map类型永远不使用repeated规则,因为map本身就表示键值对集合
|
325
|
+
if field_type_name and field_type_name.startswith('map<'):
|
326
|
+
return 'optional'
|
327
|
+
|
328
|
+
# 检查Java源码类型是否为集合类型
|
329
|
+
if java_type:
|
330
|
+
if (java_type.startswith('Internal.ProtobufList<') or
|
331
|
+
java_type.startswith('List<') or
|
332
|
+
java_type.startswith('ArrayList<') or
|
333
|
+
java_type.startswith('java.util.List<')):
|
334
|
+
return 'repeated'
|
335
|
+
|
336
|
+
# 检查字段类型名是否包含repeated标识
|
337
|
+
if field_type_name and field_type_name.startswith('repeated_'):
|
338
|
+
return 'repeated'
|
339
|
+
|
340
|
+
# repeated类型的字节码
|
341
|
+
repeated_types = {27, 39, 44, 538} # repeated_message, repeated_int32, repeated_enum, repeated_string
|
342
|
+
return 'repeated' if field_type_byte in repeated_types else 'optional'
|
343
|
+
|
344
|
+
def _infer_field_type_from_bytecode(self, field_name_raw: str, field_type: str) -> str:
|
345
|
+
"""
|
346
|
+
从Java源码推断字段类型
|
347
|
+
|
348
|
+
Args:
|
349
|
+
field_name_raw: 原始字段名(带下划线)
|
350
|
+
field_type: 字节码推断的字段类型
|
351
|
+
|
352
|
+
Returns:
|
353
|
+
推断的字段类型
|
354
|
+
"""
|
355
|
+
# 首先尝试从Java源码获取真实类型
|
356
|
+
real_type = self._get_real_field_type_from_source(field_name_raw)
|
357
|
+
if real_type:
|
358
|
+
self.logger.info(f" 🔍 从Java源码获取类型: {field_name_raw} -> {real_type} -> {self._convert_java_to_proto_type(real_type)}")
|
359
|
+
return self._convert_java_to_proto_type(real_type)
|
360
|
+
|
361
|
+
# 如果源码分析失败,使用字节码类型
|
362
|
+
self.logger.info(f" 🔍 使用字节码类型: {field_name_raw} -> {field_type}")
|
363
|
+
return field_type
|
364
|
+
|
365
|
+
def _convert_java_to_proto_type(self, java_type: str) -> str:
|
366
|
+
"""
|
367
|
+
将Java类型转换为Protobuf类型
|
368
|
+
|
369
|
+
Args:
|
370
|
+
java_type: Java类型字符串
|
371
|
+
|
372
|
+
Returns:
|
373
|
+
转换后的Protobuf类型
|
374
|
+
"""
|
375
|
+
if not java_type:
|
376
|
+
return 'string'
|
377
|
+
|
378
|
+
# 处理Internal.ProtobufList<T>类型
|
379
|
+
if java_type.startswith('Internal.ProtobufList<') and java_type.endswith('>'):
|
380
|
+
element_type = java_type[len('Internal.ProtobufList<'):-1]
|
381
|
+
# 递归处理元素类型
|
382
|
+
return self._convert_java_to_proto_type(element_type)
|
383
|
+
|
384
|
+
# 处理MapFieldLite<K, V>类型,返回map<k, v>格式
|
385
|
+
if java_type.startswith('MapFieldLite<') and java_type.endswith('>'):
|
386
|
+
inner_types = java_type[len('MapFieldLite<'):-1]
|
387
|
+
# 解析键值类型
|
388
|
+
parts = self._parse_generic_types(inner_types)
|
389
|
+
if len(parts) == 2:
|
390
|
+
key_type = self._convert_java_to_proto_type(parts[0].strip())
|
391
|
+
value_type = self._convert_java_to_proto_type(parts[1].strip())
|
392
|
+
return f"map<{key_type}, {value_type}>"
|
393
|
+
|
394
|
+
# 处理List<T>类型
|
395
|
+
if java_type.startswith('List<') and java_type.endswith('>'):
|
396
|
+
element_type = java_type[len('List<'):-1]
|
397
|
+
return self._convert_java_to_proto_type(element_type)
|
398
|
+
|
399
|
+
# 处理Internal.IntList类型(通常对应枚举列表)
|
400
|
+
if java_type == 'Internal.IntList':
|
401
|
+
# 这种情况需要从上下文获取真正的枚举类型
|
402
|
+
# 返回特殊标记,让调用方进行进一步处理
|
403
|
+
return 'Internal.IntList'
|
404
|
+
|
405
|
+
# 基础类型映射
|
406
|
+
basic_types = {
|
407
|
+
'int': 'int32',
|
408
|
+
'long': 'int64',
|
409
|
+
'float': 'float',
|
410
|
+
'double': 'double',
|
411
|
+
'boolean': 'bool',
|
412
|
+
'String': 'string',
|
413
|
+
'java.lang.String': 'string',
|
414
|
+
'java.lang.Integer': 'int32',
|
415
|
+
'java.lang.Long': 'int64',
|
416
|
+
'java.lang.Float': 'float',
|
417
|
+
'java.lang.Double': 'double',
|
418
|
+
'java.lang.Boolean': 'bool',
|
419
|
+
'byte[]': 'bytes',
|
420
|
+
'ByteString': 'bytes',
|
421
|
+
'com.google.protobuf.ByteString': 'bytes',
|
422
|
+
}
|
423
|
+
|
424
|
+
# 检查是否为基础类型
|
425
|
+
if java_type in basic_types:
|
426
|
+
return basic_types[java_type]
|
427
|
+
|
428
|
+
# 如果是完整的类名,提取简单类名
|
429
|
+
if '.' in java_type:
|
430
|
+
simple_name = java_type.split('.')[-1]
|
431
|
+
return simple_name
|
432
|
+
|
433
|
+
# 默认返回原类型名
|
434
|
+
return java_type
|
435
|
+
|
436
|
+
def _parse_fields_from_bytecode(self, message_def: MessageDefinition, bytes_data: List[int], objects: List[str], field_start: int) -> None:
|
437
|
+
"""
|
438
|
+
从字节码解析字段(原有的解析逻辑)
|
439
|
+
|
440
|
+
Args:
|
441
|
+
message_def: 消息定义对象
|
442
|
+
bytes_data: 字节码数据
|
443
|
+
objects: 对象数组
|
444
|
+
field_start: 字段数据开始位置
|
445
|
+
"""
|
446
|
+
object_index = 0
|
447
|
+
|
167
448
|
# 每次处理2个字节:[字段标签, 字段类型]
|
168
449
|
for i in range(field_start, len(bytes_data) - 1, 2):
|
169
450
|
field_tag = bytes_data[i]
|
170
451
|
field_type_byte = bytes_data[i + 1]
|
171
452
|
|
453
|
+
self.logger.info(f" 🔍 处理字段 #{(i-field_start)//2 + 1}: tag={field_tag}, type_byte={field_type_byte} (0x{field_type_byte:02x})")
|
454
|
+
|
172
455
|
# 查找类型映射,对未知类型进行智能处理
|
173
456
|
if field_type_byte not in self.type_mapping:
|
174
457
|
# 统计未知类型
|
@@ -180,20 +463,24 @@ class InfoDecoder:
|
|
180
463
|
self.logger.info(f" 🔍 推断未知类型: {field_type_byte} -> {field_type}")
|
181
464
|
else:
|
182
465
|
field_type = self.type_mapping[field_type_byte]
|
466
|
+
self.logger.info(f" ✅ 已知类型: {field_type_byte} -> {field_type}")
|
183
467
|
|
184
468
|
# 从对象数组获取字段信息
|
185
469
|
field_info = self._extract_field_info(objects, object_index, field_type)
|
186
470
|
if not field_info:
|
471
|
+
self.logger.warning(f" ⚠️ 无法获取字段信息,跳过字段 tag={field_tag}")
|
187
472
|
continue
|
188
473
|
|
189
474
|
field_name, field_type_name, new_object_index = field_info
|
190
475
|
object_index = new_object_index
|
191
476
|
|
477
|
+
self.logger.info(f" 📝 字段信息: name={field_name}, type={field_type_name}, tag={field_tag}")
|
478
|
+
|
192
479
|
# 特殊情况处理:根据字段名修正类型
|
193
480
|
field_type_name = self._refine_field_type(field_name, field_type_name, field_type_byte)
|
194
481
|
|
195
482
|
# 确定字段规则
|
196
|
-
rule = self._determine_field_rule(field_type_byte)
|
483
|
+
rule = self._determine_field_rule(field_type_byte, field_type_name, None)
|
197
484
|
|
198
485
|
# 创建字段定义
|
199
486
|
field_def = FieldDefinition(
|
@@ -204,6 +491,7 @@ class InfoDecoder:
|
|
204
491
|
)
|
205
492
|
|
206
493
|
message_def.fields.append(field_def)
|
494
|
+
self.logger.info(f" ✅ 添加字段: {field_name} = {field_tag} ({field_type_name})")
|
207
495
|
|
208
496
|
def _extract_field_info(self, objects: List[str], object_index: int, field_type: str) -> Optional[tuple]:
|
209
497
|
"""
|
@@ -224,14 +512,27 @@ class InfoDecoder:
|
|
224
512
|
|
225
513
|
# 获取字段名
|
226
514
|
field_name_raw = objects[object_index]
|
515
|
+
|
516
|
+
# 跳过内部状态字段(protobuf内部使用的字段,不是实际的proto字段)
|
517
|
+
if self._is_internal_field(field_name_raw):
|
518
|
+
self.logger.info(f" ⏭️ 跳过内部字段: {field_name_raw}")
|
519
|
+
object_index += 1
|
520
|
+
# 递归调用获取下一个字段
|
521
|
+
return self._extract_field_info(objects, object_index, field_type)
|
522
|
+
|
227
523
|
field_name = self._to_snake_case(field_name_raw.rstrip('_'))
|
228
524
|
object_index += 1
|
229
525
|
|
230
526
|
# 确定字段类型名
|
231
527
|
field_type_name = field_type # 默认使用基础类型
|
232
528
|
|
529
|
+
# 处理repeated类型:repeated_message -> message,但保留repeated信息
|
530
|
+
if field_type.startswith('repeated_'):
|
531
|
+
base_field_type = field_type[9:] # 移除 'repeated_' 前缀
|
532
|
+
field_type_name = base_field_type
|
533
|
+
|
233
534
|
# 对于消息类型、枚举类型和map类型,检查objects数组中是否有具体的类型引用
|
234
|
-
if
|
535
|
+
if field_type_name in ['message', 'enum', 'map'] or field_type in ['repeated_message', 'repeated_enum']:
|
235
536
|
if object_index < len(objects):
|
236
537
|
next_obj = objects[object_index]
|
237
538
|
if self._is_type_reference(next_obj):
|
@@ -246,16 +547,16 @@ class InfoDecoder:
|
|
246
547
|
object_index += 1
|
247
548
|
else:
|
248
549
|
# 没有显式引用,优先从Java源码中获取真实类型
|
249
|
-
real_type = self._get_real_field_type_from_source(field_name_raw,
|
550
|
+
real_type = self._get_real_field_type_from_source(field_name_raw, field_type_name)
|
250
551
|
if real_type:
|
251
552
|
field_type_name = real_type
|
252
553
|
self.logger.info(f" 🔍 源码获取类型: {field_name} -> {field_type_name}")
|
253
554
|
else:
|
254
555
|
# 如果源码分析失败,才进行智能推断
|
255
|
-
if
|
556
|
+
if field_type_name == 'enum':
|
256
557
|
field_type_name = self._infer_enum_type_from_field_name(field_name_raw)
|
257
558
|
self.logger.info(f" 🔍 推断枚举类型: {field_name} -> {field_type_name}")
|
258
|
-
elif
|
559
|
+
elif field_type_name == 'message':
|
259
560
|
field_type_name = self._infer_message_type_from_field_name(field_name_raw)
|
260
561
|
self.logger.info(f" 🔍 推断消息类型: {field_name} -> {field_type_name}")
|
261
562
|
elif field_type == 'map':
|
@@ -263,16 +564,16 @@ class InfoDecoder:
|
|
263
564
|
self.logger.info(f" 🔍 推断map类型: {field_name} -> {field_type_name}")
|
264
565
|
else:
|
265
566
|
# objects数组已结束,优先从Java源码中获取真实类型
|
266
|
-
real_type = self._get_real_field_type_from_source(field_name_raw,
|
567
|
+
real_type = self._get_real_field_type_from_source(field_name_raw, field_type_name)
|
267
568
|
if real_type:
|
268
569
|
field_type_name = real_type
|
269
570
|
self.logger.info(f" 🔍 源码获取类型: {field_name} -> {field_type_name}")
|
270
571
|
else:
|
271
572
|
# 如果源码分析失败,才进行智能推断
|
272
|
-
if
|
573
|
+
if field_type_name == 'enum':
|
273
574
|
field_type_name = self._infer_enum_type_from_field_name(field_name_raw)
|
274
575
|
self.logger.info(f" 🔍 推断枚举类型: {field_name} -> {field_type_name}")
|
275
|
-
elif
|
576
|
+
elif field_type_name == 'message':
|
276
577
|
field_type_name = self._infer_message_type_from_field_name(field_name_raw)
|
277
578
|
self.logger.info(f" 🔍 推断消息类型: {field_name} -> {field_type_name}")
|
278
579
|
elif field_type == 'map':
|
@@ -281,31 +582,34 @@ class InfoDecoder:
|
|
281
582
|
|
282
583
|
return field_name, field_type_name, object_index
|
283
584
|
|
284
|
-
def _get_real_field_type_from_source(self, field_name_raw: str, expected_type: str) -> Optional[str]:
|
585
|
+
def _get_real_field_type_from_source(self, field_name_raw: str, expected_type: str = 'message') -> Optional[str]:
|
285
586
|
"""
|
286
|
-
从Java
|
587
|
+
从Java源码中获取字段的真实Java类型(原始类型,不转换)
|
287
588
|
|
288
589
|
Args:
|
289
|
-
field_name_raw: 原始字段名(如
|
290
|
-
expected_type: 期望的基础类型(message 或
|
590
|
+
field_name_raw: 原始字段名(如 contacts_)
|
591
|
+
expected_type: 期望的基础类型(message、enum 或 map)
|
291
592
|
|
292
593
|
Returns:
|
293
|
-
|
594
|
+
原始的Java类型名,如果无法获取则返回None
|
294
595
|
"""
|
295
596
|
if not self.java_source_analyzer:
|
296
597
|
return None
|
297
598
|
|
298
599
|
try:
|
299
|
-
# 调用Java
|
600
|
+
# 调用Java源码分析器获取真实Java类型(原始类型)
|
300
601
|
real_type = self.java_source_analyzer.get_field_type(field_name_raw, expected_type)
|
301
|
-
|
602
|
+
if real_type:
|
603
|
+
self.logger.info(f" 🔍 源码分析成功: {field_name_raw} -> {real_type}")
|
604
|
+
return real_type # 返回原始Java类型
|
605
|
+
return None
|
302
606
|
except Exception as e:
|
303
607
|
self.logger.warning(f" ⚠️ 源码分析失败: {e}")
|
304
608
|
return None
|
305
609
|
|
306
610
|
def _infer_message_type_from_field_name(self, field_name_raw: str) -> str:
|
307
611
|
"""
|
308
|
-
|
612
|
+
根据字段名智能推断消息类型名(通用算法)
|
309
613
|
|
310
614
|
Args:
|
311
615
|
field_name_raw: 原始字段名(如 businessProfile_)
|
@@ -313,41 +617,107 @@ class InfoDecoder:
|
|
313
617
|
Returns:
|
314
618
|
推断出的消息类型名
|
315
619
|
"""
|
620
|
+
# 优先从Java源码中获取真实类型
|
621
|
+
if self.java_source_analyzer:
|
622
|
+
real_type = self.java_source_analyzer.get_field_type(field_name_raw, 'message')
|
623
|
+
if real_type and real_type not in ['string', 'int32', 'int64', 'bool', 'float', 'double', 'bytes']:
|
624
|
+
return real_type
|
625
|
+
|
316
626
|
# 移除末尾的下划线
|
317
627
|
clean_name = field_name_raw.rstrip('_')
|
318
628
|
|
319
629
|
if not clean_name:
|
320
630
|
return 'UnknownMessage'
|
321
631
|
|
632
|
+
# 检查是否为基础字段类型
|
633
|
+
if self._is_likely_basic_field(clean_name):
|
634
|
+
# 对于基础字段,返回相应的protobuf基础类型
|
635
|
+
return self._get_basic_field_proto_type(clean_name)
|
636
|
+
|
322
637
|
# 将camelCase转换为PascalCase
|
323
638
|
type_name = self._camel_to_pascal_case(clean_name)
|
324
639
|
|
325
640
|
# 通用推断规则(无硬编码)
|
326
|
-
# 1.
|
327
|
-
if clean_name.lower().endswith('
|
328
|
-
#
|
329
|
-
|
330
|
-
return self._camel_to_pascal_case(
|
641
|
+
# 1. 处理复数形式
|
642
|
+
if clean_name.lower().endswith('s') and len(clean_name) > 2:
|
643
|
+
# contacts -> Contact, phones -> Phone
|
644
|
+
singular = clean_name[:-1]
|
645
|
+
return self._camel_to_pascal_case(singular)
|
646
|
+
|
647
|
+
# 2. 处理常见后缀
|
648
|
+
elif clean_name.lower().endswith('profile'):
|
649
|
+
# businessProfile -> BusinessProfile,保持原样
|
650
|
+
return type_name
|
331
651
|
elif clean_name.lower().endswith('info'):
|
332
652
|
# spamInfo -> SpamInfo,保持原样
|
333
653
|
return type_name
|
334
|
-
elif clean_name.lower().endswith('stats'):
|
335
|
-
# commentsStats -> CommentsStats,保持原样
|
336
|
-
return type_name
|
337
654
|
elif clean_name.lower().endswith('data'):
|
338
|
-
#
|
655
|
+
# userData -> UserData,保持原样
|
656
|
+
return type_name
|
657
|
+
elif clean_name.lower().endswith('config'):
|
658
|
+
# systemConfig -> SystemConfig,保持原样
|
339
659
|
return type_name
|
340
|
-
|
341
|
-
|
342
|
-
# 1. 简单的Id类型:id -> Id
|
343
|
-
# 2. 数据类型:id -> IdData
|
344
|
-
# 3. 具体的Id类型:contactId -> ContactIdData
|
345
|
-
# 由于无法确定具体类型,保持基础推断,让依赖发现来解决
|
346
|
-
return type_name + 'Data'
|
660
|
+
|
661
|
+
# 3. 默认处理
|
347
662
|
else:
|
348
|
-
# 默认:直接转换为PascalCase
|
349
663
|
return type_name
|
350
664
|
|
665
|
+
def _is_likely_basic_field(self, field_name: str) -> bool:
|
666
|
+
"""
|
667
|
+
检查字段名是否可能是基础类型字段
|
668
|
+
|
669
|
+
Args:
|
670
|
+
field_name: 清理后的字段名
|
671
|
+
|
672
|
+
Returns:
|
673
|
+
是否可能是基础类型
|
674
|
+
"""
|
675
|
+
# 常见的基础字段模式
|
676
|
+
basic_patterns = [
|
677
|
+
'tags', # 标签数组
|
678
|
+
'ids', # ID数组
|
679
|
+
'values', # 值数组
|
680
|
+
'names', # 名称数组
|
681
|
+
'urls', # URL数组
|
682
|
+
'emails', # 邮箱数组
|
683
|
+
'phones', # 电话号码数组(如果是字符串)
|
684
|
+
'addresses', # 地址数组(如果是字符串)
|
685
|
+
'keywords', # 关键词数组
|
686
|
+
'categories', # 分类数组
|
687
|
+
'labels', # 标签数组
|
688
|
+
]
|
689
|
+
|
690
|
+
field_lower = field_name.lower()
|
691
|
+
|
692
|
+
# 检查是否匹配基础模式
|
693
|
+
for pattern in basic_patterns:
|
694
|
+
if field_lower == pattern or field_lower.endswith(pattern):
|
695
|
+
return True
|
696
|
+
|
697
|
+
return False
|
698
|
+
|
699
|
+
def _get_basic_field_proto_type(self, field_name: str) -> str:
|
700
|
+
"""
|
701
|
+
获取基础字段的protobuf类型
|
702
|
+
|
703
|
+
Args:
|
704
|
+
field_name: 字段名
|
705
|
+
|
706
|
+
Returns:
|
707
|
+
protobuf基础类型
|
708
|
+
"""
|
709
|
+
field_lower = field_name.lower()
|
710
|
+
|
711
|
+
# 根据字段名推断基础类型
|
712
|
+
if field_lower in ['tags', 'names', 'urls', 'emails', 'keywords', 'categories', 'labels']:
|
713
|
+
return 'string' # repeated string
|
714
|
+
elif field_lower in ['ids', 'values'] and 'id' in field_lower:
|
715
|
+
return 'int64' # repeated int64
|
716
|
+
elif field_lower in ['counts', 'numbers', 'amounts']:
|
717
|
+
return 'int32' # repeated int32
|
718
|
+
else:
|
719
|
+
return 'string' # 默认为string
|
720
|
+
|
351
721
|
def _camel_to_pascal_case(self, camel_str: str) -> str:
|
352
722
|
"""
|
353
723
|
将camelCase转换为PascalCase
|
@@ -407,20 +777,6 @@ class InfoDecoder:
|
|
407
777
|
else:
|
408
778
|
return type_name
|
409
779
|
|
410
|
-
def _determine_field_rule(self, field_type_byte: int) -> str:
|
411
|
-
"""
|
412
|
-
根据字节码确定字段规则
|
413
|
-
|
414
|
-
Args:
|
415
|
-
field_type_byte: 字段类型字节
|
416
|
-
|
417
|
-
Returns:
|
418
|
-
字段规则:'optional' 或 'repeated'
|
419
|
-
"""
|
420
|
-
# repeated类型的字节码
|
421
|
-
repeated_types = {27, 39, 44, 538} # repeated_message, repeated_int32, packed_enum, repeated_string
|
422
|
-
return 'repeated' if field_type_byte in repeated_types else 'optional'
|
423
|
-
|
424
780
|
def _is_type_reference(self, obj: str) -> bool:
|
425
781
|
"""
|
426
782
|
判断对象是否是类型引用
|
@@ -646,36 +1002,8 @@ class InfoDecoder:
|
|
646
1002
|
Returns:
|
647
1003
|
对应的proto类型
|
648
1004
|
"""
|
649
|
-
#
|
650
|
-
|
651
|
-
'boolean': 'bool',
|
652
|
-
'byte': 'int32',
|
653
|
-
'short': 'int32',
|
654
|
-
'int': 'int32',
|
655
|
-
'long': 'int64',
|
656
|
-
'float': 'float',
|
657
|
-
'double': 'double',
|
658
|
-
'String': 'string',
|
659
|
-
'ByteString': 'bytes',
|
660
|
-
}
|
661
|
-
|
662
|
-
# 直接映射
|
663
|
-
if java_type in type_mapping:
|
664
|
-
return type_mapping[java_type]
|
665
|
-
|
666
|
-
# 处理复杂类型
|
667
|
-
if java_type.startswith('MapFieldLite<'):
|
668
|
-
return 'map'
|
669
|
-
elif java_type.startswith('Internal.ProtobufList<') or java_type.startswith('List<'):
|
670
|
-
return 'message' # repeated message
|
671
|
-
elif java_type.endswith('[]'):
|
672
|
-
return 'message' # repeated
|
673
|
-
elif '.' in java_type and java_type.split('.')[-1][0].isupper():
|
674
|
-
# 看起来像是类名,可能是message或enum
|
675
|
-
return 'message' # 默认为message,具体类型由其他逻辑确定
|
676
|
-
|
677
|
-
# 默认返回string
|
678
|
-
return 'string'
|
1005
|
+
# 使用内部的类型转换方法
|
1006
|
+
return self._convert_java_to_proto_type(java_type)
|
679
1007
|
|
680
1008
|
def _analyze_unknown_type_by_wire_type(self, wire_type: int, objects: List[str], object_index: int, field_type_byte: int) -> str:
|
681
1009
|
"""
|
@@ -823,4 +1151,83 @@ class InfoDecoder:
|
|
823
1151
|
s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', camel_str)
|
824
1152
|
# 处理小写字母后跟大写字母:userId -> user_Id
|
825
1153
|
s2 = re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1)
|
826
|
-
return s2.lower()
|
1154
|
+
return s2.lower()
|
1155
|
+
|
1156
|
+
def _is_internal_field(self, field_name_raw: str) -> bool:
|
1157
|
+
"""
|
1158
|
+
判断是否为protobuf内部字段(不是实际的proto字段)
|
1159
|
+
|
1160
|
+
Args:
|
1161
|
+
field_name_raw: 原始字段名
|
1162
|
+
|
1163
|
+
Returns:
|
1164
|
+
True如果是内部字段,False如果是实际字段
|
1165
|
+
"""
|
1166
|
+
# 移除末尾的下划线进行判断
|
1167
|
+
clean_name = field_name_raw.rstrip('_').lower()
|
1168
|
+
|
1169
|
+
# protobuf内部字段模式
|
1170
|
+
internal_patterns = [
|
1171
|
+
'bitfield0', # bitField0_ - 用于标记optional字段的位掩码
|
1172
|
+
'bitfield1', # bitField1_ - 多个位掩码字段
|
1173
|
+
'bitfield2', # bitField2_
|
1174
|
+
'bitfield', # 通用位字段模式
|
1175
|
+
'memoizedhashcode', # memoizedHashCode_ - 缓存的hash值
|
1176
|
+
'memoizedsize', # memoizedSize_ - 缓存的大小
|
1177
|
+
'unknownfields' # unknownFields_ - 未知字段存储
|
1178
|
+
]
|
1179
|
+
|
1180
|
+
# 检查是否匹配内部字段模式
|
1181
|
+
for pattern in internal_patterns:
|
1182
|
+
if clean_name == pattern or clean_name.startswith(pattern):
|
1183
|
+
return True
|
1184
|
+
|
1185
|
+
return False
|
1186
|
+
|
1187
|
+
def _clean_field_name(self, field_name_raw: str) -> str:
|
1188
|
+
"""
|
1189
|
+
清理字段名并转换为snake_case格式
|
1190
|
+
|
1191
|
+
Args:
|
1192
|
+
field_name_raw: 原始字段名
|
1193
|
+
|
1194
|
+
Returns:
|
1195
|
+
清理后的字段名
|
1196
|
+
"""
|
1197
|
+
return self._to_snake_case(field_name_raw.rstrip('_'))
|
1198
|
+
|
1199
|
+
def _parse_generic_types(self, type_params: str) -> List[str]:
|
1200
|
+
"""
|
1201
|
+
解析泛型类型参数
|
1202
|
+
|
1203
|
+
Args:
|
1204
|
+
type_params: 泛型参数字符串,如 "String, Contact" 或 "Map<String, Object>, List<Item>"
|
1205
|
+
|
1206
|
+
Returns:
|
1207
|
+
解析后的类型列表
|
1208
|
+
"""
|
1209
|
+
if not type_params:
|
1210
|
+
return []
|
1211
|
+
|
1212
|
+
result = []
|
1213
|
+
current = ""
|
1214
|
+
bracket_count = 0
|
1215
|
+
|
1216
|
+
for char in type_params:
|
1217
|
+
if char == '<':
|
1218
|
+
bracket_count += 1
|
1219
|
+
current += char
|
1220
|
+
elif char == '>':
|
1221
|
+
bracket_count -= 1
|
1222
|
+
current += char
|
1223
|
+
elif char == ',' and bracket_count == 0:
|
1224
|
+
# 只有在最外层的逗号才作为分隔符
|
1225
|
+
result.append(current.strip())
|
1226
|
+
current = ""
|
1227
|
+
else:
|
1228
|
+
current += char
|
1229
|
+
|
1230
|
+
if current.strip():
|
1231
|
+
result.append(current.strip())
|
1232
|
+
|
1233
|
+
return result
|