reproto 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (46)
  1. .git/COMMIT_EDITMSG +1 -1
  2. .git/index +0 -0
  3. .git/logs/HEAD +3 -0
  4. .git/logs/refs/heads/iyue +3 -0
  5. .git/logs/refs/remotes/gitlab/iyue +3 -0
  6. .git/logs/refs/remotes/origin/iyue +3 -0
  7. .git/objects/15/eb3f02479e633439ec83c143e703f8448043a1 +0 -0
  8. .git/objects/20/cf56ec106bcd66420dd000279f983571b918b6 +0 -0
  9. .git/objects/21/55b64d52922c88527c102d62f23e5c2abbae79 +0 -0
  10. .git/objects/26/1f67f3b731b32f6d77de9dd7be2d61e2a14ace +0 -0
  11. .git/objects/2e/2c1c42f5ac5d665cc672d3792078b756d9ab0e +0 -0
  12. .git/objects/33/52dfa8f5d9eb46cc98ea7ccecf02e4d9df95f7 +0 -0
  13. .git/objects/35/8bace20b731ff1bbb256d2a0158dfc84720978 +0 -0
  14. .git/objects/3c/6f0120229cc2cd8123efbeb7f186eb0a485f29 +0 -0
  15. .git/objects/4d/6d457bfabc4af842e5ddc2d56eb059d5dfdc9d +0 -0
  16. .git/objects/55/6723fdd4f525eed41c52fa80defca3f0c81c47 +0 -0
  17. .git/objects/65/a4f0ada7519f8b1e6a7c7e287541b8effde9fd +0 -0
  18. .git/objects/76/311aa8e59d780763e0d66787067cc5d9613a67 +0 -0
  19. .git/objects/8c/809c42c7ae13007fd885ee7bcffae7acf2c520 +0 -0
  20. .git/objects/8d/44142ae2d6dbb59d4ebed8587bccd051e5766b +0 -0
  21. .git/objects/8d/4a5767bef0c342f1660526f9671c0944922c40 +0 -0
  22. .git/objects/95/295a15779ebefd563ec777c3d3cced7e8d0209 +0 -0
  23. .git/objects/97/56fe0931216a7c40cbf250e1ab8a6dfd589f13 +0 -0
  24. .git/objects/9a/e313cdf64cd82416c1238eb493e6396f799f12 +0 -0
  25. .git/objects/cd/2d6c229438c6b1c694b9392a85888d89ef49c1 +0 -0
  26. .git/objects/db/beedb30613f79ae3ff67df1428cf8ade223711 +0 -0
  27. .git/objects/e8/1433b6ad92206cdadbee1f474b4f99383314cb +0 -0
  28. .git/objects/e9/a15996cb55ac72aeb6611d26e8d22246589943 +0 -0
  29. .git/objects/f7/25a430eb3364460ba854dbc8809edc21dc6c70 +0 -0
  30. .git/objects/fc/e15b9dbffd9f37b1f2d46944ee2d0394df6565 +2 -0
  31. .git/refs/heads/iyue +1 -1
  32. .git/refs/remotes/gitlab/iyue +1 -1
  33. .git/refs/remotes/origin/iyue +1 -1
  34. README.md +36 -116
  35. core/info_decoder.py +512 -105
  36. core/reconstructor.py +645 -84
  37. generation/proto_generator.py +38 -12
  38. main.py +36 -5
  39. parsing/java_parser.py +81 -1
  40. pyproject.toml +13 -2
  41. {reproto-0.0.6.dist-info → reproto-0.0.8.dist-info}/METADATA +46 -119
  42. {reproto-0.0.6.dist-info → reproto-0.0.8.dist-info}/RECORD +46 -20
  43. utils/file_cache.py +165 -0
  44. utils/type_index.py +341 -0
  45. {reproto-0.0.6.dist-info → reproto-0.0.8.dist-info}/WHEEL +0 -0
  46. {reproto-0.0.6.dist-info → reproto-0.0.8.dist-info}/entry_points.txt +0 -0
core/reconstructor.py CHANGED
@@ -31,6 +31,9 @@ class JavaSourceAnalyzer:
31
31
  self._current_class_name = None
32
32
  # 初始化JavaParser用于字段类型解析
33
33
  self.java_parser = JavaParser()
34
+ # 使用文件缓存系统优化I/O性能
35
+ from utils.file_cache import get_file_cache
36
+ self.file_cache = get_file_cache()
34
37
 
35
38
  def set_current_class(self, class_name: str):
36
39
  """设置当前分析的类"""
@@ -65,7 +68,7 @@ class JavaSourceAnalyzer:
65
68
  从Java源码中获取字段的真实类型
66
69
 
67
70
  Args:
68
- field_name_raw: 原始字段名(如 id_
71
+ field_name_raw: 原始字段名(如 contacts_
69
72
  expected_type: 期望的基础类型(message、enum 或 map)
70
73
 
71
74
  Returns:
@@ -77,40 +80,74 @@ class JavaSourceAnalyzer:
77
80
  # 清理字段名
78
81
  field_name = field_name_raw.rstrip('_')
79
82
 
80
- # 对于map类型,特殊处理MapFieldLite声明
81
- if expected_type == 'map':
82
- map_type = self._get_map_type_from_field(field_name)
83
- if map_type:
84
- return map_type
85
-
86
- # 对于枚举类型,优先从setter方法中获取类型
87
- if expected_type == 'enum':
88
- setter_type = self._get_type_from_setter(field_name)
89
- if setter_type:
90
- return setter_type
91
-
92
- # 查找字段声明模式:private SomeType fieldName_;
93
- pattern = rf'private\s+(\w+)\s+{re.escape(field_name)}_\s*;'
94
- matches = re.findall(pattern, self._current_class_content)
95
-
96
- if matches:
97
- simple_type = matches[0]
98
-
99
- # 如果字段声明是基础类型(如int),但期望类型是enum,跳过
100
- if expected_type == 'enum' and simple_type in ['int', 'long', 'short', 'byte']:
101
- return None
102
-
103
- # 查找import语句获取完整类名
104
- import_pattern = rf'import\s+([^;]*\.{re.escape(simple_type)});'
105
- import_matches = re.findall(import_pattern, self._current_class_content)
106
-
107
- if import_matches:
108
- return import_matches[0] # 返回完整的包名.类名
109
- else:
110
- # 如果没有import,假设在同一个包中
111
- if self._current_class_name:
112
- package_name = '.'.join(self._current_class_name.split('.')[:-1])
113
- return f"{package_name}.{simple_type}"
83
+ # 查找字段声明模式,支持多种声明格式
84
+ patterns = [
85
+ # Internal.ProtobufList<Contact> contacts_ = ...
86
+ rf'private\s+Internal\.ProtobufList<([^>]+)>\s+{re.escape(field_name)}_\s*=',
87
+ # MapFieldLite<String, Contact> contacts_ = ...
88
+ rf'private\s+MapFieldLite<([^,]+),\s*([^>]+)>\s+{re.escape(field_name)}_\s*=',
89
+ # List<Contact> contacts_ = ...
90
+ rf'private\s+List<([^>]+)>\s+{re.escape(field_name)}_\s*=',
91
+ # Internal.IntList badges_ = ... (用于枚举列表)
92
+ rf'private\s+(Internal\.IntList)\s+{re.escape(field_name)}_\s*=',
93
+ # 普通字段声明: private Contact contact_ = ...
94
+ rf'private\s+(\w+(?:\.\w+)*)\s+{re.escape(field_name)}_\s*=',
95
+ # 简单字段声明: private Contact contact_;
96
+ rf'private\s+(\w+(?:\.\w+)*)\s+{re.escape(field_name)}_\s*;'
97
+ ]
98
+
99
+ for i, pattern in enumerate(patterns):
100
+ matches = re.findall(pattern, self._current_class_content)
101
+ if matches:
102
+ if i == 0: # Internal.ProtobufList<Contact>
103
+ element_type = matches[0]
104
+ return f"Internal.ProtobufList<{element_type}>"
105
+ elif i == 1: # MapFieldLite<String, Contact>
106
+ key_type, value_type = matches[0]
107
+ return f"MapFieldLite<{key_type.strip()}, {value_type.strip()}>"
108
+ elif i == 2: # List<Contact>
109
+ element_type = matches[0]
110
+ return f"List<{element_type}>"
111
+ elif i == 3: # Internal.IntList
112
+ return "Internal.IntList"
113
+ else: # 普通类型
114
+ simple_type = matches[0]
115
+
116
+ # 检查是否为Java基础类型,如果是则直接返回
117
+ basic_java_types = {
118
+ 'int', 'long', 'float', 'double', 'boolean', 'byte', 'short', 'char',
119
+ 'String', 'Object', 'Integer', 'Long', 'Float', 'Double', 'Boolean',
120
+ 'Byte', 'Short', 'Character'
121
+ }
122
+
123
+ if simple_type in basic_java_types:
124
+ return simple_type # 直接返回基础类型,不添加包名
125
+
126
+ # 如果字段声明是基础类型(如int),但期望类型是enum,尝试从setter方法获取真实类型
127
+ if expected_type == 'enum' and simple_type in ['int', 'long', 'short', 'byte']:
128
+ setter_type = self._get_type_from_setter(field_name)
129
+ if setter_type:
130
+ return setter_type
131
+ continue
132
+
133
+ # 特殊处理:Internal.IntList可能对应枚举列表
134
+ if simple_type == 'Internal.IntList':
135
+ # 检查是否有对应的枚举setter方法
136
+ enum_type = self._get_enum_type_from_list_setter(field_name)
137
+ if enum_type:
138
+ return f"Internal.ProtobufList<{enum_type}>"
139
+
140
+ # 查找import语句获取完整类名
141
+ import_pattern = rf'import\s+([^;]*\.{re.escape(simple_type)});'
142
+ import_matches = re.findall(import_pattern, self._current_class_content)
143
+
144
+ if import_matches:
145
+ return import_matches[0] # 返回完整的包名.类名
146
+ else:
147
+ # 如果没有import,假设在同一个包中
148
+ if self._current_class_name:
149
+ package_name = '.'.join(self._current_class_name.split('.')[:-1])
150
+ return f"{package_name}.{simple_type}"
114
151
 
115
152
  return None
116
153
 
@@ -151,24 +188,38 @@ class JavaSourceAnalyzer:
151
188
  Returns:
152
189
  protobuf类型名
153
190
  """
191
+ if not java_type:
192
+ return 'string'
193
+
154
194
  # 基础类型映射
155
195
  basic_types = {
196
+ 'int': 'int32',
197
+ 'long': 'int64',
198
+ 'float': 'float',
199
+ 'double': 'double',
200
+ 'boolean': 'bool',
156
201
  'String': 'string',
157
- 'Integer': 'int32',
158
- 'Long': 'int64',
159
- 'Boolean': 'bool',
160
- 'Float': 'float',
161
- 'Double': 'double',
162
- 'ByteString': 'bytes'
202
+ 'java.lang.String': 'string',
203
+ 'java.lang.Integer': 'int32',
204
+ 'java.lang.Long': 'int64',
205
+ 'java.lang.Float': 'float',
206
+ 'java.lang.Double': 'double',
207
+ 'java.lang.Boolean': 'bool',
208
+ 'byte[]': 'bytes',
209
+ 'ByteString': 'bytes',
210
+ 'com.google.protobuf.ByteString': 'bytes',
163
211
  }
164
212
 
213
+ # 检查是否为基础类型
165
214
  if java_type in basic_types:
166
215
  return basic_types[java_type]
167
216
 
168
- # 对于其他类型,去掉包名,只保留类名
217
+ # 如果是完整的类名,提取简单类名
169
218
  if '.' in java_type:
170
- return java_type.split('.')[-1]
219
+ simple_name = java_type.split('.')[-1]
220
+ return simple_name
171
221
 
222
+ # 默认返回原类型名
172
223
  return java_type
173
224
 
174
225
  def _get_type_from_setter(self, field_name: str) -> Optional[str]:
@@ -184,8 +235,49 @@ class JavaSourceAnalyzer:
184
235
  # 将字段名转换为setter方法名
185
236
  setter_name = f"set{field_name[0].upper()}{field_name[1:]}"
186
237
 
187
- # 查找setter方法:public void setSpamType(SpamType spamType)
188
- pattern = rf'public\s+void\s+{re.escape(setter_name)}\s*\(\s*(\w+)\s+\w+\s*\)'
238
+ # 查找私有setter方法:/* JADX INFO: Access modifiers changed from: private */
239
+ # public void setSpamType(SpamType spamType)
240
+ patterns = [
241
+ # 查找setter方法签名,支持public或private
242
+ rf'(?:public|private)\s+void\s+{re.escape(setter_name)}\s*\(\s*(\w+)\s+\w+\s*\)',
243
+ # 也支持注释中的private标记
244
+ rf'\/\*[^*]*private[^*]*\*\/\s*(?:public|private)\s+void\s+{re.escape(setter_name)}\s*\(\s*(\w+)\s+\w+\s*\)'
245
+ ]
246
+
247
+ for pattern in patterns:
248
+ matches = re.findall(pattern, self._current_class_content, re.DOTALL)
249
+ if matches:
250
+ simple_type = matches[0]
251
+
252
+ # 查找import语句获取完整类名
253
+ import_pattern = rf'import\s+([^;]*\.{re.escape(simple_type)});'
254
+ import_matches = re.findall(import_pattern, self._current_class_content)
255
+
256
+ if import_matches:
257
+ return import_matches[0]
258
+ else:
259
+ # 如果没有import,假设在同一个包中
260
+ if self._current_class_name:
261
+ package_name = '.'.join(self._current_class_name.split('.')[:-1])
262
+ return f"{package_name}.{simple_type}"
263
+
264
+ return None
265
+
266
+ def _get_enum_type_from_list_setter(self, field_name: str) -> Optional[str]:
267
+ """
268
+ 从列表setter方法中获取枚举类型(如setBadges(int i10, Badge badge))
269
+
270
+ Args:
271
+ field_name: 字段名(如 badges)
272
+
273
+ Returns:
274
+ 枚举类型名
275
+ """
276
+ # 将字段名转换为setter方法名
277
+ setter_name = f"set{field_name[0].upper()}{field_name[1:]}"
278
+
279
+ # 查找列表setter方法:setBadges(int i10, Badge badge)
280
+ pattern = rf'(?:public|private)\s+void\s+{re.escape(setter_name)}\s*\(\s*int\s+\w+,\s*(\w+)\s+\w+\s*\)'
189
281
  matches = re.findall(pattern, self._current_class_content)
190
282
 
191
283
  if matches:
@@ -206,19 +298,21 @@ class JavaSourceAnalyzer:
206
298
  return None
207
299
 
208
300
  def _load_class_content(self, class_name: str) -> Optional[str]:
209
- """加载类的源码内容"""
301
+ """加载类的源码内容(使用缓存优化)"""
210
302
  try:
211
303
  # 标准路径:com.example.Model -> com/example/Model.java
212
304
  file_path = class_name.replace('.', '/') + '.java'
213
305
  full_path = self.sources_dir / file_path
214
306
 
215
- if full_path.exists():
216
- return full_path.read_text(encoding='utf-8')
307
+ # 使用缓存系统获取文件内容
308
+ content = self.file_cache.get_content(full_path)
309
+ if content:
310
+ return content
217
311
 
218
312
  # 备选方案:按简单类名搜索
219
313
  simple_name = class_name.split('.')[-1]
220
314
  for java_file in self.sources_dir.rglob(f"{simple_name}.java"):
221
- return java_file.read_text(encoding='utf-8')
315
+ return self.file_cache.get_content(java_file)
222
316
 
223
317
  return None
224
318
  except Exception:
@@ -249,38 +343,64 @@ class ProtoReconstructor:
249
343
 
250
344
  # 初始化核心组件
251
345
  self.java_parser = JavaParser() # Java文件解析器
252
- # 创建Java源码分析器并传递给InfoDecoder
253
- self.java_source_analyzer = JavaSourceAnalyzer(sources_dir)
254
- self.info_decoder = InfoDecoder(self.java_source_analyzer) # 字节码解码器
346
+ self.enum_parser = EnumParser(str(sources_dir)) # 枚举解析器需要字符串路径
347
+ self.info_decoder = InfoDecoder()
255
348
  self.proto_generator = ProtoGenerator() # Proto文件生成器
256
349
 
350
+ # 初始化Java源码分析器
351
+ self.java_source_analyzer = JavaSourceAnalyzer(sources_dir)
352
+ self.info_decoder.java_source_analyzer = self.java_source_analyzer
353
+
354
+ # 初始化类型索引(延迟加载)
355
+ from utils.type_index import get_type_index
356
+ self.type_index = get_type_index(sources_dir)
357
+
257
358
  # 任务调度状态
258
359
  self.processed_classes: Set[str] = set() # 已处理的类
259
360
  self.pending_classes: deque = deque() # 待处理的类队列
260
361
  self.message_definitions: Dict[str, MessageDefinition] = {} # 消息定义
261
362
  self.enum_definitions: Dict[str, EnumDefinition] = {} # 枚举定义
262
363
 
364
+ # 错误和状态跟踪
365
+ self.failed_classes: Dict[str, str] = {} # 失败的类 -> 失败原因
366
+ self.skipped_classes: Dict[str, str] = {} # 跳过的类 -> 跳过原因
367
+
368
+ # 当前处理的类名(用于调试)
369
+ self._current_processing_class = None
370
+
263
371
  def reconstruct_from_root(self, root_class: str) -> Dict[str, any]:
264
372
  """
265
- 从根类开始重构所有相关的proto文件
373
+ 从根类开始重构protobuf定义
266
374
 
267
375
  Args:
268
- root_class: 根类的完整类名,如 'com.example.Model'
376
+ root_class: 根类的完整名称
269
377
 
270
378
  Returns:
271
- 重构结果字典
379
+ 包含统计信息的字典
272
380
  """
273
- self.logger.info(f"开始重构,根类: {root_class}")
381
+ self.logger.info(f"🚀 开始重构,根类: {root_class}")
274
382
 
275
- # 启动任务队列
383
+ # 1. 添加根类到处理队列
276
384
  self.pending_classes.append(root_class)
277
385
 
278
- # 广度优先处理所有依赖类
386
+ # 2. 处理所有消息类
279
387
  self._process_all_classes()
280
388
 
281
- # 生成最终的proto文件
389
+ # 3. 解析所有枚举类
390
+ self._process_all_enums()
391
+
392
+ # 4. 生成proto文件
282
393
  self._generate_all_proto_files()
283
394
 
395
+ # 5. 输出性能统计信息
396
+ from utils.file_cache import get_file_cache
397
+ file_cache = get_file_cache()
398
+ file_cache.print_stats()
399
+
400
+ # 输出类型索引统计
401
+ self.type_index.print_stats()
402
+
403
+ # 6. 返回统计信息
284
404
  # 报告未知类型统计
285
405
  self._report_unknown_types()
286
406
 
@@ -304,6 +424,35 @@ class ProtoReconstructor:
304
424
  self.logger.info(f"处理类: {class_name}")
305
425
  self._process_single_class(class_name)
306
426
 
427
+ def _process_all_enums(self) -> None:
428
+ """解析目标包下的所有枚举类"""
429
+ self.logger.info("🔢 开始解析枚举类...")
430
+
431
+ # 从已处理的类中推断目标包名
432
+ target_package = None
433
+ if self.message_definitions:
434
+ # 取第一个消息定义的包名
435
+ first_message = next(iter(self.message_definitions.values()))
436
+ target_package = first_message.package_name
437
+ elif self.processed_classes:
438
+ # 从已处理的类名中推断包名
439
+ first_class = next(iter(self.processed_classes))
440
+ target_package = '.'.join(first_class.split('.')[:-1])
441
+
442
+ if not target_package:
443
+ self.logger.warning("⚠️ 无法推断目标包名,跳过枚举解析")
444
+ return
445
+
446
+ # 解析目标包下的所有枚举
447
+ enum_definitions = self.enum_parser.parse_all_enums(target_package)
448
+
449
+ # 存储枚举定义
450
+ for enum_def in enum_definitions:
451
+ self.enum_definitions[enum_def.full_name] = enum_def
452
+ self.logger.info(f" ✅ 解析枚举: {enum_def.name} ({len(enum_def.values)} 个值)")
453
+
454
+ self.logger.info(f"📊 枚举解析完成,共解析 {len(enum_definitions)} 个枚举")
455
+
307
456
  def _process_single_class(self, class_name: str) -> None:
308
457
  """
309
458
  处理单个Java类
@@ -317,10 +466,19 @@ class ProtoReconstructor:
317
466
  self.java_source_analyzer.set_current_class(class_name)
318
467
 
319
468
  try:
469
+ # 检查是否应该跳过这个类
470
+ if self._should_skip_class(class_name):
471
+ skip_reason = self._get_skip_reason(class_name)
472
+ self.skipped_classes[class_name] = skip_reason
473
+ self.logger.info(f" ⏭️ 跳过类: {class_name} ({skip_reason})")
474
+ return
475
+
320
476
  # 1. 查找Java文件
321
477
  java_file_path = self._find_java_file(class_name)
322
478
  if not java_file_path:
323
- self.logger.info(f" ⚠️ 找不到Java文件: {class_name}")
479
+ error_msg = "找不到对应的Java文件"
480
+ self.failed_classes[class_name] = error_msg
481
+ self.logger.warning(f" ❌ {error_msg}: {class_name}")
324
482
  return
325
483
 
326
484
  # 2. 尝试解析为枚举
@@ -335,12 +493,14 @@ class ProtoReconstructor:
335
493
  # 3. 尝试解析为消息类
336
494
  info_string, objects_array = self.java_parser.parse_java_file(java_file_path)
337
495
  if not info_string:
338
- self.logger.info(f" ⚠️ 无法解析Java文件: {class_name}")
496
+ error_msg = "无法从Java文件中提取protobuf信息"
497
+ self.failed_classes[class_name] = error_msg
498
+ self.logger.warning(f" ❌ {error_msg}: {class_name}")
339
499
  return
340
500
 
341
501
  # 4. 解码字节码为消息定义
342
502
  message_def = self.info_decoder.decode_message_info(
343
- class_name, info_string, objects_array
503
+ class_name, info_string, objects_array, java_file_path
344
504
  )
345
505
 
346
506
  if message_def:
@@ -350,10 +510,16 @@ class ProtoReconstructor:
350
510
  # 5. 发现并添加依赖类到队列
351
511
  self._discover_dependencies(message_def)
352
512
  else:
353
- self.logger.info(f" ❌ 解码失败: {class_name}")
513
+ error_msg = "字节码解码失败,可能不是protobuf消息类"
514
+ self.failed_classes[class_name] = error_msg
515
+ self.logger.warning(f" ❌ {error_msg}: {class_name}")
354
516
 
355
517
  except Exception as e:
356
- self.logger.error(f"处理异常: {class_name} - {e}")
518
+ error_msg = f"处理异常: {str(e)}"
519
+ self.failed_classes[class_name] = error_msg
520
+ self.logger.error(f" ❌ {error_msg}: {class_name}")
521
+ if hasattr(self, '_verbose') and self._verbose:
522
+ self.logger.exception(f"详细异常信息 ({class_name}):")
357
523
  finally:
358
524
  # 无论成功失败都标记为已处理,避免无限循环
359
525
  self.processed_classes.add(class_name)
@@ -387,19 +553,202 @@ class ProtoReconstructor:
387
553
 
388
554
  # 从常规字段提取依赖
389
555
  for field in message_def.fields:
390
- dep = self._resolve_field_dependency(field.type_name, message_def.package_name)
391
- if dep:
392
- dependencies.append(dep)
556
+ deps = self._extract_field_dependencies(field.type_name, message_def.package_name)
557
+ dependencies.extend(deps)
393
558
 
394
559
  # 从oneof字段提取依赖
395
560
  for oneof in message_def.oneofs:
396
561
  for field in oneof.fields:
397
- dep = self._resolve_field_dependency(field.type_name, message_def.package_name)
398
- if dep:
399
- dependencies.append(dep)
562
+ deps = self._extract_field_dependencies(field.type_name, message_def.package_name)
563
+ dependencies.extend(deps)
564
+
565
+ # 去重
566
+ return list(set(dependencies))
567
+
568
+ def _extract_field_dependencies(self, type_name: str, current_package: str) -> List[str]:
569
+ """
570
+ 从字段类型中提取所有依赖(包括map类型的键值类型)
571
+
572
+ Args:
573
+ type_name: 字段类型名
574
+ current_package: 当前包名
575
+
576
+ Returns:
577
+ 依赖类名列表
578
+ """
579
+ dependencies = []
580
+
581
+ if not type_name:
582
+ return dependencies
583
+
584
+ # 处理map类型: map<string, Contact> -> [Contact]
585
+ if type_name.startswith('map<') and type_name.endswith('>'):
586
+ map_content = type_name[4:-1] # 移除 'map<' 和 '>'
587
+ # 分割键值类型,处理嵌套的尖括号
588
+ key_type, value_type = self._parse_map_types(map_content)
589
+
590
+ # 递归处理键类型和值类型
591
+ dependencies.extend(self._extract_field_dependencies(key_type, current_package))
592
+ dependencies.extend(self._extract_field_dependencies(value_type, current_package))
593
+
594
+ # 处理普通类型
595
+ else:
596
+ dep = self._resolve_field_dependency(type_name, current_package)
597
+ if dep:
598
+ dependencies.append(dep)
400
599
 
401
600
  return dependencies
402
601
 
602
+ def _parse_map_types(self, map_content: str) -> tuple:
603
+ """
604
+ 解析map类型的键值类型
605
+
606
+ Args:
607
+ map_content: map内容,如 "string, Contact" 或 "string, List<Contact>"
608
+
609
+ Returns:
610
+ (key_type, value_type) 元组
611
+ """
612
+ # 简单情况:没有嵌套的尖括号
613
+ if '<' not in map_content:
614
+ parts = [part.strip() for part in map_content.split(',', 1)]
615
+ if len(parts) == 2:
616
+ return parts[0], parts[1]
617
+
618
+ # 复杂情况:处理嵌套的尖括号
619
+ bracket_count = 0
620
+ for i, char in enumerate(map_content):
621
+ if char == '<':
622
+ bracket_count += 1
623
+ elif char == '>':
624
+ bracket_count -= 1
625
+ elif char == ',' and bracket_count == 0:
626
+ # 找到分隔符
627
+ key_type = map_content[:i].strip()
628
+ value_type = map_content[i+1:].strip()
629
+ return key_type, value_type
630
+
631
+ # 如果解析失败,返回默认值
632
+ return 'string', 'string'
633
+
634
+ def _should_skip_class(self, class_name: str) -> bool:
635
+ """
636
+ 判断是否应该跳过某个类
637
+
638
+ Args:
639
+ class_name: 类名
640
+
641
+ Returns:
642
+ 是否应该跳过
643
+ """
644
+ # 跳过已经处理过的类
645
+ if class_name in self.processed_classes:
646
+ return True
647
+
648
+ # 跳过基础类型(包括Java基础类型和常见的系统类型)
649
+ basic_types = {
650
+ # Java基础类型
651
+ 'int', 'long', 'float', 'double', 'boolean', 'byte', 'short', 'char',
652
+ 'String', 'Object', 'Integer', 'Long', 'Float', 'Double', 'Boolean',
653
+ 'Byte', 'Short', 'Character',
654
+ # Java系统类型
655
+ 'java.lang.String', 'java.lang.Integer', 'java.lang.Long',
656
+ 'java.lang.Boolean', 'java.lang.Float', 'java.lang.Double',
657
+ 'java.lang.Object', 'java.util.List', 'java.util.Map',
658
+ 'com.google.protobuf.ByteString', 'com.google.protobuf.MessageLite'
659
+ }
660
+
661
+ if class_name in basic_types:
662
+ return True
663
+
664
+ # 跳过明显的系统类型和内部类型
665
+ if self._is_system_or_internal_type(class_name):
666
+ return True
667
+
668
+ return False
669
+
670
+ def _is_system_or_internal_type(self, class_name: str) -> bool:
671
+ """
672
+ 判断是否为系统类型或内部类型
673
+
674
+ Args:
675
+ class_name: 类名
676
+
677
+ Returns:
678
+ 是否为系统或内部类型
679
+ """
680
+ # 跳过明显不是protobuf类的包
681
+ skip_packages = [
682
+ 'java.', 'javax.', 'android.', 'androidx.',
683
+ 'kotlin.', 'kotlinx.', 'com.google.common.',
684
+ 'org.apache.', 'org.junit.', 'junit.',
685
+ 'com.unity3d.', # 添加Unity3D包,避免误匹配
686
+ 'Internal.' # 跳过Internal包下的类型
687
+ ]
688
+
689
+ for skip_pkg in skip_packages:
690
+ if class_name.startswith(skip_pkg):
691
+ return True
692
+
693
+ # 跳过明显的内部类型
694
+ internal_patterns = [
695
+ 'Internal.ProtobufList',
696
+ 'MapFieldLite',
697
+ 'GeneratedMessageLite',
698
+ 'MessageLiteOrBuilder'
699
+ ]
700
+
701
+ for pattern in internal_patterns:
702
+ if pattern in class_name:
703
+ return True
704
+
705
+ return False
706
+
707
+ def _get_skip_reason(self, class_name: str) -> str:
708
+ """
709
+ 获取跳过类的原因
710
+
711
+ Args:
712
+ class_name: 类名
713
+
714
+ Returns:
715
+ 跳过原因
716
+ """
717
+ # 基础类型
718
+ basic_types = {
719
+ 'java.lang.String', 'java.lang.Integer', 'java.lang.Long',
720
+ 'java.lang.Boolean', 'java.lang.Float', 'java.lang.Double',
721
+ 'java.lang.Object', 'java.util.List', 'java.util.Map',
722
+ 'com.google.protobuf.ByteString', 'com.google.protobuf.MessageLite'
723
+ }
724
+
725
+ if class_name in basic_types:
726
+ return "基础类型"
727
+
728
+ # 已处理
729
+ if class_name in self.processed_classes:
730
+ return "已处理"
731
+
732
+ # 系统包
733
+ system_packages = {
734
+ 'java.': 'Java系统包',
735
+ 'javax.': 'Java扩展包',
736
+ 'android.': 'Android系统包',
737
+ 'androidx.': 'AndroidX包',
738
+ 'kotlin.': 'Kotlin标准库',
739
+ 'kotlinx.': 'Kotlin扩展库',
740
+ 'com.google.common.': 'Google通用库',
741
+ 'org.apache.': 'Apache库',
742
+ 'org.junit.': 'JUnit测试库',
743
+ 'junit.': 'JUnit库'
744
+ }
745
+
746
+ for prefix, reason in system_packages.items():
747
+ if class_name.startswith(prefix):
748
+ return reason
749
+
750
+ return "未知原因"
751
+
403
752
  def _resolve_field_dependency(self, type_name: str, current_package: str) -> Optional[str]:
404
753
  """
405
754
  解析字段类型名为完整的类名
@@ -414,9 +763,13 @@ class ProtoReconstructor:
414
763
  if not type_name:
415
764
  return None
416
765
 
417
- # 跳过基础类型
418
- basic_types = {'string', 'int32', 'int64', 'bool', 'float', 'double', 'bytes', 'message', 'enum'}
419
- if type_name in basic_types:
766
+ # 检查是否为基础类型
767
+ basic_proto_types = {
768
+ 'string', 'int32', 'int64', 'uint32', 'uint64', 'sint32', 'sint64',
769
+ 'fixed32', 'fixed64', 'sfixed32', 'sfixed64', 'bool', 'float', 'double', 'bytes'
770
+ }
771
+
772
+ if type_name in basic_proto_types:
420
773
  return None
421
774
 
422
775
  # 如果已经是完整类名,直接返回
@@ -431,7 +784,13 @@ class ProtoReconstructor:
431
784
  # 如果推断失败,尝试查找所有可能的匹配类
432
785
  # 需要传递当前类名以便进行源码分析
433
786
  current_class = getattr(self, '_current_processing_class', None)
434
- return self._find_best_matching_class(type_name, current_package, current_class)
787
+ best_match = self._find_best_matching_class(type_name, current_package, current_class)
788
+
789
+ # 如果找到匹配,验证该类是否确实存在
790
+ if best_match and self._find_java_file(best_match):
791
+ return best_match
792
+
793
+ return None
435
794
 
436
795
  def _find_java_file(self, class_name: str) -> Optional[Path]:
437
796
  """
@@ -450,10 +809,31 @@ class ProtoReconstructor:
450
809
  if full_path.exists():
451
810
  return full_path
452
811
 
812
+ # 处理内部类:支持多层嵌套
813
+ # com.example.Models$Inner$Deep -> com/example/Models$Inner$Deep.java
814
+ if '$' in class_name:
815
+ # 找到最后一个.的位置,分离包名和类名部分
816
+ last_dot_index = class_name.rfind('.')
817
+ if last_dot_index != -1:
818
+ package_path = class_name[:last_dot_index].replace('.', '/') # 包路径
819
+ class_part = class_name[last_dot_index + 1:] # 类名部分(可能包含多个$)
820
+ inner_class_file_path = f"{package_path}/{class_part}.java"
821
+ inner_class_full_path = self.sources_dir / inner_class_file_path
822
+
823
+ if inner_class_full_path.exists():
824
+ return inner_class_full_path
825
+
453
826
  # 备选方案:按简单类名搜索
454
827
  simple_name = class_name.split('.')[-1]
455
- for java_file in self.sources_dir.rglob(f"{simple_name}.java"):
456
- return java_file
828
+ # 对于内部类,简单名称可能包含多个$符号
829
+ if '$' in simple_name:
830
+ # 对于内部类,直接使用包含$的完整文件名搜索
831
+ for java_file in self.sources_dir.rglob(f"{simple_name}.java"):
832
+ return java_file
833
+ else:
834
+ # 对于普通类,使用原来的逻辑
835
+ for java_file in self.sources_dir.rglob(f"{simple_name}.java"):
836
+ return java_file
457
837
 
458
838
  return None
459
839
 
@@ -503,7 +883,8 @@ class ProtoReconstructor:
503
883
  if len(package_parts) > 1:
504
884
  parent = '.'.join(package_parts[:-1])
505
885
  # 常见的同级包名
506
- common_siblings = ['models', 'model', 'types', 'entities', 'data', 'proto', 'protobuf']
886
+ common_siblings = ['models', 'model', 'types', 'entities', 'data', 'proto', 'protobuf',
887
+ 'enums', 'enum', 'common', 'shared', 'core', 'base']
507
888
  for sibling in common_siblings:
508
889
  if sibling != package_parts[-1]: # 避免重复
509
890
  candidates.append(f"{parent}.{sibling}")
@@ -511,10 +892,32 @@ class ProtoReconstructor:
511
892
  # 4. 根包下的常见子包
512
893
  if len(package_parts) > 2:
513
894
  root_package = '.'.join(package_parts[:2]) # 如 com.example
514
- common_subpackages = ['models', 'model', 'types', 'entities', 'common', 'shared', 'proto']
895
+ common_subpackages = ['models', 'model', 'types', 'entities', 'common', 'shared', 'proto',
896
+ 'enums', 'enum', 'core', 'base', 'data', 'dto', 'vo']
515
897
  for subpkg in common_subpackages:
516
898
  candidates.append(f"{root_package}.{subpkg}")
517
899
 
900
+ # 5. 深度搜索:在当前包的各级父包下寻找常见子包
901
+ for i in range(len(package_parts) - 1, 1, -1):
902
+ parent_package = '.'.join(package_parts[:i])
903
+ # 在每个父包下寻找常见的子包
904
+ search_patterns = ['models', 'enums', 'types', 'common', 'shared', 'core']
905
+ for pattern in search_patterns:
906
+ candidates.append(f"{parent_package}.{pattern}")
907
+ # 也尝试更深一层的组合
908
+ if i > 2:
909
+ candidates.append(f"{parent_package}.{pattern}.{package_parts[-1]}")
910
+
911
+ # 6. 特殊情况:如果当前是v1包,也尝试其他版本
912
+ if 'v1' in package_parts:
913
+ for i, part in enumerate(package_parts):
914
+ if part == 'v1':
915
+ # 尝试v2, v3等
916
+ for version in ['v2', 'v3', 'v4']:
917
+ version_package = package_parts.copy()
918
+ version_package[i] = version
919
+ candidates.append('.'.join(version_package))
920
+
518
921
  # 去重并保持顺序
519
922
  seen = set()
520
923
  unique_candidates = []
@@ -527,7 +930,7 @@ class ProtoReconstructor:
527
930
 
528
931
  def _find_best_matching_class(self, type_name: str, current_package: str, current_class: str = None) -> Optional[str]:
529
932
  """
530
- 查找最佳匹配的类(用于处理推断失败的情况)
933
+ 查找最佳匹配的类(使用索引优化)
531
934
 
532
935
  Args:
533
936
  type_name: 类型名(如 IdData)
@@ -544,7 +947,33 @@ class ProtoReconstructor:
544
947
  self.logger.info(f" 🔍 源码分析: {type_name} -> {actual_type}")
545
948
  return actual_type
546
949
 
547
- # 如果源码分析失败,回退到模糊匹配
950
+ # 预检查:如果是基础字段名,可能不需要创建单独的类
951
+ if self._is_basic_field_type(type_name, current_class):
952
+ self.logger.info(f" 🔍 基础字段类型检测: {type_name} -> 跳过类匹配")
953
+ return None
954
+
955
+ # 使用类型索引进行快速匹配
956
+ best_match = self.type_index.find_best_match(type_name, current_package)
957
+
958
+ if best_match:
959
+ self.logger.info(f" 🔍 索引匹配: {type_name} -> {best_match}")
960
+ return best_match
961
+
962
+ # 索引未找到匹配,回退到传统方法(保留兼容性)
963
+ self.logger.debug(f" ⚠️ 索引未找到匹配,回退到目录扫描: {type_name}")
964
+ return self._fallback_directory_search(type_name, current_package)
965
+
966
+ def _fallback_directory_search(self, type_name: str, current_package: str) -> Optional[str]:
967
+ """
968
+ 回退的目录扫描方法(当索引匹配失败时使用)
969
+
970
+ Args:
971
+ type_name: 类型名
972
+ current_package: 当前包名
973
+
974
+ Returns:
975
+ 匹配的类名或None
976
+ """
548
977
  matching_classes = []
549
978
 
550
979
  # 在源码目录中搜索
@@ -557,7 +986,11 @@ class ProtoReconstructor:
557
986
  if package_parts:
558
987
  package_name = '.'.join(package_parts)
559
988
  full_class_name = f"{package_name}.{file_name}"
560
- matching_classes.append((full_class_name, self._calculate_package_similarity(package_name, current_package)))
989
+
990
+ # 添加包名过滤,避免匹配到无关的第三方库
991
+ if self._is_valid_package_for_matching(package_name, current_package):
992
+ similarity = self._calculate_package_similarity(package_name, current_package)
993
+ matching_classes.append((full_class_name, similarity))
561
994
 
562
995
  if not matching_classes:
563
996
  return None
@@ -566,9 +999,117 @@ class ProtoReconstructor:
566
999
  matching_classes.sort(key=lambda x: x[1], reverse=True)
567
1000
  best_match = matching_classes[0][0]
568
1001
 
569
- self.logger.info(f" 🔍 智能匹配: {type_name} -> {best_match}")
1002
+ self.logger.info(f" 🔍 目录扫描匹配: {type_name} -> {best_match}")
570
1003
  return best_match
571
1004
 
1005
+ def _is_basic_field_type(self, type_name: str, current_class: str = None) -> bool:
1006
+ """
1007
+ 检查是否为基础字段类型,避免为简单字段创建不必要的类
1008
+
1009
+ Args:
1010
+ type_name: 类型名
1011
+ current_class: 当前类名
1012
+
1013
+ Returns:
1014
+ 是否为基础字段类型
1015
+ """
1016
+ # 首先检查是否为Java基础类型
1017
+ basic_java_types = {
1018
+ 'int', 'long', 'float', 'double', 'boolean', 'byte', 'short', 'char',
1019
+ 'String', 'Object', 'Integer', 'Long', 'Float', 'Double', 'Boolean',
1020
+ 'Byte', 'Short', 'Character'
1021
+ }
1022
+
1023
+ if type_name in basic_java_types:
1024
+ return True
1025
+
1026
+ # 常见的基础字段名模式
1027
+ basic_patterns = [
1028
+ 'tags', # tags_ 字段通常是 repeated string
1029
+ 'ids', # ids_ 字段通常是 repeated string 或 repeated int64
1030
+ 'values', # values_ 字段通常是基础类型数组
1031
+ 'names', # names_ 字段通常是 repeated string
1032
+ 'urls', # urls_ 字段通常是 repeated string
1033
+ 'emails', # emails_ 字段通常是 repeated string
1034
+ ]
1035
+
1036
+ type_lower = type_name.lower()
1037
+
1038
+ # 检查是否匹配基础模式
1039
+ if type_lower in basic_patterns:
1040
+ return True
1041
+
1042
+ # 如果有当前类,尝试从Java源码中验证
1043
+ if current_class:
1044
+ try:
1045
+ java_file = self._find_java_file(current_class)
1046
+ if java_file:
1047
+ content = java_file.read_text(encoding='utf-8')
1048
+
1049
+ # 查找对应的字段声明,检查是否为基础类型
1050
+ field_name_pattern = type_lower.rstrip('s') + 's?_' # tags -> tags?_
1051
+ import re
1052
+
1053
+ # 查找字段声明:private List<String> tags_; 或 private Internal.ProtobufList<String> tags_;
1054
+ patterns = [
1055
+ rf'private\s+(?:Internal\.)?ProtobufList<String>\s+{field_name_pattern}',
1056
+ rf'private\s+List<String>\s+{field_name_pattern}',
1057
+ rf'private\s+(?:Internal\.)?ProtobufList<Integer>\s+{field_name_pattern}',
1058
+ rf'private\s+List<Integer>\s+{field_name_pattern}',
1059
+ rf'private\s+(?:Internal\.)?ProtobufList<Long>\s+{field_name_pattern}',
1060
+ rf'private\s+List<Long>\s+{field_name_pattern}',
1061
+ ]
1062
+
1063
+ for pattern in patterns:
1064
+ if re.search(pattern, content, re.IGNORECASE):
1065
+ return True
1066
+
1067
+ except Exception as e:
1068
+ self.logger.debug(f" 检查基础字段类型时出错: {e}")
1069
+
1070
+ return False
1071
+
1072
+ def _is_valid_package_for_matching(self, candidate_package: str, current_package: str) -> bool:
1073
+ """
1074
+ 检查候选包名是否适合用于匹配
1075
+
1076
+ Args:
1077
+ candidate_package: 候选包名
1078
+ current_package: 当前包名
1079
+
1080
+ Returns:
1081
+ 是否为有效的匹配候选
1082
+ """
1083
+ # 获取当前包的根包名(通常是前两部分,如 com.truecaller)
1084
+ current_parts = current_package.split('.')
1085
+ if len(current_parts) >= 2:
1086
+ current_root = '.'.join(current_parts[:2])
1087
+ else:
1088
+ current_root = current_package
1089
+
1090
+ # 过滤规则
1091
+ filters = [
1092
+ # 1. 排除明显的第三方库
1093
+ lambda pkg: 'unity3d' not in pkg.lower(),
1094
+ lambda pkg: 'facebook' not in pkg.lower(),
1095
+ lambda pkg: 'google' not in pkg.lower() or pkg.startswith(current_root),
1096
+ lambda pkg: 'android' not in pkg.lower() or pkg.startswith(current_root),
1097
+ lambda pkg: 'androidx' not in pkg.lower(),
1098
+ lambda pkg: 'kotlin' not in pkg.lower(),
1099
+ lambda pkg: 'java' not in pkg.lower(),
1100
+ lambda pkg: 'javax' not in pkg.lower(),
1101
+
1102
+ # 2. 优先选择同根包的类
1103
+ lambda pkg: pkg.startswith(current_root) or self._calculate_package_similarity(pkg, current_package) > 0.3
1104
+ ]
1105
+
1106
+ # 应用所有过滤规则
1107
+ for filter_func in filters:
1108
+ if not filter_func(candidate_package):
1109
+ return False
1110
+
1111
+ return True
1112
+
572
1113
  def _calculate_package_similarity(self, package1: str, package2: str) -> float:
573
1114
  """
574
1115
  计算两个包名的相似度
@@ -606,13 +1147,28 @@ class ProtoReconstructor:
606
1147
  Returns:
607
1148
  实际的完整类型名
608
1149
  """
1150
+ # 首先检查是否为基础类型,如果是则直接跳过
1151
+ basic_types = {
1152
+ 'int', 'long', 'float', 'double', 'boolean', 'byte', 'short', 'char',
1153
+ 'String', 'Object', 'Integer', 'Long', 'Float', 'Double', 'Boolean',
1154
+ 'Byte', 'Short', 'Character'
1155
+ }
1156
+
1157
+ if inferred_type in basic_types:
1158
+ self.logger.debug(f" 跳过基础类型: {inferred_type}")
1159
+ return None
1160
+
609
1161
  try:
610
1162
  java_file = self._find_java_file(class_name)
611
1163
  if not java_file:
612
1164
  return None
613
1165
 
614
- # 读取Java源码
615
- content = java_file.read_text(encoding='utf-8')
1166
+ # 使用缓存读取Java源码
1167
+ from utils.file_cache import get_file_cache
1168
+ file_cache = get_file_cache()
1169
+ content = file_cache.get_content(java_file)
1170
+ if not content:
1171
+ return None
616
1172
 
617
1173
  # 查找字段声明模式:private SomeType fieldName_;
618
1174
  # 我们要找的是以inferred_type结尾的类型声明
@@ -626,6 +1182,11 @@ class ProtoReconstructor:
626
1182
  # 取第一个匹配的类型
627
1183
  actual_type_simple = matches[0]
628
1184
 
1185
+ # 再次检查匹配的类型是否为基础类型
1186
+ if actual_type_simple in basic_types:
1187
+ self.logger.debug(f" 匹配到基础类型,跳过: {actual_type_simple}")
1188
+ return None
1189
+
629
1190
  # 检查是否有import语句
630
1191
  import_pattern = rf'import\s+([^;]*\.{re.escape(actual_type_simple)});'
631
1192
  import_matches = re.findall(import_pattern, content)