maque-0.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. maque/__init__.py +30 -0
  2. maque/__main__.py +926 -0
  3. maque/ai_platform/__init__.py +0 -0
  4. maque/ai_platform/crawl.py +45 -0
  5. maque/ai_platform/metrics.py +258 -0
  6. maque/ai_platform/nlp_preprocess.py +67 -0
  7. maque/ai_platform/webpage_screen_shot.py +195 -0
  8. maque/algorithms/__init__.py +78 -0
  9. maque/algorithms/bezier.py +15 -0
  10. maque/algorithms/bktree.py +117 -0
  11. maque/algorithms/core.py +104 -0
  12. maque/algorithms/hilbert.py +16 -0
  13. maque/algorithms/rate_function.py +92 -0
  14. maque/algorithms/transform.py +27 -0
  15. maque/algorithms/trie.py +272 -0
  16. maque/algorithms/utils.py +63 -0
  17. maque/algorithms/video.py +587 -0
  18. maque/api/__init__.py +1 -0
  19. maque/api/common.py +110 -0
  20. maque/api/fetch.py +26 -0
  21. maque/api/static/icon.png +0 -0
  22. maque/api/static/redoc.standalone.js +1782 -0
  23. maque/api/static/swagger-ui-bundle.js +3 -0
  24. maque/api/static/swagger-ui.css +3 -0
  25. maque/cli/__init__.py +1 -0
  26. maque/cli/clean_invisible_chars.py +324 -0
  27. maque/cli/core.py +34 -0
  28. maque/cli/groups/__init__.py +26 -0
  29. maque/cli/groups/config.py +205 -0
  30. maque/cli/groups/data.py +615 -0
  31. maque/cli/groups/doctor.py +259 -0
  32. maque/cli/groups/embedding.py +222 -0
  33. maque/cli/groups/git.py +29 -0
  34. maque/cli/groups/help.py +410 -0
  35. maque/cli/groups/llm.py +223 -0
  36. maque/cli/groups/mcp.py +241 -0
  37. maque/cli/groups/mllm.py +1795 -0
  38. maque/cli/groups/mllm_simple.py +60 -0
  39. maque/cli/groups/quant.py +210 -0
  40. maque/cli/groups/service.py +490 -0
  41. maque/cli/groups/system.py +570 -0
  42. maque/cli/mllm_run.py +1451 -0
  43. maque/cli/script.py +52 -0
  44. maque/cli/tree.py +49 -0
  45. maque/clustering/__init__.py +52 -0
  46. maque/clustering/analyzer.py +347 -0
  47. maque/clustering/clusterers.py +464 -0
  48. maque/clustering/sampler.py +134 -0
  49. maque/clustering/visualizer.py +205 -0
  50. maque/constant.py +13 -0
  51. maque/core.py +133 -0
  52. maque/cv/__init__.py +1 -0
  53. maque/cv/image.py +219 -0
  54. maque/cv/utils.py +68 -0
  55. maque/cv/video/__init__.py +3 -0
  56. maque/cv/video/keyframe_extractor.py +368 -0
  57. maque/embedding/__init__.py +43 -0
  58. maque/embedding/base.py +56 -0
  59. maque/embedding/multimodal.py +308 -0
  60. maque/embedding/server.py +523 -0
  61. maque/embedding/text.py +311 -0
  62. maque/git/__init__.py +24 -0
  63. maque/git/pure_git.py +912 -0
  64. maque/io/__init__.py +29 -0
  65. maque/io/core.py +38 -0
  66. maque/io/ops.py +194 -0
  67. maque/llm/__init__.py +111 -0
  68. maque/llm/backend.py +416 -0
  69. maque/llm/base.py +411 -0
  70. maque/llm/server.py +366 -0
  71. maque/mcp_server.py +1096 -0
  72. maque/mllm_data_processor_pipeline/__init__.py +17 -0
  73. maque/mllm_data_processor_pipeline/core.py +341 -0
  74. maque/mllm_data_processor_pipeline/example.py +291 -0
  75. maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
  76. maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
  77. maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
  78. maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
  79. maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
  80. maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
  81. maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
  82. maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
  83. maque/mllm_data_processor_pipeline/web_app.py +317 -0
  84. maque/nlp/__init__.py +14 -0
  85. maque/nlp/ngram.py +9 -0
  86. maque/nlp/parser.py +63 -0
  87. maque/nlp/risk_matcher.py +543 -0
  88. maque/nlp/sentence_splitter.py +202 -0
  89. maque/nlp/simple_tradition_cvt.py +31 -0
  90. maque/performance/__init__.py +21 -0
  91. maque/performance/_measure_time.py +70 -0
  92. maque/performance/_profiler.py +367 -0
  93. maque/performance/_stat_memory.py +51 -0
  94. maque/pipelines/__init__.py +15 -0
  95. maque/pipelines/clustering.py +252 -0
  96. maque/quantization/__init__.py +42 -0
  97. maque/quantization/auto_round.py +120 -0
  98. maque/quantization/base.py +145 -0
  99. maque/quantization/bitsandbytes.py +127 -0
  100. maque/quantization/llm_compressor.py +102 -0
  101. maque/retriever/__init__.py +35 -0
  102. maque/retriever/chroma.py +654 -0
  103. maque/retriever/document.py +140 -0
  104. maque/retriever/milvus.py +1140 -0
  105. maque/table_ops/__init__.py +1 -0
  106. maque/table_ops/core.py +133 -0
  107. maque/table_viewer/__init__.py +4 -0
  108. maque/table_viewer/download_assets.py +57 -0
  109. maque/table_viewer/server.py +698 -0
  110. maque/table_viewer/static/element-plus-icons.js +5791 -0
  111. maque/table_viewer/static/element-plus.css +1 -0
  112. maque/table_viewer/static/element-plus.js +65236 -0
  113. maque/table_viewer/static/main.css +268 -0
  114. maque/table_viewer/static/main.js +669 -0
  115. maque/table_viewer/static/vue.global.js +18227 -0
  116. maque/table_viewer/templates/index.html +401 -0
  117. maque/utils/__init__.py +56 -0
  118. maque/utils/color.py +68 -0
  119. maque/utils/color_string.py +45 -0
  120. maque/utils/compress.py +66 -0
  121. maque/utils/constant.py +183 -0
  122. maque/utils/core.py +261 -0
  123. maque/utils/cursor.py +143 -0
  124. maque/utils/distance.py +58 -0
  125. maque/utils/docker.py +96 -0
  126. maque/utils/downloads.py +51 -0
  127. maque/utils/excel_helper.py +542 -0
  128. maque/utils/helper_metrics.py +121 -0
  129. maque/utils/helper_parser.py +168 -0
  130. maque/utils/net.py +64 -0
  131. maque/utils/nvidia_stat.py +140 -0
  132. maque/utils/ops.py +53 -0
  133. maque/utils/packages.py +31 -0
  134. maque/utils/path.py +57 -0
  135. maque/utils/tar.py +260 -0
  136. maque/utils/untar.py +129 -0
  137. maque/web/__init__.py +0 -0
  138. maque/web/image_downloader.py +1410 -0
  139. maque-0.2.1.dist-info/METADATA +450 -0
  140. maque-0.2.1.dist-info/RECORD +143 -0
  141. maque-0.2.1.dist-info/WHEEL +4 -0
  142. maque-0.2.1.dist-info/entry_points.txt +3 -0
  143. maque-0.2.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,543 @@
+ """
+ Risk keyword matcher - efficiently extract key segments containing risk keywords from long text.
+
+ Usage:
+     from maque.nlp.risk_matcher import RiskMatcher
+
+     matcher = RiskMatcher.from_json("risk_keywords.json")
+     result = matcher.extract_segments("your long text...")
+
+     for m in result.matches:
+         print(f"[{m.risk_level}] {m.keyword} -> {m.segment}")
+ """
+
+ import json
+ from dataclasses import dataclass, field
+ from pathlib import Path
+
+ import ahocorasick
+
+ from .sentence_splitter import SentenceSplitter
+
+
+ @dataclass
+ class MatchResult:
+     """A single keyword match."""
+     keyword: str        # matched keyword
+     risk_level: str     # risk level: high/medium/combo
+     category: str       # keyword category
+     start: int          # keyword start offset in the original text
+     end: int            # keyword end offset in the original text
+     segment: str        # extracted key segment
+     segment_start: int  # segment start offset in the original text
+     segment_end: int    # segment end offset in the original text
+
+
+ @dataclass
+ class ExtractResult:
+     """Aggregated extraction result."""
+     text: str                                                  # original text
+     matches: list[MatchResult] = field(default_factory=list)  # all matches
+     segments: list[str] = field(default_factory=list)         # deduplicated segment list
+     risk_level: str = "none"                                   # overall risk level
+     hit_high: bool = False                                     # hit at least one high-risk word
+     hit_medium_count: int = 0                                  # number of medium-risk words hit
+     hit_combo: bool = False                                    # hit at least one combo rule
+
+     def to_dict(self) -> dict:
+         return {
+             "risk_level": self.risk_level,
+             "hit_high": self.hit_high,
+             "hit_medium_count": self.hit_medium_count,
+             "hit_combo": self.hit_combo,
+             "segments": self.segments,
+             "matches": [
+                 {
+                     "keyword": m.keyword,
+                     "risk_level": m.risk_level,
+                     "category": m.category,
+                     "segment": m.segment,
+                 }
+                 for m in self.matches
+             ],
+         }
+
+
+ class RiskMatcher:
+     """Risk keyword matcher (based on the Aho-Corasick algorithm)."""
+
+     def __init__(
+         self,
+         high_risk: list[str] | dict[str, list[str]] = None,
+         medium_risk: list[str] | dict[str, list[str]] = None,
+         combo_rules: dict[str, dict] = None,
+         context_chars: int = 150,
+     ):
+         """
+         Args:
+             high_risk: list of high-risk words, or {category: [keywords]}
+             medium_risk: list of medium-risk words, or {category: [keywords]}
+             combo_rules: combination rules
+             context_chars: number of context characters to extract around a keyword
+         """
+         self.combo_rules = combo_rules or {}
+         self.context_chars = context_chars
+         self._low_risk_words: set[str] = set()  # low-risk words (trigger only in combination)
+
+         # Sentence splitter
+         self._splitter = SentenceSplitter(max_length=300)
+
+         # Mapping from keyword to (level, category)
+         self._keyword_map: dict[str, tuple[str, str]] = {}
+
+         # Both list and dict formats are supported
+         if isinstance(high_risk, list):
+             for kw in high_risk:
+                 self._keyword_map[kw.lower()] = ("high", "高风险")
+         elif isinstance(high_risk, dict):
+             for category, keywords in high_risk.items():
+                 if category == "说明" or not isinstance(keywords, list):
+                     continue
+                 for kw in keywords:
+                     self._keyword_map[kw.lower()] = ("high", category)
+
+         if isinstance(medium_risk, list):
+             for kw in medium_risk:
+                 self._keyword_map[kw.lower()] = ("medium", "中风险")
+         elif isinstance(medium_risk, dict):
+             for category, keywords in medium_risk.items():
+                 if category == "说明" or not isinstance(keywords, list):
+                     continue
+                 for kw in keywords:
+                     self._keyword_map[kw.lower()] = ("medium", category)
+
+         # Build the Aho-Corasick automaton
+         self._automaton = None
+         if self._keyword_map:
+             self._build_automaton()
+
+     def _build_automaton(self):
+         """Build the Aho-Corasick automaton."""
+         self._automaton = ahocorasick.Automaton()
+         for keyword in self._keyword_map:
+             self._automaton.add_word(keyword, keyword)
+         self._automaton.make_automaton()
+
+     @classmethod
+     def from_json(cls, json_path: str, **kwargs) -> "RiskMatcher":
+         """Load the configuration from a JSON file."""
+         with open(json_path, "r", encoding="utf-8") as f:
+             config = json.load(f)
+
+         # Merge high-risk words, risk phrases, and GPT variants
+         high_risk = config.get("高风险词", [])
+         risk_phrases = config.get("风险句式", [])
+         gpt_variants = config.get("高风险GPT变体", [])
+
+         if isinstance(high_risk, list):
+             if isinstance(risk_phrases, list):
+                 high_risk = high_risk + risk_phrases
+             if isinstance(gpt_variants, list):
+                 high_risk = high_risk + gpt_variants
+         elif isinstance(high_risk, dict):
+             if isinstance(risk_phrases, list):
+                 high_risk["风险句式"] = risk_phrases
+             if isinstance(gpt_variants, list):
+                 high_risk["GPT变体"] = gpt_variants
+
+         # Low-risk words (only trigger in combination with other risk words)
+         low_risk_config = config.get("低风险词_需组合", {})
+         low_risk_words = low_risk_config.get("词列表", []) if isinstance(low_risk_config, dict) else []
+
+         matcher = cls(
+             high_risk=high_risk,
+             medium_risk=config.get("中风险词", {}),
+             combo_rules=config.get("组合规则", {}),
+             **kwargs,
+         )
+         # Store the low-risk word list
+         matcher._low_risk_words = set(kw.lower() for kw in low_risk_words)
+         return matcher
+
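+     # A minimal sketch of the JSON layout that from_json reads (illustrative only;
+     # the actual risk_keywords.json used in a deployment may differ). Every key below
+     # is one that the loader above looks up:
+     #
+     #     {
+     #         "高风险词": {"说明": "...", "some-category": ["kw1", "kw2"]},
+     #         "风险句式": ["phrase1"],
+     #         "高风险GPT变体": ["variant1"],
+     #         "中风险词": {"another-category": ["kw3"]},
+     #         "低风险词_需组合": {"词列表": ["kw4"]},
+     #         "组合规则": {"rule-name": {"A组": ["kw5"], "B组": ["kw6"]}}
+     #     }
+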
+     def _find_matches(self, text_lower: str) -> list[tuple[str, int, int]]:
+         """Find all matches using Aho-Corasick."""
+         if not self._automaton:
+             return []
+         matches = []
+         for end_idx, keyword in self._automaton.iter(text_lower):
+             start_idx = end_idx - len(keyword) + 1
+             matches.append((keyword, start_idx, end_idx + 1))
+         return matches
+
+     def _split_sentences(self, text: str) -> list[tuple[str, int, int]]:
+         """Split into sentences, returning [(sentence, start, end)]."""
+         return [(s.text, s.start, s.end) for s in self._splitter.split(text)]
+
+     def _extract_segment(
+         self,
+         text: str,
+         start: int,
+         end: int,
+         sentences: list[tuple[str, int, int]] = None,
+     ) -> tuple[str, int, int]:
+         """Extract the complete sentence(s) containing the keyword."""
+         # Split here if no sentence list was passed in (kept for older callers)
+         if sentences is None:
+             sentences = self._split_sentences(text)
+
+         # Find the sentences containing the keyword
+         matched_sentences = []
+         for sent, s_start, s_end in sentences:
+             # The keyword span overlaps the sentence
+             if s_start <= start < s_end or s_start < end <= s_end:
+                 matched_sentences.append((sent, s_start, s_end))
+
+         if matched_sentences:
+             # Merge all matched sentences
+             seg_start = min(s[1] for s in matched_sentences)
+             seg_end = max(s[2] for s in matched_sentences)
+             segment = text[seg_start:seg_end].strip()
+             return segment, seg_start, seg_end
+
+         # Fallback: fixed-size window around the keyword
+         seg_start = max(0, start - self.context_chars)
+         seg_end = min(len(text), end + self.context_chars)
+         segment = text[seg_start:seg_end].strip()
+         return segment, seg_start, seg_end
+
+     def _check_combo_rules(self, text_lower: str) -> list[tuple[str, str, str]]:
+         """Check the combination rules, returning [(rule_name, group-A word, group-B word)]."""
+         hits = []
+         for rule_name, rule in self.combo_rules.items():
+             if rule_name == "说明":
+                 continue
+
+             # Several field-name variants are supported
+             a_keywords = (
+                 rule.get("A组") or
+                 rule.get("A组_角色扮演") or
+                 rule.get("A组_指令词") or
+                 []
+             )
+             b_keywords = (
+                 rule.get("B组") or
+                 rule.get("B组_规避词") or
+                 rule.get("B组_强制词") or
+                 []
+             )
+
+             a_hit = None
+             b_hit = None
+             for kw in a_keywords:
+                 if kw.lower() in text_lower:
+                     a_hit = kw
+                     break
+             for kw in b_keywords:
+                 if kw.lower() in text_lower:
+                     b_hit = kw
+                     break
+
+             if a_hit and b_hit:
+                 hits.append((rule_name, a_hit, b_hit))
+
+         return hits
+
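+     # Worked example (illustrative rule name and words): given combo_rules of
+     # {"jailbreak": {"A组": ["扮演"], "B组": ["忽略规则"]}}, a text whose lowercase form
+     # contains both "扮演" and "忽略规则" makes _check_combo_rules return
+     # [("jailbreak", "扮演", "忽略规则")]; a text containing only one of them returns [].
+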
+     def _merge_overlapping_segments(
+         self,
+         text: str,
+         matches: list[MatchResult],
+         merge_gap: int = 50,
+         min_length: int = 20,
+         max_length: int = 500,
+     ) -> list[str]:
+         """Merge overlapping or adjacent segments and filter by length."""
+         if not matches:
+             return []
+
+         # Sort by segment position
+         sorted_matches = sorted(matches, key=lambda m: m.segment_start)
+
+         merged = []
+         current_start = sorted_matches[0].segment_start
+         current_end = sorted_matches[0].segment_end
+
+         for m in sorted_matches[1:]:
+             if m.segment_start <= current_end + merge_gap:
+                 # Overlapping or adjacent: extend the current range
+                 current_end = max(current_end, m.segment_end)
+             else:
+                 # Disjoint: save the current merged segment, sliced from the original text
+                 merged.append(text[current_start:current_end].strip())
+                 current_start = m.segment_start
+                 current_end = m.segment_end
+
+         merged.append(text[current_start:current_end].strip())
+
+         # Filter by length
+         filtered = []
+         for seg in merged:
+             if len(seg) < min_length:
+                 continue
+             if len(seg) > max_length:
+                 seg = seg[:max_length] + "..."
+             filtered.append(seg)
+
+         return filtered
+
+     def _find_sentence_boundary_start(self, text: str, pos: int, max_search: int = 100) -> int:
+         """Search backwards from pos for the sentence start (just after the previous terminator)."""
+         if pos <= 0:
+             return 0
+
+         terminators = '.?!。?!\n'
+         search_start = max(0, pos - max_search)
+         for i in range(pos - 1, search_start - 1, -1):
+             if text[i] in terminators:
+                 boundary = i + 1
+                 while boundary < pos and text[boundary] in ' \t\n\r':
+                     boundary += 1
+                 return boundary
+
+         return search_start if search_start > 0 else pos
+
+     def _find_sentence_boundary_end(self, text: str, pos: int, max_search: int = 100) -> int:
+         """Search forwards from pos for the sentence end."""
+         text_len = len(text)
+         if pos >= text_len:
+             return text_len
+
+         terminators = '.?!。?!\n'
+         search_end = min(text_len, pos + max_search)
+         for i in range(pos, search_end):
+             if text[i] in terminators:
+                 return i + 1
+
+         return search_end if search_end < text_len else pos
+
+     def get_expanded_segments(
+         self,
+         text: str,
+         matches: list[MatchResult],
+         context_chars: int = 200,
+         merge_gap: int = 50,
+         min_length: int = 150,
+         max_length: int = 1000,
+         snap_to_sentence: bool = True,
+     ) -> list[str]:
+         """
+         Expand the context around matches and merge overlapping regions,
+         producing segments suitable for vector retrieval.
+
+         Args:
+             text: original text
+             matches: list of match results
+             context_chars: number of characters to expand before/after each segment
+             merge_gap: gap threshold for merging
+             min_length: minimum segment length
+             max_length: maximum segment length
+             snap_to_sentence: whether to snap boundaries to sentence edges
+         """
+         if not matches:
+             return []
+
+         # 1. Compute the expanded region for each match
+         regions = []
+         for m in matches:
+             start = max(0, m.segment_start - context_chars)
+             end = min(len(text), m.segment_end + context_chars)
+
+             if snap_to_sentence:
+                 start = self._find_sentence_boundary_start(text, start)
+                 end = self._find_sentence_boundary_end(text, end)
+
+             regions.append((start, end))
+
+         # 2. Sort by start position and merge overlapping regions
+         regions.sort()
+         merged = []
+         curr_start, curr_end = regions[0]
+
+         for start, end in regions[1:]:
+             if start <= curr_end + merge_gap:
+                 curr_end = max(curr_end, end)
+             else:
+                 merged.append((curr_start, curr_end))
+                 curr_start, curr_end = start, end
+         merged.append((curr_start, curr_end))
+
+         # 3. Extract the text and filter by length
+         segments = []
+         for start, end in merged:
+             seg = text[start:end].strip()
+             if len(seg) >= min_length:
+                 if len(seg) > max_length:
+                     if snap_to_sentence:
+                         # Prefer cutting at a sentence boundary near max_length
+                         cut_pos = self._find_sentence_boundary_end(seg, max_length - 50, max_search=50)
+                         if cut_pos > min_length:
+                             seg = seg[:cut_pos]
+                         else:
+                             seg = seg[:max_length] + "..."
+                     else:
+                         seg = seg[:max_length] + "..."
+                 segments.append(seg)
+
+         return segments
+
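+     # Typical retrieval-prep flow (illustrative): run extract_segments first,
+     # then expand its matches into retrieval-sized chunks:
+     #
+     #     result = matcher.extract_segments(text)
+     #     chunks = matcher.get_expanded_segments(text, result.matches)
+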
+     def extract_segments(
+         self,
+         text: str,
+         medium_threshold: int = 2,
+     ) -> ExtractResult:
+         """
+         Extract risk segments from the text.
+
+         Args:
+             text: input text
+             medium_threshold: hit-count threshold for medium-risk words
+
+         Returns:
+             An ExtractResult object.
+         """
+         result = ExtractResult(text=text)
+
+         if not text:
+             return result
+
+         text_lower = text.lower()
+
+         # 1. Find keyword matches
+         raw_matches = self._find_matches(text_lower)
+
+         # 2. Check the combination rules
+         combo_hits = self._check_combo_rules(text_lower)
+
+         # Return early if nothing matched at all
+         if not raw_matches and not combo_hits:
+             return result
+
+         # 3. Split sentences (only once)
+         sentences = self._split_sentences(text)
+
+         # 4. Build MatchResult objects
+         high_count = 0
+         medium_count = 0
+
+         for keyword, start, end in raw_matches:
+             level, category = self._keyword_map[keyword]
+             segment, seg_start, seg_end = self._extract_segment(text, start, end, sentences)
+
+             match = MatchResult(
+                 keyword=keyword,
+                 risk_level=level,
+                 category=category,
+                 start=start,
+                 end=end,
+                 segment=segment,
+                 segment_start=seg_start,
+                 segment_end=seg_end,
+             )
+             result.matches.append(match)
+
+             if level == "high":
+                 high_count += 1
+             else:
+                 medium_count += 1
+
+         # 5. Handle combination-rule hits
+         for rule_name, a_kw, b_kw in combo_hits:
+             a_idx = text_lower.find(a_kw.lower())
+             if a_idx >= 0:
+                 segment, seg_start, seg_end = self._extract_segment(text, a_idx, a_idx + len(a_kw), sentences)
+                 match = MatchResult(
+                     keyword=f"{a_kw} + {b_kw}",
+                     risk_level="combo",
+                     category=rule_name,
+                     start=a_idx,
+                     end=a_idx + len(a_kw),
+                     segment=segment,
+                     segment_start=seg_start,
+                     segment_end=seg_end,
+                 )
+                 result.matches.append(match)
+
+         # 6. Handle low-risk words (they only trigger alongside other risk words)
+         has_real_risk = high_count > 0 or medium_count > 0 or len(combo_hits) > 0
+
+         low_risk_matches = [kw for kw in self._low_risk_words if kw in text_lower]
+
+         if low_risk_matches and has_real_risk:
+             for kw in low_risk_matches:
+                 idx = text_lower.find(kw)
+                 if idx >= 0:
+                     segment, seg_start, seg_end = self._extract_segment(text, idx, idx + len(kw), sentences)
+                     match = MatchResult(
+                         keyword=kw,
+                         risk_level="medium",
+                         category="低风险词_已激活",
+                         start=idx,
+                         end=idx + len(kw),
+                         segment=segment,
+                         segment_start=seg_start,
+                         segment_end=seg_end,
+                     )
+                     result.matches.append(match)
+                     medium_count += 1
+
+         # 7. Decide the overall risk level
+         result.hit_high = high_count > 0
+         result.hit_medium_count = medium_count
+         result.hit_combo = len(combo_hits) > 0
+
+         if result.hit_high or result.hit_combo:
+             result.risk_level = "high"
+         elif result.hit_medium_count >= medium_threshold:
+             result.risk_level = "medium"
+         elif result.hit_medium_count > 0:
+             result.risk_level = "low"
+
+         # 8. Merge overlapping segments
+         result.segments = self._merge_overlapping_segments(text, result.matches)
+
+         return result
+
+     def batch_extract(
+         self,
+         texts: list[str],
+         medium_threshold: int = 2,
+         only_risky: bool = True,
+     ) -> list[ExtractResult]:
+         """
+         Batch extraction.
+
+         Args:
+             texts: list of texts
+             medium_threshold: hit-count threshold for medium-risk words
+             only_risky: whether to return only risky results
+         """
+         results = []
+         for text in texts:
+             r = self.extract_segments(text, medium_threshold)
+             if only_risky and r.risk_level == "none":
+                 continue
+             results.append(r)
+         return results
+
+
+ def match_and_extract(
+     text: str,
+     keywords_json: str = None,
+     context_chars: int = 150,
+ ) -> ExtractResult:
+     """
+     Convenience function: match and extract risk segments.
+
+     Args:
+         text: input text
+         keywords_json: path to the keyword-library JSON file
+         context_chars: number of context characters
+     """
+     if keywords_json is None:
+         keywords_json = Path(__file__).parent / "risk_keywords.json"
+
+     matcher = RiskMatcher.from_json(keywords_json, context_chars=context_chars)
+     return matcher.extract_segments(text)
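+
+ # End-to-end usage sketch (illustrative; assumes a risk_keywords.json next to this module):
+ #
+ #     from maque.nlp.risk_matcher import match_and_extract
+ #
+ #     result = match_and_extract("your long text...")
+ #     print(result.risk_level)   # "none" / "low" / "medium" / "high"
+ #     print(result.to_dict())    # JSON-serializable summary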