novel-writer-cli 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +103 -0
- package/agents/chapter-writer.md +142 -0
- package/agents/character-weaver.md +117 -0
- package/agents/consistency-auditor.md +85 -0
- package/agents/plot-architect.md +128 -0
- package/agents/quality-judge.md +232 -0
- package/agents/style-analyzer.md +109 -0
- package/agents/style-refiner.md +97 -0
- package/agents/summarizer.md +128 -0
- package/agents/world-builder.md +161 -0
- package/dist/__tests__/character-voice.test.js +445 -0
- package/dist/__tests__/commit-prototype-pollution.test.js +45 -0
- package/dist/__tests__/engagement.test.js +382 -0
- package/dist/__tests__/foreshadow-visibility.test.js +131 -0
- package/dist/__tests__/hook-ledger.test.js +1028 -0
- package/dist/__tests__/naming-lint.test.js +132 -0
- package/dist/__tests__/narrative-health-injection.test.js +359 -0
- package/dist/__tests__/next-step-prejudge-guardrails.test.js +325 -0
- package/dist/__tests__/next-step-title-fix.test.js +153 -0
- package/dist/__tests__/platform-profile.test.js +274 -0
- package/dist/__tests__/promise-ledger.test.js +189 -0
- package/dist/__tests__/readability-lint.test.js +209 -0
- package/dist/__tests__/text-utils.test.js +39 -0
- package/dist/__tests__/title-policy.test.js +147 -0
- package/dist/advance.js +75 -0
- package/dist/character-voice.js +805 -0
- package/dist/checkpoint.js +126 -0
- package/dist/cli.js +563 -0
- package/dist/cliche-lint.js +515 -0
- package/dist/commit.js +1460 -0
- package/dist/consistency-auditor.js +684 -0
- package/dist/engagement.js +687 -0
- package/dist/errors.js +7 -0
- package/dist/fingerprint.js +16 -0
- package/dist/foreshadow-visibility.js +214 -0
- package/dist/fs-utils.js +68 -0
- package/dist/hook-ledger.js +721 -0
- package/dist/hook-policy.js +107 -0
- package/dist/instruction-gates.js +51 -0
- package/dist/instructions.js +406 -0
- package/dist/latest-summary-loader.js +29 -0
- package/dist/lock.js +121 -0
- package/dist/naming-lint.js +531 -0
- package/dist/ner.js +73 -0
- package/dist/next-step.js +408 -0
- package/dist/novel-ask.js +270 -0
- package/dist/output.js +9 -0
- package/dist/platform-constraints.js +518 -0
- package/dist/platform-profile.js +325 -0
- package/dist/prejudge-guardrails.js +370 -0
- package/dist/project.js +40 -0
- package/dist/promise-ledger.js +723 -0
- package/dist/readability-lint.js +555 -0
- package/dist/safe-parse.js +36 -0
- package/dist/safe-path.js +29 -0
- package/dist/scoring-weights.js +290 -0
- package/dist/steps.js +60 -0
- package/dist/text-utils.js +18 -0
- package/dist/title-policy.js +251 -0
- package/dist/type-guards.js +6 -0
- package/dist/validate.js +131 -0
- package/docs/user/README.md +17 -0
- package/docs/user/guardrails.md +179 -0
- package/docs/user/interactive-gates.md +124 -0
- package/docs/user/novel-cli.md +289 -0
- package/docs/user/ops.md +123 -0
- package/docs/user/quick-start.md +97 -0
- package/docs/user/spec-system.md +166 -0
- package/docs/user/storylines.md +144 -0
- package/package.json +48 -0
- package/schemas/README.md +18 -0
- package/schemas/character-voice-drift.schema.json +135 -0
- package/schemas/character-voice-profiles.schema.json +141 -0
- package/schemas/engagement-metrics.schema.json +38 -0
- package/schemas/hook-ledger.schema.json +108 -0
- package/schemas/platform-profile.schema.json +235 -0
- package/schemas/promise-ledger.schema.json +97 -0
- package/scripts/calibrate-quality-judge.sh +91 -0
- package/scripts/compare-regression-runs.sh +86 -0
- package/scripts/lib/_common.py +131 -0
- package/scripts/lib/calibrate_quality_judge.py +312 -0
- package/scripts/lib/compare_regression_runs.py +142 -0
- package/scripts/lib/run_regression.py +621 -0
- package/scripts/lint-blacklist.sh +201 -0
- package/scripts/lint-cliche.sh +370 -0
- package/scripts/lint-readability.sh +404 -0
- package/scripts/query-foreshadow.sh +252 -0
- package/scripts/run-ner.sh +669 -0
- package/scripts/run-regression.sh +122 -0
- package/skills/cli-step/SKILL.md +158 -0
- package/skills/continue/SKILL.md +348 -0
- package/skills/continue/references/context-contracts.md +169 -0
- package/skills/continue/references/continuity-checks.md +187 -0
- package/skills/continue/references/file-protocols.md +64 -0
- package/skills/continue/references/foreshadowing.md +130 -0
- package/skills/continue/references/gate-decision.md +53 -0
- package/skills/continue/references/periodic-maintenance.md +46 -0
- package/skills/novel-writing/SKILL.md +77 -0
- package/skills/novel-writing/references/quality-rubric.md +140 -0
- package/skills/novel-writing/references/style-guide.md +145 -0
- package/skills/start/SKILL.md +458 -0
- package/skills/start/references/quality-review.md +86 -0
- package/skills/start/references/setting-update.md +44 -0
- package/skills/start/references/vol-planning.md +61 -0
- package/skills/start/references/vol-review.md +58 -0
- package/skills/status/SKILL.md +116 -0
- package/skills/status/references/sample-output.md +60 -0
- package/templates/ai-blacklist.json +79 -0
- package/templates/brief-template.md +46 -0
- package/templates/genre-weight-profiles.json +90 -0
- package/templates/novel-ask/example.answer.json +12 -0
- package/templates/novel-ask/example.question.json +51 -0
- package/templates/platform-profile.json +148 -0
- package/templates/style-profile-template.json +58 -0
- package/templates/web-novel-cliche-lint.json +41 -0
|
@@ -0,0 +1,669 @@
|
|
|
1
|
+
#!/usr/bin/env bash
#
# Deterministic-ish Chinese NER extractor (M3+ extension point).
#
# Usage:
#   run-ner.sh <chapter.md>
#
# Output:
#   stdout JSON (exit 0 on success)
#
# Exit codes:
#   0 = success (valid JSON emitted to stdout)
#   1 = validation failure (bad args, missing files)
#   2 = script exception (unexpected runtime error)
#
# Notes:
# - This script is designed to be fast and regression-friendly (stable output ordering).
# - It is NOT a perfect NER model. It emits candidates + evidence snippets for LLM verification.

set -euo pipefail

# Exactly one positional argument: the chapter markdown file to scan.
if [ "$#" -ne 1 ]; then
  echo "Usage: run-ner.sh <chapter.md>" >&2
  exit 1
fi

chapter_path="$1"

if [ ! -f "$chapter_path" ]; then
  echo "run-ner.sh: chapter file not found: $chapter_path" >&2
  exit 1
fi

# The extractor itself is Python; treat a missing interpreter as an
# environment error (exit 2), not a validation error.
if ! command -v python3 >/dev/null 2>&1; then
  echo "run-ner.sh: python3 is required but not found" >&2
  exit 2
fi

# The embedded script uses dataclasses and f-strings, so require >= 3.7.
if ! python3 -c "import sys; sys.exit(0 if sys.version_info >= (3, 7) else 1)" 2>/dev/null; then
  echo "run-ner.sh: python3 >= 3.7 is required" >&2
  exit 2
fi

# Quoted heredoc delimiter ('PY') prevents the shell from expanding anything
# inside the embedded Python source; the chapter path arrives via sys.argv[1].
python3 - "$chapter_path" <<'PY'
|
|
45
|
+
import json
|
|
46
|
+
import re
|
|
47
|
+
import sys
|
|
48
|
+
from dataclasses import dataclass, field
|
|
49
|
+
from typing import Callable, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _die(msg: str, exit_code: int) -> None:
|
|
53
|
+
sys.stderr.write(msg.rstrip() + "\n")
|
|
54
|
+
raise SystemExit(exit_code)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _truncate(s: str, limit: int = 160) -> str:
|
|
58
|
+
s = s.strip()
|
|
59
|
+
if len(s) <= limit:
|
|
60
|
+
return s
|
|
61
|
+
return s[: limit - 1] + "…"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _strip_markdown_lines(lines: Sequence[str]) -> List[Tuple[int, str]]:
|
|
65
|
+
"""
|
|
66
|
+
Best-effort strip of non-narrative markdown:
|
|
67
|
+
- code fences
|
|
68
|
+
- headings
|
|
69
|
+
- horizontal rules
|
|
70
|
+
"""
|
|
71
|
+
out: List[Tuple[int, str]] = []
|
|
72
|
+
in_fence = False
|
|
73
|
+
for idx, raw in enumerate(lines, start=1):
|
|
74
|
+
line = raw.rstrip("\n")
|
|
75
|
+
stripped = line.strip()
|
|
76
|
+
|
|
77
|
+
if stripped.startswith("```"):
|
|
78
|
+
in_fence = not in_fence
|
|
79
|
+
continue
|
|
80
|
+
if in_fence:
|
|
81
|
+
continue
|
|
82
|
+
|
|
83
|
+
if stripped.startswith("#"):
|
|
84
|
+
continue
|
|
85
|
+
if stripped in {"---", "___", "***"}:
|
|
86
|
+
continue
|
|
87
|
+
|
|
88
|
+
if not stripped:
|
|
89
|
+
continue
|
|
90
|
+
|
|
91
|
+
out.append((idx, line))
|
|
92
|
+
return out
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# ---------------------------------------------------------------------------
# Heuristic vocabularies. These are data, not logic: membership drives the
# regex/suffix heuristics in the extractors below, so edits here change
# extraction results directly.
# ---------------------------------------------------------------------------

# Characters/words that commonly terminate a Chinese place name. Used both to
# match location candidates and to veto tokens in the character-name pass.
LOCATION_SUFFIXES = [
    "城", "镇", "村", "山", "岭", "谷", "林", "森林", "原", "原野",
    "宫", "殿", "府", "楼", "阁", "寺", "观", "院", "洞", "湖",
    "海", "江", "河", "关", "门", "岛", "州", "国", "郡", "坊",
    "街", "巷", "庄", "堡", "营", "港", "岸", "崖", "狱",
]

LOCATION_PREFIX_TRIGGERS = [
    # prepositions / verbs that often prefix a location mention
    "来到", "到了", "到达", "进入", "踏入", "走进", "走入", "抵达", "赶到", "前往",
    "奔向", "穿过", "越过", "飞入", "潜入", "驶入", "闯入", "返回", "回到", "离开",
    "在", "于", "往", "向", "朝",
]

# Verbs signalling that dialogue occurs somewhere on a line (used as a weak
# speech signal for character-name scoring).
SPEECH_VERBS = [
    "说道", "问道", "答道", "笑道", "冷笑", "喝道", "低声", "轻声", "沉声", "喃喃",
    "叹道", "怒道", "喊道",
]

# Verb endings that directly follow a name in "X说道…" attribution patterns.
SPEECH_ENDINGS = [
    "说道", "问道", "答道", "笑道", "喝道", "叹道", "怒道", "喊道", "道",
]

# Optional manner modifiers that may sit between the name and the speech verb.
SPEECH_MODIFIERS = [
    "低声", "轻声", "沉声", "喃喃", "冷笑", "怒", "叹", "喝", "笑",
]

# Single-character Chinese surnames (Hundred Family Surnames style list).
# A 2–3 char token is only kept as a name candidate if it starts with one
# of these (or a compound surname below).
COMMON_SURNAMES_1: Set[str] = set(
    list(
        "赵钱孙李周吴郑王冯陈褚卫蒋沈韩杨朱秦尤许何吕施张孔曹严华金魏陶姜戚谢邹喻柏水窦章云苏潘葛奚范彭郎鲁韦昌马苗凤花方俞任袁柳酆鲍史唐费廉岑薛雷贺倪汤滕殷罗毕郝邬安常乐于时傅皮卞齐康伍余元卜顾孟平黄和穆萧尹姚邵湛汪祁毛禹狄米贝明臧计伏成戴谈宋茅庞熊纪舒屈项祝董梁杜阮蓝闵席季麻强贾路娄危江童颜郭梅盛林刁钟徐邱骆高夏蔡田樊胡凌霍虞万支柯昝管卢莫经房裘缪干解应宗丁宣贲邓郁单杭洪包诸左石崔吉龚程嵇邢滑裴陆荣翁荀羊於惠甄曲家封芮羿储靳汲邴糜松井段富巫乌焦巴弓牧隗山谷车侯宓蓬全郗班仰秋仲伊宫宁仇栾暴甘钭厉戎祖武符刘景詹束龙叶幸司韶郜黎蓟薄印宿白怀蒲邰从鄂索咸籍赖卓蔺屠蒙池乔阴欎胥能苍双闻莘党翟谭贡劳逄姬申扶堵冉宰郦雍却璩桑桂濮牛寿通边扈燕冀郏浦尚农温别庄晏柴瞿阎充慕连茹习宦艾鱼容向古易慎戈廖庾终暨居衡步都耿满弘匡国文寇广禄阙东欧殳沃利蔚越夔隆师巩厍聂晁勾敖融冷訾辛阚那简饶空曾毋沙乜养鞠须丰巢关蒯相查后荆红游竺权逯盖益桓公"
    )
)

# Two-character compound surnames.
COMMON_SURNAMES_2: Set[str] = {
    "欧阳", "司马", "上官", "诸葛", "东方", "南宫", "西门", "令狐", "皇甫", "尉迟",
    "公孙", "慕容", "长孙", "夏侯", "轩辕", "钟离", "宇文", "司徒", "司空", "太史",
    "端木", "申屠", "公羊", "澹台", "公冶", "宗政", "濮阳", "淳于", "单于", "太叔",
    "仲孙",
}

# Name-shaped tokens that are actually generic references, pronouns, or
# connectives; excluded from character candidates (and event phrases).
CHAR_STOPWORDS: Set[str] = {
    # very common generic mentions
    "主角", "众人", "众", "众妖", "众修", "人群", "大家", "所有人", "他们", "她们",
    "我们", "你们", "自己", "此时", "这一刻", "片刻", "不久", "然后", "忽然", "突然",
    "因为", "所以", "同时", "于是", "但是", "不过", "如果", "只是", "仍然", "仿佛",
    "宛如", "莫名",
}


# Relative time expressions (no absolute anchor); graded "medium" confidence
# by _confidence_for_time.
TIME_RELATIVE = [
    "翌日", "次日", "当日", "当晚", "今夜", "昨夜", "清晨", "黎明", "天明", "天亮",
    "正午", "午后", "黄昏", "傍晚", "夜里", "午夜", "半夜", "三更", "片刻后", "不久后",
    "数日后", "几日后", "三日后",
]


# Words that typically terminate an event phrase (battles, deaths, reveals…).
EVENT_TRIGGERS = [
    "爆发", "开战", "大战", "决战", "身亡", "死亡", "失踪", "现身", "出现", "突破",
    "晋升", "崩塌", "坍塌", "倒塌", "结盟", "背叛", "叛变", "揭露", "曝光", "宣布",
    "宣告",
]
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
@dataclass
class Mention:
    """A single evidence occurrence of an entity in the chapter text."""

    # 1-based line number in the original chapter file.
    line: int
    # Truncated copy of the source line (see _add_mention / _truncate).
    snippet: str
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
@dataclass
class Entity:
    """An extracted candidate entity with a coarse confidence grade and evidence."""

    # Surface form as it appears in the text.
    text: str
    # One of "high" / "medium" / "low" (assigned by the _confidence_for_* helpers).
    confidence: str
    # Capped, de-duplicated evidence mentions.
    mentions: List[Mention] = field(default_factory=list)
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def _confidence_for_time(token: str) -> str:
    """Grade a time-marker token: numeric/date-like → high, known relative word → medium, else low."""
    if re.search(r"[0-9一二三四五六七八九十百千]+(年|月|日|天|旬|更|刻)", token):
        return "high"
    return "medium" if token in TIME_RELATIVE else "low"
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def _confidence_for_location(token: str) -> str:
    """Grade a location token: >=3 chars with a known suffix → high, 【…】 bracketed → medium, else low."""
    if len(token) >= 3 and any(token.endswith(suffix) for suffix in LOCATION_SUFFIXES):
        return "high"
    if token.startswith("【") and token.endswith("】"):
        return "medium"
    return "low"
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def _confidence_for_character(name: str, freq: int, speech_hits: int) -> str:
|
|
366
|
+
if speech_hits >= 2:
|
|
367
|
+
return "high"
|
|
368
|
+
if freq >= 4:
|
|
369
|
+
return "medium"
|
|
370
|
+
return "low"
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
def _confidence_for_event(token: str) -> str:
    """Grade an event phrase: ends with a known trigger word → medium, else low."""
    for trigger in EVENT_TRIGGERS:
        if token.endswith(trigger):
            return "medium"
    return "low"
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def _add_mention(store: Dict[str, List[Mention]], key: str, line_no: int, snippet: str, cap: int = 5) -> None:
    """Record a (line, snippet) mention under *key*, skipping exact duplicates
    and keeping at most *cap* mentions per key."""
    bucket = store.setdefault(key, [])
    if len(bucket) >= cap:
        return
    short = _truncate(snippet)
    for existing in bucket:
        if existing.line == line_no and existing.snippet == short:
            return
    bucket.append(Mention(line=line_no, snippet=short))
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def _sort_entities(entities: List[Entity]) -> List[Entity]:
    """Stable ordering: mention count descending, ties broken alphabetically by text."""
    # Two stable passes: alphabetical first, then by count — equivalent to
    # a single sort keyed on (-count, text).
    alphabetical = sorted(entities, key=lambda ent: ent.text)
    return sorted(alphabetical, key=lambda ent: len(ent.mentions), reverse=True)
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
def _extract_time_markers(lines: List[Tuple[int, str]]) -> Tuple[Dict[str, int], Dict[str, List[Mention]]]:
    """Scan narrative lines for time expressions.

    Returns (counts, mentions): token -> occurrence count, and token ->
    capped evidence mentions (via _add_mention).
    """
    counts: Dict[str, int] = {}
    mentions: Dict[str, List[Mention]] = {}

    patterns = [
        # explicit year + optional season
        re.compile(r"(?:第)?[0-9一二三四五六七八九十百千]{1,4}年(?:[春夏秋冬](?:初|中|末)?)?"),
        # month/day-ish
        re.compile(r"(?:第)?[0-9一二三四五六七八九十]{1,3}(?:月|日|天|旬)"),
        # relative tokens
        re.compile(r"(" + "|".join(map(re.escape, TIME_RELATIVE)) + r")"),
    ]

    # NOTE(review): the patterns can overlap — e.g. "三日后" yields "三日" from
    # the day pattern AND "三日后" from the relative-token pattern, producing
    # two distinct counted tokens from one mention. Confirm this is intended.
    for line_no, line in lines:
        for pat in patterns:
            for m in pat.findall(line):
                # findall returns plain strings for group-less patterns and
                # tuples when the pattern has groups; normalize to the token.
                token = m if isinstance(m, str) else m[0]
                token = token.strip()
                if not token:
                    continue
                counts[token] = counts.get(token, 0) + 1
                _add_mention(mentions, token, line_no, line)

    return counts, mentions
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def _extract_locations(lines: List[Tuple[int, str]]) -> Tuple[Dict[str, int], Dict[str, List[Mention]]]:
    """Scan narrative lines for place-name candidates.

    A candidate is a CJK run ending in a LOCATION_SUFFIXES word, or a
    【bracketed】 token whose contents end in such a suffix. Movement verbs /
    prepositions that the regex may have swallowed as a prefix are stripped
    by `normalize` below. Returns (counts, mentions).
    """
    counts: Dict[str, int] = {}
    mentions: Dict[str, List[Mention]] = {}

    # Longest-suffix-first alternation so e.g. "森林" wins over "林".
    suffix_re = "|".join(sorted(map(re.escape, LOCATION_SUFFIXES), key=len, reverse=True))
    pat = re.compile(rf"([\u3400-\u9fff]{{2,10}}(?:{suffix_re}))")
    bracket_pat = re.compile(r"【([^】]{2,12})】")

    # Single-char triggers treated cautiously because they can legitimately
    # start a place name ("向阳村"). NOTE(review): "到" appears here but not as
    # a single-char entry in LOCATION_PREFIX_TRIGGERS — confirm intent.
    weak_single = {"在", "于", "到", "往", "向", "朝"}
    strict_candidate_pat = re.compile(rf"^[\u3400-\u9fff]{{2,10}}(?:{suffix_re})$")
    loose_candidate_pat = re.compile(rf"^[\u3400-\u9fff]{{1,10}}(?:{suffix_re})$")

    def normalize(token: str) -> str:
        # Drop a leading movement-verb/preposition that the broad CJK regex
        # may have captured as part of the "name".
        token = token.strip()
        if not token:
            return token

        # Strip the earliest trigger found in the token, but avoid
        # corrupting real location names that start with a preposition-like
        # character (e.g. "向阳村", "朝阳城", "于都城").
        best_pos: Optional[int] = None
        best_end: Optional[int] = None
        for trig in LOCATION_PREFIX_TRIGGERS:
            idx = token.find(trig)
            if idx == -1:
                continue
            end = idx + len(trig)
            if end >= len(token):
                # trigger sits at the very end; nothing would remain
                continue

            candidate = token[end:]
            if not candidate:
                continue

            is_weak = len(trig) == 1 and trig in weak_single
            if is_weak:
                # Only strip weak single-char triggers when the remainder still
                # looks like a >=3-char place name (>=2 chars before suffix).
                if not strict_candidate_pat.match(candidate):
                    continue
            else:
                if not loose_candidate_pat.match(candidate):
                    continue

            # Prefer the earliest trigger; on a tie, the longer match.
            if best_pos is None or idx < best_pos or (idx == best_pos and end > best_end):
                best_pos = idx
                best_end = end

        if best_end is not None and best_end < len(token):
            token = token[best_end:]

        return token

    for line_no, line in lines:
        for token in pat.findall(line):
            token = normalize(token)
            if not token:
                continue
            # normalize may have stripped down to something suffix-less; re-check.
            if not any(token.endswith(suf) for suf in LOCATION_SUFFIXES):
                continue
            counts[token] = counts.get(token, 0) + 1
            _add_mention(mentions, token, line_no, line)

        for inner in bracket_pat.findall(line):
            inner = inner.strip()
            if not inner:
                continue
            # only treat bracket tokens as location if it looks like one
            if any(inner.endswith(suf) for suf in LOCATION_SUFFIXES):
                # Brackets kept in the token so bracketed names stay distinct.
                token = f"【{inner}】"
                counts[token] = counts.get(token, 0) + 1
                _add_mention(mentions, token, line_no, line)

    return counts, mentions
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def _extract_character_candidates(lines: List[Tuple[int, str]]) -> Tuple[Dict[str, int], Dict[str, int], Dict[str, List[Mention]]]:
    """Scan narrative lines for 2–3 character person-name candidates.

    Two passes per line: (1) a high-confidence dialogue-attribution pattern
    (name + optional manner word + speech verb), which scores double; and
    (2) a broad CJK token pass filtered by surname plausibility and several
    non-name vetoes. Returns (counts, speech_hits, mentions); speech_hits
    feeds _confidence_for_character.
    """
    counts: Dict[str, int] = {}
    speech_hits: Dict[str, int] = {}
    mentions: Dict[str, List[Mention]] = {}

    # Prefer patterns like "林枫(沉声)道/说道/问道..." for high-confidence names.
    speech_suffix_re = "|".join(map(re.escape, SPEECH_ENDINGS))
    speech_mod_re = "|".join(map(re.escape, SPEECH_MODIFIERS))
    # Lookbehind anchors the name to line start or CJK punctuation / quote
    # openers, so mid-word fragments are not captured.
    speech_name_pat = re.compile(
        rf"(?:^|(?<=[,。!?;:、\s\"「『(]))([\u3400-\u9fff]{{2,3}})(?:(?:{speech_mod_re}))?(?:{speech_suffix_re})"
    )

    token_pat = re.compile(r"([\u3400-\u9fff]{2,3})")

    for line_no, line in lines:
        # high-confidence: name + speech verb patterns
        for token in speech_name_pat.findall(line):
            token = token.strip()
            if not token or token in CHAR_STOPWORDS:
                continue
            # Attribution hits count double in both frequency and speech score.
            counts[token] = counts.get(token, 0) + 2
            speech_hits[token] = speech_hits.get(token, 0) + 2
            _add_mention(mentions, token, line_no, line)

        # Weak speech signal: a speech verb anywhere, or curly-quoted dialogue.
        has_speech = any(v in line for v in SPEECH_VERBS) or ("“" in line and "”" in line)
        for token in token_pat.findall(line):
            if token in CHAR_STOPWORDS:
                continue
            # Veto tokens that look like places, seasons, or speech fragments.
            if any(token.endswith(suf) for suf in LOCATION_SUFFIXES):
                continue
            if re.match(r"^[春夏秋冬][初中末]?$", token) or re.match(r"^(?:初|仲|暮|孟)[春夏秋冬]$", token):
                continue
            if token.endswith("道") and token not in {"道长"}:
                continue
            if token[-1] in {"低", "轻", "沉", "喃", "冷", "怒", "叹", "喝", "笑"}:
                continue

            # surname heuristic: reduce noise from arbitrary 2-3 char phrases
            if len(token) == 2 and token[0] not in COMMON_SURNAMES_1:
                continue
            if len(token) == 3 and token[:2] not in COMMON_SURNAMES_2 and token[0] not in COMMON_SURNAMES_1:
                continue

            counts[token] = counts.get(token, 0) + 1
            if has_speech:
                speech_hits[token] = speech_hits.get(token, 0) + 1
            _add_mention(mentions, token, line_no, line)

    # filter: require min frequency
    kept = {k for k, v in counts.items() if v >= 2 or speech_hits.get(k, 0) >= 1}
    counts = {k: counts[k] for k in kept}
    # keep mentions only for kept tokens
    mentions = {k: mentions[k] for k in kept if k in mentions}
    speech_hits = {k: speech_hits.get(k, 0) for k in kept}

    return counts, speech_hits, mentions
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
def _extract_events(lines: List[Tuple[int, str]]) -> Tuple[Dict[str, int], Dict[str, List[Mention]]]:
    """Collect event-phrase candidates: a short CJK run ending in an event
    trigger word. Returns (counts, mentions)."""
    counts: Dict[str, int] = {}
    mentions: Dict[str, List[Mention]] = {}

    alternation = "|".join(re.escape(t) for t in EVENT_TRIGGERS)
    event_pat = re.compile(rf"([\u3400-\u9fff]{{2,8}}(?:{alternation}))")

    for line_no, text in lines:
        for raw_token in event_pat.findall(text):
            token = raw_token.strip()
            if not token or token in CHAR_STOPWORDS:
                continue
            counts[token] = counts.get(token, 0) + 1
            _add_mention(mentions, token, line_no, text)

    # Drop pathologically long phrases to keep the output from getting noisy.
    counts = {token: n for token, n in counts.items() if len(token) <= 18}
    mentions = {token: mentions[token] for token in counts if token in mentions}
    return counts, mentions
|
|
575
|
+
|
|
576
|
+
|
|
577
|
+
def _build_entities(
    counts: Dict[str, int],
    mentions: Dict[str, List[Mention]],
    confidence_fn: Union[Callable[[str], str], Callable[[str, int, int], str]],
    extra: Optional[Dict[str, int]] = None,
    limit: int = 30,
) -> List[Entity]:
    """Convert count/mention maps into at most *limit* Entity records.

    confidence_fn takes just the token when *extra* is None, or
    (token, count, extra_score) when *extra* is supplied. Output is stably
    ordered via _sort_entities.
    """
    # Rank by frequency descending, ties alphabetically, and keep the top slice.
    ranked = sorted(counts.items(), key=lambda kv: (-kv[1], kv[0]))
    entities: List[Entity] = []
    for text, _ in ranked[:limit]:
        if extra is None:
            conf = confidence_fn(text)
        else:
            conf = confidence_fn(text, counts[text], extra.get(text, 0))
        entities.append(Entity(text=text, confidence=conf, mentions=mentions.get(text, [])))
    return _sort_entities(entities)
|
|
593
|
+
|
|
594
|
+
|
|
595
|
+
def main() -> None:
    """Read the chapter file, run all extractors, and emit one JSON object to stdout."""
    chapter_path = sys.argv[1]

    try:
        # utf-8-sig transparently strips a BOM if the file has one.
        with open(chapter_path, "r", encoding="utf-8-sig") as f:
            raw = f.read()
    except Exception as e:
        # Unreadable input counts as a validation failure (exit 1).
        _die(f"run-ner.sh: failed to read chapter: {e}", 1)

    raw_lines = raw.splitlines()
    narrative_lines = _strip_markdown_lines(raw_lines)

    time_counts, time_mentions = _extract_time_markers(narrative_lines)
    loc_counts, loc_mentions = _extract_locations(narrative_lines)
    char_counts, char_speech_hits, char_mentions = _extract_character_candidates(narrative_lines)
    event_counts, event_mentions = _extract_events(narrative_lines)

    characters = _build_entities(char_counts, char_mentions, _confidence_for_character, extra=char_speech_hits, limit=30)
    locations = _build_entities(loc_counts, loc_mentions, _confidence_for_location, limit=30)
    time_markers = _build_entities(time_counts, time_mentions, _confidence_for_time, limit=20)
    events = _build_entities(event_counts, event_mentions, _confidence_for_event, limit=20)

    out = {
        "schema_version": 1,
        "chapter_path": chapter_path,
        "entities": {
            "characters": [
                {
                    "text": e.text,
                    # slug_id emitted as None here — presumably filled by a
                    # later identity-resolution step; confirm downstream.
                    "slug_id": None,
                    "confidence": e.confidence,
                    "mentions": [{"line": m.line, "snippet": m.snippet} for m in e.mentions],
                }
                for e in characters
            ],
            "locations": [
                {
                    "text": e.text,
                    "confidence": e.confidence,
                    "mentions": [{"line": m.line, "snippet": m.snippet} for m in e.mentions],
                }
                for e in locations
            ],
            "time_markers": [
                {
                    "text": e.text,
                    # TODO: implement actual time normalization; currently identity mapping
                    "normalized": e.text,
                    "confidence": e.confidence,
                    "mentions": [{"line": m.line, "snippet": m.snippet} for m in e.mentions],
                }
                for e in time_markers
            ],
            "events": [
                {
                    "text": e.text,
                    "confidence": e.confidence,
                    "mentions": [{"line": m.line, "snippet": m.snippet} for m in e.mentions],
                }
                for e in events
            ],
        },
    }

    # ensure_ascii=False keeps CJK text readable in the emitted JSON.
    sys.stdout.write(json.dumps(out, ensure_ascii=False) + "\n")
|
|
660
|
+
|
|
661
|
+
|
|
662
|
+
try:
    main()
except SystemExit:
    # Preserve deliberate exit codes from _die / argument validation.
    raise
except Exception as e:
    # Any unexpected failure maps to exit code 2 (script exception),
    # matching the contract documented in the shell wrapper's header.
    sys.stderr.write(f"run-ner.sh: unexpected error: {e}\n")
    raise SystemExit(2)
|
|
669
|
+
PY
|