novel-writer-cli 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +103 -0
  3. package/agents/chapter-writer.md +142 -0
  4. package/agents/character-weaver.md +117 -0
  5. package/agents/consistency-auditor.md +85 -0
  6. package/agents/plot-architect.md +128 -0
  7. package/agents/quality-judge.md +232 -0
  8. package/agents/style-analyzer.md +109 -0
  9. package/agents/style-refiner.md +97 -0
  10. package/agents/summarizer.md +128 -0
  11. package/agents/world-builder.md +161 -0
  12. package/dist/__tests__/character-voice.test.js +445 -0
  13. package/dist/__tests__/commit-prototype-pollution.test.js +45 -0
  14. package/dist/__tests__/engagement.test.js +382 -0
  15. package/dist/__tests__/foreshadow-visibility.test.js +131 -0
  16. package/dist/__tests__/hook-ledger.test.js +1028 -0
  17. package/dist/__tests__/naming-lint.test.js +132 -0
  18. package/dist/__tests__/narrative-health-injection.test.js +359 -0
  19. package/dist/__tests__/next-step-prejudge-guardrails.test.js +325 -0
  20. package/dist/__tests__/next-step-title-fix.test.js +153 -0
  21. package/dist/__tests__/platform-profile.test.js +274 -0
  22. package/dist/__tests__/promise-ledger.test.js +189 -0
  23. package/dist/__tests__/readability-lint.test.js +209 -0
  24. package/dist/__tests__/text-utils.test.js +39 -0
  25. package/dist/__tests__/title-policy.test.js +147 -0
  26. package/dist/advance.js +75 -0
  27. package/dist/character-voice.js +805 -0
  28. package/dist/checkpoint.js +126 -0
  29. package/dist/cli.js +563 -0
  30. package/dist/cliche-lint.js +515 -0
  31. package/dist/commit.js +1460 -0
  32. package/dist/consistency-auditor.js +684 -0
  33. package/dist/engagement.js +687 -0
  34. package/dist/errors.js +7 -0
  35. package/dist/fingerprint.js +16 -0
  36. package/dist/foreshadow-visibility.js +214 -0
  37. package/dist/fs-utils.js +68 -0
  38. package/dist/hook-ledger.js +721 -0
  39. package/dist/hook-policy.js +107 -0
  40. package/dist/instruction-gates.js +51 -0
  41. package/dist/instructions.js +406 -0
  42. package/dist/latest-summary-loader.js +29 -0
  43. package/dist/lock.js +121 -0
  44. package/dist/naming-lint.js +531 -0
  45. package/dist/ner.js +73 -0
  46. package/dist/next-step.js +408 -0
  47. package/dist/novel-ask.js +270 -0
  48. package/dist/output.js +9 -0
  49. package/dist/platform-constraints.js +518 -0
  50. package/dist/platform-profile.js +325 -0
  51. package/dist/prejudge-guardrails.js +370 -0
  52. package/dist/project.js +40 -0
  53. package/dist/promise-ledger.js +723 -0
  54. package/dist/readability-lint.js +555 -0
  55. package/dist/safe-parse.js +36 -0
  56. package/dist/safe-path.js +29 -0
  57. package/dist/scoring-weights.js +290 -0
  58. package/dist/steps.js +60 -0
  59. package/dist/text-utils.js +18 -0
  60. package/dist/title-policy.js +251 -0
  61. package/dist/type-guards.js +6 -0
  62. package/dist/validate.js +131 -0
  63. package/docs/user/README.md +17 -0
  64. package/docs/user/guardrails.md +179 -0
  65. package/docs/user/interactive-gates.md +124 -0
  66. package/docs/user/novel-cli.md +289 -0
  67. package/docs/user/ops.md +123 -0
  68. package/docs/user/quick-start.md +97 -0
  69. package/docs/user/spec-system.md +166 -0
  70. package/docs/user/storylines.md +144 -0
  71. package/package.json +48 -0
  72. package/schemas/README.md +18 -0
  73. package/schemas/character-voice-drift.schema.json +135 -0
  74. package/schemas/character-voice-profiles.schema.json +141 -0
  75. package/schemas/engagement-metrics.schema.json +38 -0
  76. package/schemas/hook-ledger.schema.json +108 -0
  77. package/schemas/platform-profile.schema.json +235 -0
  78. package/schemas/promise-ledger.schema.json +97 -0
  79. package/scripts/calibrate-quality-judge.sh +91 -0
  80. package/scripts/compare-regression-runs.sh +86 -0
  81. package/scripts/lib/_common.py +131 -0
  82. package/scripts/lib/calibrate_quality_judge.py +312 -0
  83. package/scripts/lib/compare_regression_runs.py +142 -0
  84. package/scripts/lib/run_regression.py +621 -0
  85. package/scripts/lint-blacklist.sh +201 -0
  86. package/scripts/lint-cliche.sh +370 -0
  87. package/scripts/lint-readability.sh +404 -0
  88. package/scripts/query-foreshadow.sh +252 -0
  89. package/scripts/run-ner.sh +669 -0
  90. package/scripts/run-regression.sh +122 -0
  91. package/skills/cli-step/SKILL.md +158 -0
  92. package/skills/continue/SKILL.md +348 -0
  93. package/skills/continue/references/context-contracts.md +169 -0
  94. package/skills/continue/references/continuity-checks.md +187 -0
  95. package/skills/continue/references/file-protocols.md +64 -0
  96. package/skills/continue/references/foreshadowing.md +130 -0
  97. package/skills/continue/references/gate-decision.md +53 -0
  98. package/skills/continue/references/periodic-maintenance.md +46 -0
  99. package/skills/novel-writing/SKILL.md +77 -0
  100. package/skills/novel-writing/references/quality-rubric.md +140 -0
  101. package/skills/novel-writing/references/style-guide.md +145 -0
  102. package/skills/start/SKILL.md +458 -0
  103. package/skills/start/references/quality-review.md +86 -0
  104. package/skills/start/references/setting-update.md +44 -0
  105. package/skills/start/references/vol-planning.md +61 -0
  106. package/skills/start/references/vol-review.md +58 -0
  107. package/skills/status/SKILL.md +116 -0
  108. package/skills/status/references/sample-output.md +60 -0
  109. package/templates/ai-blacklist.json +79 -0
  110. package/templates/brief-template.md +46 -0
  111. package/templates/genre-weight-profiles.json +90 -0
  112. package/templates/novel-ask/example.answer.json +12 -0
  113. package/templates/novel-ask/example.question.json +51 -0
  114. package/templates/platform-profile.json +148 -0
  115. package/templates/style-profile-template.json +58 -0
  116. package/templates/web-novel-cliche-lint.json +41 -0
@@ -0,0 +1,669 @@
1
#!/usr/bin/env bash
#
# Deterministic-ish Chinese NER extractor (M3+ extension point).
#
# Usage:
#   run-ner.sh <chapter.md>
#
# Output:
#   stdout JSON (exit 0 on success)
#
# Exit codes:
#   0 = success (valid JSON emitted to stdout)
#   1 = validation failure (bad args, missing files)
#   2 = script exception (unexpected runtime error)
#
# Notes:
# - This script is designed to be fast and regression-friendly (stable output ordering).
# - It is NOT a perfect NER model. It emits candidates + evidence snippets for LLM verification.

set -euo pipefail

# Exactly one positional argument: the chapter markdown file to analyse.
if [ "$#" -ne 1 ]; then
    echo "Usage: run-ner.sh <chapter.md>" >&2
    exit 1
fi

chapter_path="$1"

if [ ! -f "$chapter_path" ]; then
    echo "run-ner.sh: chapter file not found: $chapter_path" >&2
    exit 1
fi

# The extractor itself is the Python program in the heredoc below.
if ! command -v python3 >/dev/null 2>&1; then
    echo "run-ner.sh: python3 is required but not found" >&2
    exit 2
fi

# The embedded program uses dataclasses, which require Python >= 3.7.
if ! python3 -c "import sys; sys.exit(0 if sys.version_info >= (3, 7) else 1)" 2>/dev/null; then
    echo "run-ner.sh: python3 >= 3.7 is required" >&2
    exit 2
fi

# Quoted 'PY' delimiter: no shell expansion happens inside the heredoc.
# The chapter path is handed to the Python program as argv[1].
python3 - "$chapter_path" <<'PY'
45
+ import json
46
+ import re
47
+ import sys
48
+ from dataclasses import dataclass, field
49
+ from typing import Callable, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
50
+
51
+
52
+ def _die(msg: str, exit_code: int) -> None:
53
+ sys.stderr.write(msg.rstrip() + "\n")
54
+ raise SystemExit(exit_code)
55
+
56
+
57
+ def _truncate(s: str, limit: int = 160) -> str:
58
+ s = s.strip()
59
+ if len(s) <= limit:
60
+ return s
61
+ return s[: limit - 1] + "…"
62
+
63
+
64
+ def _strip_markdown_lines(lines: Sequence[str]) -> List[Tuple[int, str]]:
65
+ """
66
+ Best-effort strip of non-narrative markdown:
67
+ - code fences
68
+ - headings
69
+ - horizontal rules
70
+ """
71
+ out: List[Tuple[int, str]] = []
72
+ in_fence = False
73
+ for idx, raw in enumerate(lines, start=1):
74
+ line = raw.rstrip("\n")
75
+ stripped = line.strip()
76
+
77
+ if stripped.startswith("```"):
78
+ in_fence = not in_fence
79
+ continue
80
+ if in_fence:
81
+ continue
82
+
83
+ if stripped.startswith("#"):
84
+ continue
85
+ if stripped in {"---", "___", "***"}:
86
+ continue
87
+
88
+ if not stripped:
89
+ continue
90
+
91
+ out.append((idx, line))
92
+ return out
93
+
94
+
95
# --- Heuristic vocabularies -------------------------------------------------
# All containers below feed the extractors; list ordering is deliberate and
# must stay stable because the script promises regression-friendly output.

# Characters/words that commonly terminate a Chinese place name
# (e.g. 城 city, 山 mountain, 府 manor).
LOCATION_SUFFIXES = [
    "城", "镇", "村", "山", "岭", "谷", "林", "森林", "原", "原野",
    "宫", "殿", "府", "楼", "阁", "寺", "观", "院", "洞", "湖",
    "海", "江", "河", "关", "门", "岛", "州", "国", "郡", "坊",
    "街", "巷", "庄", "堡", "营", "港", "岸", "崖", "狱",
]

LOCATION_PREFIX_TRIGGERS = [
    # prepositions / verbs that often prefix a location mention
    "来到", "到了", "到达", "进入", "踏入", "走进", "走入", "抵达", "赶到", "前往",
    "奔向", "穿过", "越过", "飞入", "潜入", "驶入", "闯入", "返回", "回到", "离开",
    "在", "于", "往", "向", "朝",
]

# Speech/attribution verbs used as line-level dialogue evidence.
SPEECH_VERBS = [
    "说道", "问道", "答道", "笑道", "冷笑", "喝道", "低声", "轻声", "沉声",
    "喃喃", "叹道", "怒道", "喊道",
]

# Verb endings that may directly follow a speaker name (e.g. "张三说道").
SPEECH_ENDINGS = [
    "说道", "问道", "答道", "笑道", "喝道", "叹道", "怒道", "喊道", "道",
]

# Manner modifiers that may sit between a name and a speech ending
# (e.g. "张三沉声道").
SPEECH_MODIFIERS = [
    "低声", "轻声", "沉声", "喃喃", "冷笑", "怒", "叹", "喝", "笑",
]

# Common single-character Chinese surnames (Hundred Family Surnames order);
# used to filter arbitrary 2-3 char runs down to plausible person names.
COMMON_SURNAMES_1: Set[str] = set(
    list(
        "赵钱孙李周吴郑王冯陈褚卫蒋沈韩杨朱秦尤许何吕施张孔曹严华金魏陶姜戚谢邹喻柏水窦章云苏潘葛奚范彭郎鲁韦昌马苗凤花方俞任袁柳酆鲍史唐费廉岑薛雷贺倪汤滕殷罗毕郝邬安常乐于时傅皮卞齐康伍余元卜顾孟平黄和穆萧尹姚邵湛汪祁毛禹狄米贝明臧计伏成戴谈宋茅庞熊纪舒屈项祝董梁杜阮蓝闵席季麻强贾路娄危江童颜郭梅盛林刁钟徐邱骆高夏蔡田樊胡凌霍虞万支柯昝管卢莫经房裘缪干解应宗丁宣贲邓郁单杭洪包诸左石崔吉龚程嵇邢滑裴陆荣翁荀羊於惠甄曲家封芮羿储靳汲邴糜松井段富巫乌焦巴弓牧隗山谷车侯宓蓬全郗班仰秋仲伊宫宁仇栾暴甘钭厉戎祖武符刘景詹束龙叶幸司韶郜黎蓟薄印宿白怀蒲邰从鄂索咸籍赖卓蔺屠蒙池乔阴欎胥能苍双闻莘党翟谭贡劳逄姬申扶堵冉宰郦雍却璩桑桂濮牛寿通边扈燕冀郏浦尚农温别庄晏柴瞿阎充慕连茹习宦艾鱼容向古易慎戈廖庾终暨居衡步都耿满弘匡国文寇广禄阙东欧殳沃利蔚越夔隆师巩厍聂晁勾敖融冷訾辛阚那简饶空曾毋沙乜养鞠须丰巢关蒯相查后荆红游竺权逯盖益桓公"
    )
)

# Common two-character (compound) surnames.
COMMON_SURNAMES_2: Set[str] = {
    "欧阳", "司马", "上官", "诸葛", "东方", "南宫", "西门", "令狐", "皇甫", "尉迟",
    "公孙", "慕容", "长孙", "夏侯", "轩辕", "钟离", "宇文", "司徒", "司空", "太史",
    "端木", "申屠", "公羊", "澹台", "公冶", "宗政", "濮阳", "淳于", "单于", "太叔",
    "仲孙",
}

# Tokens that match the name patterns but are never character names.
CHAR_STOPWORDS: Set[str] = {
    # very common generic mentions
    "主角", "众人", "众", "众妖", "众修", "人群", "大家", "所有人",
    "他们", "她们", "我们", "你们", "自己",
    # connective / temporal filler that the 2-3 char pattern also matches
    "此时", "这一刻", "片刻", "不久", "然后", "忽然", "突然", "因为",
    "所以", "同时", "于是", "但是", "不过", "如果", "只是", "仍然",
    "仿佛", "宛如", "莫名",
}


# Relative time expressions recognized by the time-marker extractor
# ("next day", "at dawn", "three days later", ...).
TIME_RELATIVE = [
    "翌日", "次日", "当日", "当晚", "今夜", "昨夜", "清晨", "黎明", "天明",
    "天亮", "正午", "午后", "黄昏", "傍晚", "夜里", "午夜", "半夜", "三更",
    "片刻后", "不久后", "数日后", "几日后", "三日后",
]


# Verbs that commonly terminate an "event" phrase (war breaks out, death,
# breakthrough, betrayal, announcement, ...).
EVENT_TRIGGERS = [
    "爆发", "开战", "大战", "决战", "身亡", "死亡", "失踪", "现身", "出现",
    "突破", "晋升", "崩塌", "坍塌", "倒塌", "结盟", "背叛", "叛变", "揭露",
    "曝光", "宣布", "宣告",
]
333
+
334
+
335
@dataclass
class Mention:
    # 1-based line number in the original chapter file.
    line: int
    # Truncated evidence snippet of the line containing the entity.
    snippet: str
339
+
340
+
341
@dataclass
class Entity:
    # Surface form of the extracted entity candidate.
    text: str
    # Heuristic confidence bucket: "high" | "medium" | "low".
    confidence: str
    # Evidence mentions (capped elsewhere to keep output small).
    mentions: List[Mention] = field(default_factory=list)
346
+
347
+
348
def _confidence_for_time(token: str) -> str:
    """Bucket a time-marker candidate; numeric date-like tokens rank highest."""
    numeric_date = re.search(r"[0-9一二三四五六七八九十百千]+(年|月|日|天|旬|更|刻)", token)
    if numeric_date:
        return "high"
    # Known relative markers ("next day", "at dawn", ...) are decent evidence.
    return "medium" if token in TIME_RELATIVE else "low"
354
+
355
+
356
def _confidence_for_location(token: str) -> str:
    """Bucket a location candidate by how place-name-like it looks."""
    # A known location suffix on a name of >= 3 chars is strong evidence.
    has_suffix = any(token.endswith(suffix) for suffix in LOCATION_SUFFIXES)
    if has_suffix and len(token) >= 3:
        return "high"
    # Bracketed tokens (【...】) are author-marked proper nouns.
    if token.startswith("【") and token.endswith("】"):
        return "medium"
    return "low"
363
+
364
+
365
+ def _confidence_for_character(name: str, freq: int, speech_hits: int) -> str:
366
+ if speech_hits >= 2:
367
+ return "high"
368
+ if freq >= 4:
369
+ return "medium"
370
+ return "low"
371
+
372
+
373
def _confidence_for_event(token: str) -> str:
    """Bucket an event candidate; events never exceed medium confidence."""
    for trigger in EVENT_TRIGGERS:
        if token.endswith(trigger):
            return "medium"
    return "low"
377
+
378
+
379
def _add_mention(store: Dict[str, List[Mention]], key: str, line_no: int, snippet: str, cap: int = 5) -> None:
    """Record one evidence mention for *key*, keeping at most *cap* unique entries."""
    existing = store.setdefault(key, [])
    if len(existing) >= cap:
        # Evidence lists are capped to keep output compact and deterministic.
        return
    clipped = _truncate(snippet)
    for mention in existing:
        if mention.line == line_no and mention.snippet == clipped:
            return  # skip exact duplicates
    existing.append(Mention(line=line_no, snippet=clipped))
387
+
388
+
389
def _sort_entities(entities: List[Entity]) -> List[Entity]:
    """Stable ordering: most-mentioned entities first, ties broken by text."""
    def sort_key(entity: Entity) -> Tuple[int, str]:
        return (-len(entity.mentions), entity.text)

    return sorted(entities, key=sort_key)
392
+
393
+
394
def _extract_time_markers(lines: List[Tuple[int, str]]) -> Tuple[Dict[str, int], Dict[str, List[Mention]]]:
    """Scan narrative lines for time-marker candidates.

    Returns (token -> occurrence count, token -> evidence mentions).
    """
    counts: Dict[str, int] = {}
    mentions: Dict[str, List[Mention]] = {}

    relative_alt = "|".join(map(re.escape, TIME_RELATIVE))
    patterns = (
        # explicit year + optional season
        re.compile(r"(?:第)?[0-9一二三四五六七八九十百千]{1,4}年(?:[春夏秋冬](?:初|中|末)?)?"),
        # month/day-ish
        re.compile(r"(?:第)?[0-9一二三四五六七八九十]{1,3}(?:月|日|天|旬)"),
        # relative tokens
        re.compile(r"(" + relative_alt + r")"),
    )

    for line_no, text in lines:
        for pattern in patterns:
            for match in pattern.findall(text):
                # findall yields plain strings for group-less patterns and the
                # group string for the single-group relative pattern.
                token = (match if isinstance(match, str) else match[0]).strip()
                if not token:
                    continue
                counts[token] = counts.get(token, 0) + 1
                _add_mention(mentions, token, line_no, text)

    return counts, mentions
418
+
419
+
420
def _extract_locations(lines: List[Tuple[int, str]]) -> Tuple[Dict[str, int], Dict[str, List[Mention]]]:
    """Scan narrative lines for location candidates.

    Two sources: CJK runs ending in a known location suffix, and bracketed
    tokens (【...】) whose inner text also ends in a location suffix.
    Returns (token -> occurrence count, token -> evidence mentions).
    """
    counts: Dict[str, int] = {}
    mentions: Dict[str, List[Mention]] = {}

    # Longest suffix first so e.g. "森林" wins over "林" in the alternation.
    suffix_re = "|".join(sorted(map(re.escape, LOCATION_SUFFIXES), key=len, reverse=True))
    pat = re.compile(rf"([\u3400-\u9fff]{{2,10}}(?:{suffix_re}))")
    bracket_pat = re.compile(r"【([^】]{2,12})】")

    # Single-char triggers that are also plausible first characters of real
    # place names; they are only stripped under the stricter remainder check.
    weak_single = {"在", "于", "到", "往", "向", "朝"}
    strict_candidate_pat = re.compile(rf"^[\u3400-\u9fff]{{2,10}}(?:{suffix_re})$")
    loose_candidate_pat = re.compile(rf"^[\u3400-\u9fff]{{1,10}}(?:{suffix_re})$")

    def normalize(token: str) -> str:
        # Remove a leading movement verb / preposition that the broad regex
        # swallowed together with the place name (e.g. "来到青云城" -> "青云城").
        token = token.strip()
        if not token:
            return token

        # Strip the earliest trigger found in the token, but avoid
        # corrupting real location names that start with a preposition-like
        # character (e.g. "向阳村", "朝阳城", "于都城").
        best_pos: Optional[int] = None
        best_end: Optional[int] = None
        for trig in LOCATION_PREFIX_TRIGGERS:
            idx = token.find(trig)
            if idx == -1:
                continue
            end = idx + len(trig)
            if end >= len(token):
                # Trigger sits at the very end; nothing would remain.
                continue

            candidate = token[end:]
            if not candidate:
                continue

            is_weak = len(trig) == 1 and trig in weak_single
            if is_weak:
                # Only strip weak single-char triggers when the remainder still
                # looks like a >=3-char place name (>=2 chars before suffix).
                if not strict_candidate_pat.match(candidate):
                    continue
            else:
                if not loose_candidate_pat.match(candidate):
                    continue

            # Prefer the earliest trigger; on a tie, the longer strip wins.
            if best_pos is None or idx < best_pos or (idx == best_pos and end > best_end):
                best_pos = idx
                best_end = end

        if best_end is not None and best_end < len(token):
            token = token[best_end:]

        return token

    for line_no, line in lines:
        for token in pat.findall(line):
            token = normalize(token)
            if not token:
                continue
            # Defensive: only count tokens that still carry a location suffix.
            if not any(token.endswith(suf) for suf in LOCATION_SUFFIXES):
                continue
            counts[token] = counts.get(token, 0) + 1
            _add_mention(mentions, token, line_no, line)

        for inner in bracket_pat.findall(line):
            inner = inner.strip()
            if not inner:
                continue
            # only treat bracket tokens as location if it looks like one
            if any(inner.endswith(suf) for suf in LOCATION_SUFFIXES):
                # Brackets are kept in the token so it stays distinguishable
                # from plain-text matches of the same name.
                token = f"【{inner}】"
                counts[token] = counts.get(token, 0) + 1
                _add_mention(mentions, token, line_no, line)

    return counts, mentions
494
+
495
+
496
def _extract_character_candidates(lines: List[Tuple[int, str]]) -> Tuple[Dict[str, int], Dict[str, int], Dict[str, List[Mention]]]:
    """Scan narrative lines for character-name candidates.

    Returns (counts, speech_hits, mentions). *speech_hits* records how often a
    token co-occurred with dialogue evidence; it feeds confidence scoring and
    the final keep-filter below.
    """
    counts: Dict[str, int] = {}
    speech_hits: Dict[str, int] = {}
    mentions: Dict[str, List[Mention]] = {}

    # Prefer patterns like "林枫(沉声)道/说道/问道..." for high-confidence names.
    speech_suffix_re = "|".join(map(re.escape, SPEECH_ENDINGS))
    speech_mod_re = "|".join(map(re.escape, SPEECH_MODIFIERS))
    speech_name_pat = re.compile(
        rf"(?:^|(?<=[,。!?;:、\s\"「『(]))([\u3400-\u9fff]{{2,3}})(?:(?:{speech_mod_re}))?(?:{speech_suffix_re})"
    )

    # Fallback: any 2-3 char CJK run, heavily filtered below.
    token_pat = re.compile(r"([\u3400-\u9fff]{2,3})")

    for line_no, line in lines:
        # high-confidence: name + speech verb patterns
        for token in speech_name_pat.findall(line):
            token = token.strip()
            if not token or token in CHAR_STOPWORDS:
                continue
            # Double weight: speech attribution is the strongest name signal.
            counts[token] = counts.get(token, 0) + 2
            speech_hits[token] = speech_hits.get(token, 0) + 2
            _add_mention(mentions, token, line_no, line)

        # Line-level dialogue evidence: a speech verb or a quoted span.
        has_speech = any(v in line for v in SPEECH_VERBS) or ("“" in line and "”" in line)
        for token in token_pat.findall(line):
            if token in CHAR_STOPWORDS:
                continue
            # Looks like a place, not a person.
            if any(token.endswith(suf) for suf in LOCATION_SUFFIXES):
                continue
            # Season words ("春初", "仲夏", ...) are not names.
            if re.match(r"^[春夏秋冬][初中末]?$", token) or re.match(r"^(?:初|仲|暮|孟)[春夏秋冬]$", token):
                continue
            # Fragments ending in the generic speech verb "道" are not names.
            if token.endswith("道") and token not in {"道长"}:
                continue
            # Fragments ending in a speech-modifier character are not names.
            if token[-1] in {"低", "轻", "沉", "喃", "冷", "怒", "叹", "喝", "笑"}:
                continue

            # surname heuristic: reduce noise from arbitrary 2-3 char phrases
            if len(token) == 2 and token[0] not in COMMON_SURNAMES_1:
                continue
            if len(token) == 3 and token[:2] not in COMMON_SURNAMES_2 and token[0] not in COMMON_SURNAMES_1:
                continue

            counts[token] = counts.get(token, 0) + 1
            if has_speech:
                speech_hits[token] = speech_hits.get(token, 0) + 1
            _add_mention(mentions, token, line_no, line)

    # filter: require min frequency (or at least one dialogue co-occurrence)
    kept = {k for k, v in counts.items() if v >= 2 or speech_hits.get(k, 0) >= 1}
    counts = {k: counts[k] for k in kept}
    # keep mentions only for kept tokens
    mentions = {k: mentions[k] for k in kept if k in mentions}
    speech_hits = {k: speech_hits.get(k, 0) for k in kept}

    return counts, speech_hits, mentions
552
+
553
+
554
def _extract_events(lines: List[Tuple[int, str]]) -> Tuple[Dict[str, int], Dict[str, List[Mention]]]:
    """Scan narrative lines for event candidates.

    An event candidate is a short CJK phrase ending in a known event trigger.
    Returns (token -> occurrence count, token -> evidence mentions).
    """
    counts: Dict[str, int] = {}
    mentions: Dict[str, List[Mention]] = {}

    trigger_alt = "|".join(map(re.escape, EVENT_TRIGGERS))
    event_pat = re.compile(rf"([\u3400-\u9fff]{{2,8}}(?:{trigger_alt}))")

    for line_no, text in lines:
        for raw_token in event_pat.findall(text):
            token = raw_token.strip()
            if not token or token in CHAR_STOPWORDS:
                continue
            counts[token] = counts.get(token, 0) + 1
            _add_mention(mentions, token, line_no, text)

    # limit extremely noisy outputs
    counts = {token: n for token, n in counts.items() if len(token) <= 18}
    mentions = {token: mentions[token] for token in counts if token in mentions}
    return counts, mentions
575
+
576
+
577
def _build_entities(
    counts: Dict[str, int],
    mentions: Dict[str, List[Mention]],
    confidence_fn: Union[Callable[[str], str], Callable[[str, int, int], str]],
    extra: Optional[Dict[str, int]] = None,
    limit: int = 30,
) -> List[Entity]:
    """Turn raw count/mention maps into the top-*limit* list of Entity objects.

    *confidence_fn* is unary (text) by default; passing *extra* (e.g. a
    speech-hit map) selects the ternary form (text, freq, extra hits).
    Output ordering is stable: mention count descending, then text.
    """
    ranked = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    entities: List[Entity] = []
    for text, count in ranked[:limit]:
        if extra is None:
            confidence = confidence_fn(text)
        else:
            confidence = confidence_fn(text, count, extra.get(text, 0))
        entities.append(Entity(text=text, confidence=confidence, mentions=mentions.get(text, [])))
    return _sort_entities(entities)
593
+
594
+
595
def main() -> None:
    """Read the chapter file, run all extractors, and emit one JSON document."""
    # argv[1] is supplied by the wrapping bash script (which also validates it).
    chapter_path = sys.argv[1]

    try:
        # utf-8-sig: tolerate a BOM at the start of the chapter file.
        with open(chapter_path, "r", encoding="utf-8-sig") as f:
            raw = f.read()
    except Exception as e:
        # Exit code 1 = validation failure, per the script header contract.
        _die(f"run-ner.sh: failed to read chapter: {e}", 1)

    raw_lines = raw.splitlines()
    narrative_lines = _strip_markdown_lines(raw_lines)

    time_counts, time_mentions = _extract_time_markers(narrative_lines)
    loc_counts, loc_mentions = _extract_locations(narrative_lines)
    char_counts, char_speech_hits, char_mentions = _extract_character_candidates(narrative_lines)
    event_counts, event_mentions = _extract_events(narrative_lines)

    characters = _build_entities(char_counts, char_mentions, _confidence_for_character, extra=char_speech_hits, limit=30)
    locations = _build_entities(loc_counts, loc_mentions, _confidence_for_location, limit=30)
    time_markers = _build_entities(time_counts, time_mentions, _confidence_for_time, limit=20)
    events = _build_entities(event_counts, event_mentions, _confidence_for_event, limit=20)

    out = {
        "schema_version": 1,
        "chapter_path": chapter_path,
        "entities": {
            "characters": [
                {
                    "text": e.text,
                    # Slug resolution is left to the downstream consumer.
                    "slug_id": None,
                    "confidence": e.confidence,
                    "mentions": [{"line": m.line, "snippet": m.snippet} for m in e.mentions],
                }
                for e in characters
            ],
            "locations": [
                {
                    "text": e.text,
                    "confidence": e.confidence,
                    "mentions": [{"line": m.line, "snippet": m.snippet} for m in e.mentions],
                }
                for e in locations
            ],
            "time_markers": [
                {
                    "text": e.text,
                    # TODO: implement actual time normalization; currently identity mapping
                    "normalized": e.text,
                    "confidence": e.confidence,
                    "mentions": [{"line": m.line, "snippet": m.snippet} for m in e.mentions],
                }
                for e in time_markers
            ],
            "events": [
                {
                    "text": e.text,
                    "confidence": e.confidence,
                    "mentions": [{"line": m.line, "snippet": m.snippet} for m in e.mentions],
                }
                for e in events
            ],
        },
    }

    # ensure_ascii=False keeps CJK text human-readable in the emitted JSON.
    sys.stdout.write(json.dumps(out, ensure_ascii=False) + "\n")
660
+
661
+
662
# Entry point. Unexpected errors map to exit code 2 (script exception),
# per the contract documented in the bash header.
try:
    main()
except SystemExit:
    # Deliberate exits (from _die or argument validation) pass through as-is.
    raise
except Exception as e:
    sys.stderr.write(f"run-ner.sh: unexpected error: {e}\n")
    raise SystemExit(2)
PY