chinese-summary 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,596 @@
1
+ // src/chinese-summary.ts
2
/**
 * Fallback option values. sanitizeOptions substitutes the matching entry
 * whenever a caller leaves an option out or passes a non-finite / non-numeric
 * value for it.
 */
var DEFAULT_OPTIONS = {
  // Summary size used when sentenceCount is given without compressionLevel.
  sentenceCount: 3,
  // Compression level (sanitized to 1..5); selects the extraction strategy.
  compressionLevel: 3,
  // Character n-gram length used when comparing sentences/clauses.
  ngramSize: 2,
  // Ranking iteration controls (see textRank / clauseTextRank).
  dampingFactor: 0.85,
  maxIterations: 30,
  convergenceThreshold: 1e-4,
  // Multipliers applied by positionWeight.
  weightFirstSentence: 1.5,
  weightFirstParagraph: 1.2,
  weightParagraphStart: 1.1,
  weightParagraphEnd: 1.05,
  // Sentences / clauses shorter than these lengths are discarded.
  minSentenceLength: 5,
  minClauseLength: 3,
  // Upper bound on clauses kept by the clause-level summary.
  maxClauses: 3,
  // Relevance weight in selectByMMR (redundancy gets 1 - mmrLambda).
  mmrLambda: 0.7,
  // Base of the keyword boost computed by keywordWeightForSentence.
  keywordWeight: 1.2
};
19
/**
 * Floor a numeric option and confine it to [min, max].
 *
 * @param {*} value - Candidate value; anything that is not a finite number is rejected.
 * @param {number} min - Lower bound (inclusive).
 * @param {number} max - Upper bound (inclusive).
 * @param {number} fallback - Returned as-is when `value` is rejected.
 * @returns {number} The floored, clamped value, or `fallback`.
 */
function clampInt(value, min, max, fallback) {
  const usable = typeof value === "number" && Number.isFinite(value);
  if (!usable) {
    return fallback;
  }
  const whole = Math.floor(value);
  const capped = Math.min(max, whole);
  return Math.max(min, capped);
}
23
/**
 * Confine a numeric option to [min, max] without rounding.
 *
 * @param {*} value - Candidate value; anything that is not a finite number is rejected.
 * @param {number} min - Lower bound (inclusive).
 * @param {number} max - Upper bound (inclusive).
 * @param {number} fallback - Returned as-is when `value` is rejected.
 * @returns {number} The clamped value, or `fallback`.
 */
function clampFloat(value, min, max, fallback) {
  const usable = typeof value === "number" && Number.isFinite(value);
  if (!usable) {
    return fallback;
  }
  const capped = Math.min(max, value);
  return Math.max(min, capped);
}
27
/**
 * Normalize a requested compression level to an integer in 1..5.
 *
 * @param {*} value - Requested level.
 * @returns {number} The floored level clamped to [1, 5]; 3 when `value` is
 *   not a finite number.
 */
function sanitizeCompressionLevel(value) {
  const FALLBACK_LEVEL = 3;
  if (typeof value !== "number" || !Number.isFinite(value)) {
    return FALLBACK_LEVEL;
  }
  const level = Math.floor(value);
  return Math.max(1, Math.min(5, level));
}
34
/**
 * Build a complete, range-checked options object from caller input.
 *
 * Every field is clamped to the range given below; a missing or non-finite
 * value is replaced by the corresponding DEFAULT_OPTIONS entry.
 *
 * @param {object|null|undefined} options - Raw caller options. `null` and
 *   `undefined` are treated as "no options": the public entry points default
 *   the parameter to `{}`, but a default parameter does not apply to an
 *   explicit `null`, which would otherwise throw on the first property read.
 * @returns {object} A new object holding all fifteen option fields.
 */
function sanitizeOptions(options) {
  const source = options == null ? {} : options;
  return {
    sentenceCount: clampInt(source.sentenceCount, 1, 100, DEFAULT_OPTIONS.sentenceCount),
    compressionLevel: sanitizeCompressionLevel(source.compressionLevel),
    ngramSize: clampInt(source.ngramSize, 1, 5, DEFAULT_OPTIONS.ngramSize),
    dampingFactor: clampFloat(source.dampingFactor, 0.1, 0.95, DEFAULT_OPTIONS.dampingFactor),
    maxIterations: clampInt(source.maxIterations, 1, 200, DEFAULT_OPTIONS.maxIterations),
    convergenceThreshold: clampFloat(source.convergenceThreshold, 1e-8, 1, DEFAULT_OPTIONS.convergenceThreshold),
    weightFirstSentence: clampFloat(source.weightFirstSentence, 0.5, 5, DEFAULT_OPTIONS.weightFirstSentence),
    weightFirstParagraph: clampFloat(source.weightFirstParagraph, 0.5, 5, DEFAULT_OPTIONS.weightFirstParagraph),
    weightParagraphStart: clampFloat(source.weightParagraphStart, 0.5, 5, DEFAULT_OPTIONS.weightParagraphStart),
    weightParagraphEnd: clampFloat(source.weightParagraphEnd, 0.5, 5, DEFAULT_OPTIONS.weightParagraphEnd),
    minSentenceLength: clampInt(source.minSentenceLength, 1, 100, DEFAULT_OPTIONS.minSentenceLength),
    minClauseLength: clampInt(source.minClauseLength, 1, 50, DEFAULT_OPTIONS.minClauseLength),
    maxClauses: clampInt(source.maxClauses, 1, 20, DEFAULT_OPTIONS.maxClauses),
    mmrLambda: clampFloat(source.mmrLambda, 0.3, 1, DEFAULT_OPTIONS.mmrLambda),
    keywordWeight: clampFloat(source.keywordWeight, 0, 5, DEFAULT_OPTIONS.keywordWeight)
  };
}
53
/**
 * Guard against NaN / Infinity / non-number values leaking into scores.
 *
 * @param {*} value - Value to check (no coercion is performed).
 * @param {number} [fallback=0] - Returned when `value` is not a finite number.
 * @returns {number} `value` if it is a finite number, otherwise `fallback`.
 */
function safeNumber(value, fallback = 0) {
  if (Number.isFinite(value)) {
    return value;
  }
  return fallback;
}
56
// Characters that end a sentence: ideographic full stop (U+3002), the ASCII
// and fullwidth forms of ! ? ; (U+FF01, U+FF1F, U+FF1B), and line breaks.
// The fullwidth forms are written as escapes so they cannot be silently
// folded to their ASCII look-alikes by tooling.
var SENTENCE_ENDINGS = /[\u3002!?;\uFF01\uFF1F\uFF1B\n\r]/;
// Closing quotes / brackets that directly follow a sentence terminator and
// therefore belong to the sentence being closed: corner brackets (U+300D,
// U+300F), straight and curly closing quotes (U+201D, U+2019), ASCII and
// fullwidth right parenthesis (U+FF09), and U+3011 / U+300B.
var TRAILING_QUOTES = /[\u300D\u300F"'\u201D\u2019)\uFF09\u3011\u300B]/;

/**
 * Split text into sentence records, paragraph by paragraph.
 *
 * Paragraphs are runs of text separated by one or more line breaks. Within a
 * paragraph a sentence ends at a SENTENCE_ENDINGS character; any
 * TRAILING_QUOTES characters immediately after it are kept with that
 * sentence. Text left over at the end of a paragraph forms a final sentence.
 *
 * @param {string} text - Input text.
 * @param {number} minLength - Sentences whose trimmed length (in UTF-16 code
 *   units) is below this are dropped.
 * @returns {Array<object>} Records with `index` (position among the kept
 *   sentences), `text`, `paragraphIndex`, `sentenceInParagraph`,
 *   `isParagraphStart`, `isParagraphEnd`, `isFirstParagraph` and `score`
 *   (initialised to 0). The paragraph position fields are computed before
 *   the length filter, so they refer to the unfiltered sentence list.
 */
function splitSentences(text, minLength) {
  const sentences = [];
  const paragraphs = text
    .split(/[\n\r]+/)
    .map((p) => p.trim())
    .filter((p) => p.length > 0);
  let globalIndex = 0;
  for (let pi = 0; pi < paragraphs.length; pi++) {
    const para = paragraphs[pi];
    const rawSentences = [];
    let current = "";
    for (let i = 0; i < para.length; i++) {
      current += para[i];
      if (!SENTENCE_ENDINGS.test(para[i])) continue;
      // Pull closing quotes/brackets into the sentence they terminate.
      while (i + 1 < para.length && TRAILING_QUOTES.test(para[i + 1])) {
        i++;
        current += para[i];
      }
      const trimmed = current.trim();
      if (trimmed.length > 0) {
        rawSentences.push(trimmed);
      }
      current = "";
    }
    const tail = current.trim();
    if (tail.length > 0) {
      rawSentences.push(tail);
    }
    const lastRawIndex = rawSentences.length - 1;
    for (let si = 0; si < rawSentences.length; si++) {
      const s = rawSentences[si];
      if (s.length < minLength) continue;
      sentences.push({
        index: globalIndex++,
        text: s,
        paragraphIndex: pi,
        sentenceInParagraph: si,
        isParagraphStart: si === 0,
        isParagraphEnd: si === lastRawIndex,
        isFirstParagraph: pi === 0,
        score: 0
      });
    }
  }
  return sentences;
}
101
/**
 * Collect the distinct character n-grams of a text.
 *
 * Everything except CJK ideographs (U+4E00-U+9FFF, U+3400-U+4DBF), U+3007,
 * ASCII letters and ASCII digits is removed first, so n-grams may span what
 * used to be punctuation or whitespace.
 *
 * @param {string} text - Source text.
 * @param {number} n - N-gram length in UTF-16 code units.
 * @returns {Set<string>} Distinct n-grams; empty when the text is empty or
 *   shorter than `n` after cleaning.
 */
function extractNgrams(text, n) {
  const grams = /* @__PURE__ */ new Set();
  if (!text || text.length === 0) {
    return grams;
  }
  const cleaned = text.replace(/[^\u4e00-\u9fff\u3400-\u4dbf\u3007a-zA-Z0-9]/g, "");
  if (cleaned.length < n) {
    return grams;
  }
  const lastStart = cleaned.length - n;
  let start = 0;
  while (start <= lastStart) {
    grams.add(cleaned.substring(start, start + n));
    start += 1;
  }
  return grams;
}
111
// Single characters ignored during keyword extraction.
var STOP_CHARS = new Set("\u7684\u4E86\u662F\u5728\u6211\u6709\u548C\u5C31\u4E0D\u4EBA\u90FD\u4E00\u8FD9\u4E2D\u5927\u4E3A\u4E0A\u4E2A\u56FD\u4E5F\u5B50\u65F6\u9053\u8BF4\u51FA\u4F1A\u8981\u6CA1\u6210\u597D\u80FD\u5BF9\u7136\u5979\u8FC7\u751F\u91CC\u540E\u4EE5\u5230\u53BB\u80FD\u5F97\u7740\u5E74\u8FD9".split(""));

/**
 * Pick the highest-weighted single characters across all sentences.
 *
 * Only CJK ideographs (U+4E00-U+9FFF, U+3400-U+4DBF) outside STOP_CHARS are
 * counted. Each character is weighted by
 * `count * (ln((N + 1) / (sentencesContaining + 1)) + 1)`, where N is the
 * number of sentences; ties keep first-seen order.
 *
 * @param {Array<{text: string}>} sentences - Sentence records.
 * @param {number} topK - Maximum number of characters to return.
 * @returns {Set<string>} Up to `topK` characters; empty for empty input.
 */
function extractKeywords(sentences, topK) {
  if (sentences.length === 0) {
    return /* @__PURE__ */ new Set();
  }
  const occurrences = /* @__PURE__ */ new Map();
  const sentenceHits = /* @__PURE__ */ new Map();
  sentences.forEach((sentence) => {
    const hanOnly = sentence.text.replace(/[^\u4e00-\u9fff\u3400-\u4dbf]/g, "");
    const countedHere = /* @__PURE__ */ new Set();
    for (const ch of hanOnly) {
      if (STOP_CHARS.has(ch)) continue;
      occurrences.set(ch, (occurrences.get(ch) || 0) + 1);
      if (countedHere.has(ch)) continue;
      sentenceHits.set(ch, (sentenceHits.get(ch) || 0) + 1);
      countedHere.add(ch);
    }
  });
  const total = sentences.length;
  const weighted = [...occurrences].map(([ch, freq]) => {
    const docFreq = sentenceHits.get(ch) || 1;
    const idf = Math.log((total + 1) / (docFreq + 1)) + 1;
    return [ch, freq * idf];
  });
  weighted.sort((left, right) => right[1] - left[1]);
  const winners = /* @__PURE__ */ new Set();
  for (const [ch] of weighted.slice(0, topK)) {
    winners.add(ch);
  }
  return winners;
}
138
/**
 * Compute the keyword boost for one sentence.
 *
 * Counts every occurrence of a keyword character among the sentence's CJK
 * ideographs, divides by the number of keywords, caps the ratio at 1 and
 * returns `keywordWeight ** ratio`.
 *
 * @param {{text: string}} sentence - Sentence record.
 * @param {Set<string>} keywords - Keyword characters.
 * @param {number} keywordWeight - Base of the boost.
 * @returns {number} 1 when there are no keywords, the weight is 0, or the
 *   sentence contains none of them; otherwise the boost factor.
 */
function keywordWeightForSentence(sentence, keywords, keywordWeight) {
  const disabled = keywords.size === 0 || keywordWeight === 0;
  if (disabled) {
    return 1;
  }
  const hanOnly = sentence.text.replace(/[^\u4e00-\u9fff\u3400-\u4dbf]/g, "");
  const hits = [...hanOnly].filter((ch) => keywords.has(ch)).length;
  if (hits === 0) {
    return 1;
  }
  const coverage = Math.min(hits / keywords.size, 1);
  return Math.pow(keywordWeight, coverage);
}
149
/**
 * Overlap score between two n-gram sets.
 *
 * The score is `shared / (ln(|A| + 1) + ln(|B| + 1))`, where `shared` is the
 * number of n-grams present in both sets. It is not capped at 1.
 *
 * @param {Set<string>} ngramsA - First n-gram set.
 * @param {Set<string>} ngramsB - Second n-gram set.
 * @returns {number} 0 when either set is empty or nothing is shared;
 *   otherwise the (finite) overlap score.
 */
function similarity(ngramsA, ngramsB) {
  const sizeA = ngramsA.size;
  const sizeB = ngramsB.size;
  if (sizeA === 0 || sizeB === 0) {
    return 0;
  }
  let shared = 0;
  ngramsA.forEach((gram) => {
    if (ngramsB.has(gram)) {
      shared += 1;
    }
  });
  if (shared === 0) {
    return 0;
  }
  const normalizer = Math.log(sizeA + 1) + Math.log(sizeB + 1);
  if (normalizer === 0) {
    return 0;
  }
  return safeNumber(shared / normalizer);
}
160
/**
 * Positional prior for a sentence.
 *
 * Starting from 1, multiplies in:
 *  - `weightFirstSentence` for the opening sentence of the first paragraph,
 *    or `weightFirstParagraph` for any other first-paragraph sentence;
 *  - `weightParagraphStart` for the opening sentence of a later paragraph;
 *  - `weightParagraphEnd` for the closing sentence of any paragraph.
 *
 * @param {object} sentence - Record carrying `isFirstParagraph`,
 *   `isParagraphStart` and `isParagraphEnd` flags.
 * @param {object} opts - Sanitized options providing the four weights.
 * @returns {number} The combined weight, or 1 if it is not finite.
 */
function positionWeight(sentence, opts) {
  let weight = 1;
  if (sentence.isFirstParagraph) {
    weight *= sentence.isParagraphStart ? opts.weightFirstSentence : opts.weightFirstParagraph;
  } else if (sentence.isParagraphStart) {
    weight *= opts.weightParagraphStart;
  }
  if (sentence.isParagraphEnd) {
    weight *= opts.weightParagraphEnd;
  }
  return safeNumber(weight, 1);
}
175
/**
 * Score sentences in place by iterating a damped, similarity-weighted
 * ranking over them.
 *
 * Each sentence starts from a prior: its position weight, multiplied by the
 * keyword boost when `keywords` is supplied and `opts.keywordWeight > 0`.
 * On every round a sentence's score becomes
 * `(1 - d) * prior + d * sum_j(sim[i][j] / simSum[j] * score[j])`,
 * stopping after `opts.maxIterations` rounds or once the largest change
 * falls below `opts.convergenceThreshold`.
 *
 * @param {Array<object>} sentences - Sentence records; `score` is overwritten.
 * @param {Array<Set<string>>} ngramSets - N-gram set for each sentence, same order.
 * @param {object} opts - Sanitized options.
 * @param {Set<string>} [keywords] - Optional keyword characters.
 * @returns {void}
 */
function textRank(sentences, ngramSets, opts, keywords) {
  const count = sentences.length;
  if (count === 0) {
    return;
  }
  const priorOf = (sentence) => {
    let prior = positionWeight(sentence, opts);
    if (keywords && opts.keywordWeight > 0) {
      prior *= keywordWeightForSentence(sentence, keywords, opts.keywordWeight);
    }
    return safeNumber(prior, 1);
  };
  if (count === 1) {
    // A lone sentence has no neighbours; its prior is its score.
    sentences[0].score = priorOf(sentences[0]);
    return;
  }
  // Symmetric pairwise similarities plus each row's total.
  const weights = Array.from({ length: count }, () => new Array(count).fill(0));
  const rowTotals = new Array(count).fill(0);
  for (let a = 0; a < count; a++) {
    for (let b = a + 1; b < count; b++) {
      const overlap = similarity(ngramSets[a], ngramSets[b]);
      weights[a][b] = overlap;
      weights[b][a] = overlap;
      rowTotals[a] += overlap;
      rowTotals[b] += overlap;
    }
  }
  const priors = sentences.map(priorOf);
  let current = priors.slice();
  for (let round = 0; round < opts.maxIterations; round++) {
    let largestChange = 0;
    const next = current.map((previous, i) => {
      let inflow = 0;
      for (let j = 0; j < count; j++) {
        // Skip self-links and neighbours with no outgoing weight.
        if (j === i || rowTotals[j] === 0) continue;
        inflow += weights[i][j] / rowTotals[j] * current[j];
      }
      const raw = (1 - opts.dampingFactor) * priors[i] + opts.dampingFactor * inflow;
      const updated = safeNumber(raw, priors[i]);
      const change = Math.abs(updated - previous);
      if (change > largestChange) {
        largestChange = change;
      }
      return updated;
    });
    current = next;
    if (largestChange < opts.convergenceThreshold) break;
  }
  sentences.forEach((sentence, i) => {
    sentence.score = safeNumber(current[i], priors[i]);
  });
}
228
/**
 * Translate a compression level into a target number of summary sentences.
 *
 * @param {number} totalSentences - Number of sentences available.
 * @param {number} level - Compression level.
 * @returns {number} Level 5: all sentences; level 4: 50% (at least 3);
 *   level 3: 30% (at least 3); level 2: 20% (at least 2); anything else: 1.
 *   Percentages are rounded up.
 */
function sentenceCountForLevel(totalSentences, level) {
  switch (level) {
    case 5:
      return totalSentences;
    case 4:
      return Math.max(3, Math.ceil(totalSentences * 0.5));
    case 3:
      return Math.max(3, Math.ceil(totalSentences * 0.3));
    case 2:
      return Math.max(2, Math.ceil(totalSentences * 0.2));
    default:
      return 1;
  }
}
235
/**
 * Greedy relevance-vs-redundancy selection of sentences.
 *
 * Scores are min-max normalized. The top-scoring sentence is taken first;
 * each later pick maximizes
 * `lambda * normalizedScore - (1 - lambda) * maxSimilarityToAlreadyPicked`.
 *
 * @param {Array<object>} sentences - Scored sentence records.
 * @param {Array<Set<string>>} ngramSets - N-gram set for each sentence, same order.
 * @param {number} targetCount - Number of sentences wanted.
 * @param {number} lambda - Weight of relevance versus redundancy.
 * @returns {Array<object>} The picked records in pick order (a shallow copy
 *   of the input when it already has at most `targetCount` entries).
 */
function selectByMMR(sentences, ngramSets, targetCount, lambda) {
  if (sentences.length <= targetCount) {
    return sentences.slice();
  }
  const rawScores = sentences.map((s) => s.score);
  const highest = Math.max(...rawScores);
  const lowest = Math.min(...rawScores);
  // A zero range (all scores equal) would divide by zero; use 1 instead.
  const span = highest - lowest || 1;
  const relevance = rawScores.map((value) => (value - lowest) / span);

  // Seed with the first sentence holding the highest normalized score.
  let seed = -1;
  let seedValue = -Infinity;
  relevance.forEach((value, i) => {
    if (value > seedValue) {
      seedValue = value;
      seed = i;
    }
  });
  const picked = [seed];
  const pool = new Set(sentences.map((_, i) => i));
  pool.delete(seed);

  while (picked.length < targetCount && pool.size > 0) {
    let winner = -1;
    let winnerValue = -Infinity;
    for (const candidate of pool) {
      let redundancy = 0;
      for (const chosen of picked) {
        const overlap = similarity(ngramSets[candidate], ngramSets[chosen]);
        if (overlap > redundancy) {
          redundancy = overlap;
        }
      }
      const value = lambda * relevance[candidate] - (1 - lambda) * redundancy;
      if (value > winnerValue) {
        winnerValue = value;
        winner = candidate;
      }
    }
    if (winner === -1) break;
    picked.push(winner);
    pool.delete(winner);
  }
  return picked.map((i) => sentences[i]);
}
277
// Characters that end a clause: ASCII , ; : and their fullwidth forms
// (U+FF0C, U+FF1B, U+FF1A), plus the ideographic enumeration comma (U+3001).
// The fullwidth forms are written as escapes so they cannot be silently
// folded to their ASCII look-alikes by tooling.
var CLAUSE_SEPARATORS = /[,\u3001;:\uFF0C\uFF1B\uFF1A]/;

/**
 * Split a sentence into clauses at CLAUSE_SEPARATORS characters.
 *
 * Each clause keeps its closing separator. Clauses whose trimmed length (in
 * UTF-16 code units, separator included) is below `minLength` are dropped.
 *
 * @param {string} sentence - Sentence text.
 * @param {number} minLength - Minimum clause length to keep.
 * @returns {string[]} Clauses in their original order; empty for empty input.
 */
function splitClauses(sentence, minLength) {
  if (!sentence || sentence.length === 0) return [];
  const clauses = [];
  let current = "";
  for (let i = 0; i < sentence.length; i++) {
    current += sentence[i];
    if (!CLAUSE_SEPARATORS.test(sentence[i])) continue;
    const trimmed = current.trim();
    if (trimmed.length >= minLength) {
      clauses.push(trimmed);
    }
    current = "";
  }
  // Whatever follows the last separator is the final clause.
  const remaining = current.trim();
  if (remaining.length >= minLength) {
    clauses.push(remaining);
  }
  return clauses;
}
298
// Multiplier applied to the starting score of clauses flagged isMainClause.
var MAIN_CLAUSE_BOOST = 1.3;

/**
 * Score clauses in place with the same damped, similarity-weighted iteration
 * used for sentences.
 *
 * A clause's prior is its incoming `score` (1 when that is falsy), times
 * MAIN_CLAUSE_BOOST when `isMainClause` is set. Iteration stops after
 * `opts.maxIterations` rounds or once the largest change falls below
 * `opts.convergenceThreshold`.
 *
 * @param {Array<object>} clauses - Clause records; `score` is overwritten.
 * @param {Array<Set<string>>} ngramSets - N-gram set for each clause, same order.
 * @param {object} opts - Sanitized options.
 * @returns {void}
 */
function clauseTextRank(clauses, ngramSets, opts) {
  const count = clauses.length;
  if (count === 0) {
    return;
  }
  if (count === 1) {
    // Nothing to compare against: keep the incoming score if it is usable.
    const only = clauses[0];
    only.score = safeNumber(only.score, 1);
    return;
  }
  // Symmetric pairwise similarities plus each row's total.
  const weights = Array.from({ length: count }, () => new Array(count).fill(0));
  const rowTotals = new Array(count).fill(0);
  for (let a = 0; a < count; a++) {
    for (let b = a + 1; b < count; b++) {
      const overlap = similarity(ngramSets[a], ngramSets[b]);
      weights[a][b] = overlap;
      weights[b][a] = overlap;
      rowTotals[a] += overlap;
      rowTotals[b] += overlap;
    }
  }
  const priors = clauses.map((clause) => {
    const base = clause.score || 1;
    if (clause.isMainClause) {
      return base * MAIN_CLAUSE_BOOST;
    }
    return base;
  });
  let current = priors.slice();
  for (let round = 0; round < opts.maxIterations; round++) {
    let largestChange = 0;
    const next = current.map((previous, i) => {
      let inflow = 0;
      for (let j = 0; j < count; j++) {
        // Skip self-links and neighbours with no outgoing weight.
        if (j === i || rowTotals[j] === 0) continue;
        inflow += weights[i][j] / rowTotals[j] * current[j];
      }
      const raw = (1 - opts.dampingFactor) * priors[i] + opts.dampingFactor * inflow;
      const updated = safeNumber(raw, priors[i]);
      const change = Math.abs(updated - previous);
      if (change > largestChange) {
        largestChange = change;
      }
      return updated;
    });
    current = next;
    if (largestChange < opts.convergenceThreshold) break;
  }
  clauses.forEach((clause, i) => {
    clause.score = safeNumber(current[i], priors[i]);
  });
}
345
/**
 * Two-pass ranking: score every sentence, then re-rank the best candidates
 * among themselves.
 *
 * Pass 1 runs textRank over all sentences (mutating their `score`) and keeps
 * the top `max(5, 2 * targetCount)`. Pass 2 runs textRank again, without
 * keywords, over shallow copies of those candidates and keeps the top
 * `targetCount`.
 *
 * @param {Array<object>} sentences - Sentence records; `score` is overwritten
 *   with the pass-1 score.
 * @param {number} targetCount - Number of sentences wanted.
 * @param {object} opts - Sanitized options.
 * @param {Set<string>} [keywords] - Optional keyword characters for pass 1.
 * @returns {Array<object>} The selected original sentence records, best first.
 */
function rerankSentences(sentences, targetCount, opts, keywords) {
  const byScoreDesc = (a, b) => b.score - a.score;
  const ngramSets = sentences.map((s) => extractNgrams(s.text, opts.ngramSize));
  textRank(sentences, ngramSets, opts, keywords);
  const firstPassCount = Math.min(Math.max(5, targetCount * 2), sentences.length);
  const firstPass = [...sentences].sort(byScoreDesc).slice(0, firstPassCount);
  if (firstPass.length <= targetCount) {
    return firstPass;
  }
  // Copies carry `index` = position in firstPass, so pass 2 neither disturbs
  // the pass-1 scores nor loses track of which original each copy came from.
  const secondPassSentences = firstPass.map((s, i) => ({
    ...s,
    index: i
  }));
  const secondNgramSets = secondPassSentences.map(
    (s) => extractNgrams(s.text, opts.ngramSize)
  );
  textRank(secondPassSentences, secondNgramSets, opts);
  // Map back by position rather than by text: looking candidates up by text
  // would resolve sentences with identical text to the same original and
  // return it more than once.
  return [...secondPassSentences]
    .sort(byScoreDesc)
    .slice(0, targetCount)
    .map((s) => firstPass[s.index]);
}
367
// Matches text that ends in sentence-final punctuation: ideographic full stop
// (U+3002) or the ASCII / fullwidth forms of ! ? ; (U+FF01, U+FF1F, U+FF1B).
// The fullwidth forms are written as escapes so they cannot be silently
// folded to their ASCII look-alikes by tooling.
var SENTENCE_FINAL_PUNCT = /[\u3002!?;\uFF01\uFF1F\uFF1B]$/;

/**
 * Concatenate clause strings into one piece of text.
 *
 * Between two clauses the glue depends on how the text so far ends:
 *  - sentence-final punctuation: a single space;
 *  - a clause separator (per CLAUSE_SEPARATORS): nothing;
 *  - anything else: a fullwidth comma (U+FF0C).
 *
 * @param {string[]} clauses - Clause texts in output order.
 * @returns {string} The joined text; "" for an empty list.
 */
function joinClauses(clauses) {
  if (clauses.length === 0) return "";
  if (clauses.length === 1) return clauses[0];
  let result = clauses[0];
  for (let i = 1; i < clauses.length; i++) {
    const curr = clauses[i];
    const lastChar = result[result.length - 1];
    if (SENTENCE_FINAL_PUNCT.test(result)) {
      result = result + " " + curr;
    } else if (CLAUSE_SEPARATORS.test(lastChar)) {
      result = result + curr;
    } else {
      result = result + "\uFF0C" + curr;
    }
  }
  return result;
}
386
// Conjunctions recognised at the start of a clause, grouped by function.
var CLAUSE_CONJUNCTIONS = [
  // Contrast
  "\u4F46",
  "\u4F46\u662F",
  "\u7136\u800C",
  "\u4E0D\u8FC7",
  "\u53EF\u662F",
  "\u5374",
  "\u53CD\u800C",
  "\u53EA\u662F",
  // Addition / coordination
  "\u4E5F",
  "\u800C\u4E14",
  "\u5E76\u4E14",
  "\u6B64\u5916",
  "\u53E6\u5916",
  "\u540C\u65F6",
  "\u65E2",
  "\u53C8",
  // Cause and effect
  "\u56E0\u6B64",
  "\u6240\u4EE5",
  "\u4E8E\u662F",
  "\u4ECE\u800C",
  "\u8FDB\u800C",
  "\u6545",
  // Concession
  "\u867D\u7136",
  "\u5C3D\u7BA1",
  "\u56FA\u7136",
  "\u5373\u4F7F",
  "\u5C31\u7B97",
  // Condition
  "\u5982\u679C",
  "\u5047\u5982",
  "\u53EA\u8981",
  "\u53EA\u6709",
  "\u9664\u975E",
  // Summary
  "\u603B\u4E4B",
  "\u7EFC\u4E0A",
  "\u7531\u6B64\u53EF\u89C1",
  "\u603B\u7684\u6765\u8BF4"
];
// Length of the longest conjunction; bounds the prefix search.
var MAX_CONJ_LEN = Math.max(...CLAUSE_CONJUNCTIONS.map((c) => c.length));
// Constant-time membership test for matchConjunction.
var CONJUNCTION_SET = new Set(CLAUSE_CONJUNCTIONS);
// Separators / whitespace left dangling after a conjunction is removed:
// ASCII , ; : and their fullwidth forms (U+FF0C, U+FF1B, U+FF1A), plus the
// ideographic enumeration comma (U+3001).
var LEADING_SEPARATORS = /^[,\u3001;:\uFF0C\uFF1B\uFF1A\s]+/;

/**
 * Drop leading conjunctions from clauses whose preceding clause was not kept.
 *
 * A clause that opens with a conjunction keeps it only when the clause right
 * before it in the same sentence (`clauseIndex - 1`) exists in `allClauses`
 * and is also among `orderedClauses`. Otherwise the conjunction and any
 * separators or whitespace following it are removed. Clauses that end up
 * empty are omitted.
 *
 * @param {Array<object>} orderedClauses - Selected clauses in output order;
 *   each has `text`, `sourceSentenceIndex` and `clauseIndex`.
 * @param {Array<object>} allClauses - Every candidate clause.
 * @returns {string[]} Clause texts ready to be joined.
 */
function processConjunctions(orderedClauses, allClauses) {
  const keyOf = (c) => c.sourceSentenceIndex + ":" + c.clauseIndex;
  const knownKeys = new Set(allClauses.map(keyOf));
  const selectedKeys = new Set(orderedClauses.map(keyOf));
  const results = [];
  for (const clause of orderedClauses) {
    let text = clause.text;
    const matchedConj = matchConjunction(text);
    if (matchedConj) {
      const prevKey = clause.sourceSentenceIndex + ":" + (clause.clauseIndex - 1);
      const previousKept = knownKeys.has(prevKey) && selectedKeys.has(prevKey);
      if (!previousKept) {
        // The clause it connects to is gone, so the connector is dangling.
        text = text.substring(matchedConj.length).replace(LEADING_SEPARATORS, "");
      }
    }
    if (text.length > 0) {
      results.push(text);
    }
  }
  return results;
}

/**
 * Find the longest entry of CLAUSE_CONJUNCTIONS that `text` starts with.
 *
 * NOTE(review): this is a plain prefix test, so single-character entries also
 * match ordinary words that merely begin with that character; consider a
 * stricter check if that turns out to strip real content.
 *
 * @param {string} text - Clause text.
 * @returns {string|null} The matched conjunction, or null when none matches.
 */
function matchConjunction(text) {
  const longest = Math.min(MAX_CONJ_LEN, text.length);
  for (let len = longest; len >= 1; len--) {
    const prefix = text.substring(0, len);
    if (CONJUNCTION_SET.has(prefix)) {
      return prefix;
    }
  }
  return null;
}
466
/**
 * Build a one-line summary out of the best clauses of the best sentences.
 *
 * Steps: score all sentences with textRank; split the (up to) five
 * top-scoring sentences into clauses; score the clauses with clauseTextRank;
 * keep at most `opts.maxClauses` of them; restore document order; clean up
 * dangling conjunctions; join the result.
 *
 * @param {Array<object>} sentences - Sentence records (scores are
 *   overwritten). Expected to be non-empty.
 * @param {object} opts - Sanitized options.
 * @param {Set<string>} [keywords] - Optional keyword characters.
 * @returns {{summary: string[], clauses: Array<object>}} `summary` holds a
 *   single string; `clauses` holds every candidate clause with its final
 *   score (or a single whole-sentence entry when no sentence yielded a
 *   clause of sufficient length).
 */
function extractClauseSummary(sentences, opts, keywords) {
  const byScoreDesc = (a, b) => b.score - a.score;
  const sentenceNgrams = sentences.map((s) => extractNgrams(s.text, opts.ngramSize));
  textRank(sentences, sentenceNgrams, opts, keywords);

  const candidateCount = Math.min(5, sentences.length);
  const topSentences = sentences.slice().sort(byScoreDesc).slice(0, candidateCount);

  const allClauses = topSentences.flatMap(
    (sent) => splitClauses(sent.text, opts.minClauseLength).map((clauseText, ci) => ({
      text: clauseText,
      sourceSentenceIndex: sent.index,
      clauseIndex: ci,
      // The first clause is treated as the sentence's main clause.
      isMainClause: ci === 0,
      score: sent.score
    }))
  );

  if (allClauses.length === 0) {
    // No usable clause anywhere: fall back to the best sentence as a whole.
    const best = topSentences[0];
    const wholeSentence = {
      text: best.text,
      sourceSentenceIndex: best.index,
      clauseIndex: 0,
      isMainClause: true,
      score: best.score
    };
    return { summary: [best.text], clauses: [wholeSentence] };
  }

  const clauseNgramSets = allClauses.map((c) => extractNgrams(c.text, opts.ngramSize));
  clauseTextRank(allClauses, clauseNgramSets, opts);

  const keep = Math.min(opts.maxClauses, allClauses.length);
  const orderedClauses = allClauses
    .slice()
    .sort(byScoreDesc)
    .slice(0, keep)
    .sort((a, b) => {
      const sentenceGap = a.sourceSentenceIndex - b.sourceSentenceIndex;
      if (a.sourceSentenceIndex !== b.sourceSentenceIndex) {
        return sentenceGap;
      }
      return a.clauseIndex - b.clauseIndex;
    });

  const summaryText = joinClauses(processConjunctions(orderedClauses, allClauses));
  return {
    summary: [summaryText],
    clauses: allClauses
  };
}
517
/**
 * Produce an extractive summary of Chinese text.
 *
 * The strategy depends on the sanitized compression level:
 *  - 5: every sentence is returned (still scored);
 *  - 1: a single clause-level summary line (see extractClauseSummary);
 *  - 2: two-pass re-ranking (see rerankSentences);
 *  - 3 / 4: relevance-vs-redundancy selection (see selectByMMR).
 * For levels 2-4 the target size is `sentenceCount` when the caller supplied
 * it without `compressionLevel`, otherwise it is derived from the level.
 * Selected sentences are emitted in document order.
 *
 * @param {*} text - Input text. `null`/`undefined` yield an empty result;
 *   other non-strings are converted with `String()`.
 * @param {object|null} [options={}] - Raw options; see sanitizeOptions for
 *   the accepted fields. `null` is treated like `{}`.
 * @returns {{summary: string[], sentences: Array<object>, text: string,
 *   compressionLevel: number, clauses?: Array<object>}} `sentences` holds
 *   every sentence record with its score; `text` is `summary` joined with
 *   spaces; `clauses` is present only for level 1 with more than one sentence.
 */
function extractSummary(text, options = {}) {
  const emptyResult = (compressionLevel) => ({ summary: [], sentences: [], text: "", compressionLevel });
  if (text === null || text === void 0) {
    return emptyResult(3);
  }
  let input = text;
  if (typeof input !== "string") {
    try {
      input = String(input);
    } catch {
      // Deliberate best effort: a value that cannot be stringified (e.g. a
      // throwing toString) is treated as having no text.
      return emptyResult(3);
    }
  }
  // Strip BOM, zero-width characters and soft hyphens; unify line endings.
  input = input.replace(/[\uFEFF\u200B\u200C\u200D\u00AD]/g, "");
  input = input.replace(/\r\n/g, "\n").replace(/\r/g, "\n");
  if (input.trim().length === 0) {
    return emptyResult(3);
  }
  // The `= {}` default does not cover an explicit null.
  const rawOptions = options == null ? {} : options;
  const opts = sanitizeOptions(rawOptions);
  const level = opts.compressionLevel;
  const sentences = splitSentences(input, opts.minSentenceLength);
  if (sentences.length === 0) {
    return emptyResult(level);
  }
  const result = (summary, extra) => ({
    summary,
    sentences,
    text: summary.join(" "),
    compressionLevel: level,
    ...extra
  });
  const allTexts = () => sentences.map((s) => s.text);
  const inDocumentOrder = (chosen) => {
    const chosenIndices = new Set(chosen.map((s) => s.index));
    return sentences.filter((s) => chosenIndices.has(s.index)).map((s) => s.text);
  };
  const keywordCount = Math.min(20, Math.max(5, Math.ceil(sentences.length * 0.5)));
  const keywords = opts.keywordWeight > 0 ? extractKeywords(sentences, keywordCount) : void 0;

  if (level === 5) {
    const ngramSets = sentences.map((s) => extractNgrams(s.text, opts.ngramSize));
    textRank(sentences, ngramSets, opts, keywords);
    return result(allTexts());
  }
  if (level === 1) {
    if (sentences.length <= 1) {
      return result(allTexts());
    }
    const { summary, clauses } = extractClauseSummary(sentences, opts, keywords);
    return result(summary, { clauses });
  }

  const useExplicitCount = rawOptions.sentenceCount !== void 0 && rawOptions.compressionLevel === void 0;
  const targetCount = useExplicitCount ? opts.sentenceCount : sentenceCountForLevel(sentences.length, level);
  if (sentences.length <= targetCount) {
    return result(allTexts());
  }
  if (level === 2) {
    // rerankSentences scores `sentences` in place during its first pass, so
    // the returned records already carry their scores; no second scoring
    // run is needed here.
    const topSentences = rerankSentences(sentences, targetCount, opts, keywords);
    return result(inDocumentOrder(topSentences));
  }
  const ngramSets = sentences.map((s) => extractNgrams(s.text, opts.ngramSize));
  textRank(sentences, ngramSets, opts, keywords);
  const topN = selectByMMR(sentences, ngramSets, targetCount, opts.mmrLambda);
  return result(inDocumentOrder(topN));
}
579
/**
 * Score every sentence of a text and return them best first.
 *
 * @param {*} text - Input text; anything that is not a non-blank string
 *   yields an empty list.
 * @param {object} [options={}] - Raw options; see sanitizeOptions.
 * @returns {Array<object>} A new array of the sentence records sorted by
 *   descending `score`; empty when no sentence meets the minimum length.
 */
function rankSentences(text, options = {}) {
  const isBlank = typeof text !== "string" || text.trim().length === 0;
  if (isBlank) {
    return [];
  }
  const opts = sanitizeOptions(options);
  const sentences = splitSentences(text, opts.minSentenceLength);
  if (sentences.length === 0) {
    return [];
  }
  let keywords = void 0;
  if (opts.keywordWeight > 0) {
    const keywordCount = Math.min(20, Math.max(5, Math.ceil(sentences.length * 0.5)));
    keywords = extractKeywords(sentences, keywordCount);
  }
  const ngramSets = sentences.map((s) => extractNgrams(s.text, opts.ngramSize));
  textRank(sentences, ngramSets, opts, keywords);
  return sentences.slice().sort((a, b) => b.score - a.score);
}
592
+ export {
593
+ extractSummary,
594
+ rankSentences
595
+ };
596
+ //# sourceMappingURL=chinese-summary.mjs.map