@daviddh/llm-markdown-whatsapp 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.prettierrc +17 -0
- package/CLAUDE.md +155 -0
- package/README.md +304 -0
- package/eslint.config.mjs +28 -0
- package/jest.config.js +40 -0
- package/package.json +61 -0
- package/packages/core/dist/__tests__/splitChatText.basic.test.d.ts +2 -0
- package/packages/core/dist/__tests__/splitChatText.basic.test.d.ts.map +1 -0
- package/packages/core/dist/__tests__/splitChatText.basic.test.js +100 -0
- package/packages/core/dist/__tests__/splitChatText.coverageLists.test.d.ts +2 -0
- package/packages/core/dist/__tests__/splitChatText.coverageLists.test.d.ts.map +1 -0
- package/packages/core/dist/__tests__/splitChatText.coverageLists.test.js +88 -0
- package/packages/core/dist/__tests__/splitChatText.coverageProcessors.test.d.ts +2 -0
- package/packages/core/dist/__tests__/splitChatText.coverageProcessors.test.d.ts.map +1 -0
- package/packages/core/dist/__tests__/splitChatText.coverageProcessors.test.js +108 -0
- package/packages/core/dist/__tests__/splitChatText.coverageQuestions.test.d.ts +2 -0
- package/packages/core/dist/__tests__/splitChatText.coverageQuestions.test.d.ts.map +1 -0
- package/packages/core/dist/__tests__/splitChatText.coverageQuestions.test.js +74 -0
- package/packages/core/dist/__tests__/splitChatText.dataProtection.test.d.ts +2 -0
- package/packages/core/dist/__tests__/splitChatText.dataProtection.test.d.ts.map +1 -0
- package/packages/core/dist/__tests__/splitChatText.dataProtection.test.js +80 -0
- package/packages/core/dist/__tests__/splitChatText.dataTests1.test.d.ts +2 -0
- package/packages/core/dist/__tests__/splitChatText.dataTests1.test.d.ts.map +1 -0
- package/packages/core/dist/__tests__/splitChatText.dataTests1.test.js +124 -0
- package/packages/core/dist/__tests__/splitChatText.dataTests2.test.d.ts +2 -0
- package/packages/core/dist/__tests__/splitChatText.dataTests2.test.d.ts.map +1 -0
- package/packages/core/dist/__tests__/splitChatText.dataTests2.test.js +122 -0
- package/packages/core/dist/__tests__/splitChatText.edgeCases.test.d.ts +2 -0
- package/packages/core/dist/__tests__/splitChatText.edgeCases.test.d.ts.map +1 -0
- package/packages/core/dist/__tests__/splitChatText.edgeCases.test.js +132 -0
- package/packages/core/dist/__tests__/splitChatText.helpers.d.ts +2 -0
- package/packages/core/dist/__tests__/splitChatText.helpers.d.ts.map +1 -0
- package/packages/core/dist/__tests__/splitChatText.helpers.js +5 -0
- package/packages/core/dist/__tests__/splitChatText.punctuation.test.d.ts +2 -0
- package/packages/core/dist/__tests__/splitChatText.punctuation.test.d.ts.map +1 -0
- package/packages/core/dist/__tests__/splitChatText.punctuation.test.js +98 -0
- package/packages/core/dist/__tests__/splitChatText.realWorld.test.d.ts +2 -0
- package/packages/core/dist/__tests__/splitChatText.realWorld.test.d.ts.map +1 -0
- package/packages/core/dist/__tests__/splitChatText.realWorld.test.js +104 -0
- package/packages/core/dist/__tests__/splitChatText.urlProtection.test.d.ts +2 -0
- package/packages/core/dist/__tests__/splitChatText.urlProtection.test.d.ts.map +1 -0
- package/packages/core/dist/__tests__/splitChatText.urlProtection.test.js +82 -0
- package/packages/core/dist/__tests__/strs.splitChatText.test.d.ts +2 -0
- package/packages/core/dist/__tests__/strs.splitChatText.test.d.ts.map +1 -0
- package/packages/core/dist/__tests__/strs.splitChatText.test.js +992 -0
- package/packages/core/dist/chatSplit/breakProcessor.d.ts +4 -0
- package/packages/core/dist/chatSplit/breakProcessor.d.ts.map +1 -0
- package/packages/core/dist/chatSplit/breakProcessor.js +67 -0
- package/packages/core/dist/chatSplit/constants.d.ts +35 -0
- package/packages/core/dist/chatSplit/constants.d.ts.map +1 -0
- package/packages/core/dist/chatSplit/constants.js +34 -0
- package/packages/core/dist/chatSplit/index.d.ts +2 -0
- package/packages/core/dist/chatSplit/index.d.ts.map +1 -0
- package/packages/core/dist/chatSplit/index.js +1 -0
- package/packages/core/dist/chatSplit/listNormalization.d.ts +13 -0
- package/packages/core/dist/chatSplit/listNormalization.d.ts.map +1 -0
- package/packages/core/dist/chatSplit/listNormalization.js +140 -0
- package/packages/core/dist/chatSplit/listProcessor.d.ts +6 -0
- package/packages/core/dist/chatSplit/listProcessor.d.ts.map +1 -0
- package/packages/core/dist/chatSplit/listProcessor.js +61 -0
- package/packages/core/dist/chatSplit/mergeProcessor.d.ts +3 -0
- package/packages/core/dist/chatSplit/mergeProcessor.d.ts.map +1 -0
- package/packages/core/dist/chatSplit/mergeProcessor.js +88 -0
- package/packages/core/dist/chatSplit/paragraphProcessor.d.ts +14 -0
- package/packages/core/dist/chatSplit/paragraphProcessor.d.ts.map +1 -0
- package/packages/core/dist/chatSplit/paragraphProcessor.js +66 -0
- package/packages/core/dist/chatSplit/periodProcessor.d.ts +4 -0
- package/packages/core/dist/chatSplit/periodProcessor.d.ts.map +1 -0
- package/packages/core/dist/chatSplit/periodProcessor.js +110 -0
- package/packages/core/dist/chatSplit/positionHelpers.d.ts +12 -0
- package/packages/core/dist/chatSplit/positionHelpers.d.ts.map +1 -0
- package/packages/core/dist/chatSplit/positionHelpers.js +57 -0
- package/packages/core/dist/chatSplit/productCardProcessor.d.ts +12 -0
- package/packages/core/dist/chatSplit/productCardProcessor.d.ts.map +1 -0
- package/packages/core/dist/chatSplit/productCardProcessor.js +138 -0
- package/packages/core/dist/chatSplit/punctuationNormalization.d.ts +5 -0
- package/packages/core/dist/chatSplit/punctuationNormalization.d.ts.map +1 -0
- package/packages/core/dist/chatSplit/punctuationNormalization.js +103 -0
- package/packages/core/dist/chatSplit/questionProcessor.d.ts +6 -0
- package/packages/core/dist/chatSplit/questionProcessor.d.ts.map +1 -0
- package/packages/core/dist/chatSplit/questionProcessor.js +212 -0
- package/packages/core/dist/chatSplit/sections.d.ts +23 -0
- package/packages/core/dist/chatSplit/sections.d.ts.map +1 -0
- package/packages/core/dist/chatSplit/sections.js +153 -0
- package/packages/core/dist/chatSplit/splitChatText.d.ts +6 -0
- package/packages/core/dist/chatSplit/splitChatText.d.ts.map +1 -0
- package/packages/core/dist/chatSplit/splitChatText.js +119 -0
- package/packages/core/dist/chatSplit/splitConstants.d.ts +3 -0
- package/packages/core/dist/chatSplit/splitConstants.d.ts.map +1 -0
- package/packages/core/dist/chatSplit/splitConstants.js +2 -0
- package/packages/core/dist/chatSplit/splitProcessors.d.ts +22 -0
- package/packages/core/dist/chatSplit/splitProcessors.d.ts.map +1 -0
- package/packages/core/dist/chatSplit/splitProcessors.js +105 -0
- package/packages/core/dist/chatSplit/textHelpers.d.ts +27 -0
- package/packages/core/dist/chatSplit/textHelpers.d.ts.map +1 -0
- package/packages/core/dist/chatSplit/textHelpers.js +77 -0
- package/packages/core/dist/chatSplit/urlNormalization.d.ts +7 -0
- package/packages/core/dist/chatSplit/urlNormalization.d.ts.map +1 -0
- package/packages/core/dist/chatSplit/urlNormalization.js +13 -0
- package/packages/core/dist/index.d.ts +2 -0
- package/packages/core/dist/index.d.ts.map +1 -0
- package/packages/core/dist/index.js +1 -0
- package/packages/core/jest.config.js +23 -0
- package/packages/core/package.json +38 -0
- package/packages/core/src/__tests__/splitChatText.basic.test.ts +123 -0
- package/packages/core/src/__tests__/splitChatText.coverageLists.test.ts +108 -0
- package/packages/core/src/__tests__/splitChatText.coverageProcessors.test.ts +172 -0
- package/packages/core/src/__tests__/splitChatText.coverageQuestions.test.ts +95 -0
- package/packages/core/src/__tests__/splitChatText.dataProtection.test.ts +96 -0
- package/packages/core/src/__tests__/splitChatText.dataTests1.test.ts +137 -0
- package/packages/core/src/__tests__/splitChatText.dataTests2.test.ts +134 -0
- package/packages/core/src/__tests__/splitChatText.edgeCases.test.ts +157 -0
- package/packages/core/src/__tests__/splitChatText.helpers.ts +6 -0
- package/packages/core/src/__tests__/splitChatText.punctuation.test.ts +113 -0
- package/packages/core/src/__tests__/splitChatText.realWorld.test.ts +118 -0
- package/packages/core/src/__tests__/splitChatText.urlProtection.test.ts +102 -0
- package/packages/core/src/chatSplit/breakProcessor.ts +103 -0
- package/packages/core/src/chatSplit/constants.ts +50 -0
- package/packages/core/src/chatSplit/index.ts +1 -0
- package/packages/core/src/chatSplit/listNormalization.ts +189 -0
- package/packages/core/src/chatSplit/listProcessor.ts +74 -0
- package/packages/core/src/chatSplit/mergeProcessor.ts +124 -0
- package/packages/core/src/chatSplit/paragraphProcessor.ts +86 -0
- package/packages/core/src/chatSplit/periodProcessor.ts +148 -0
- package/packages/core/src/chatSplit/positionHelpers.ts +66 -0
- package/packages/core/src/chatSplit/productCardProcessor.ts +184 -0
- package/packages/core/src/chatSplit/punctuationNormalization.ts +142 -0
- package/packages/core/src/chatSplit/questionProcessor.ts +298 -0
- package/packages/core/src/chatSplit/sections.ts +243 -0
- package/packages/core/src/chatSplit/splitChatText.ts +156 -0
- package/packages/core/src/chatSplit/splitConstants.ts +2 -0
- package/packages/core/src/chatSplit/splitProcessors.ts +153 -0
- package/packages/core/src/chatSplit/textHelpers.ts +86 -0
- package/packages/core/src/chatSplit/urlNormalization.ts +17 -0
- package/packages/core/src/index.ts +1 -0
- package/packages/core/tsconfig.build.json +4 -0
- package/packages/core/tsconfig.json +25 -0
- package/tsconfig.json +19 -0
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
import { COMBINED_LENGTH_THRESHOLD, DOUBLE_NEWLINE_DISTANCE_THRESHOLD, INDEX_OFFSET, LONG_QUESTION_THRESHOLD, SHORT_INTRO_THRESHOLD, } from './constants.js';
|
|
2
|
+
import { isPositionInBulletLine, isPositionInsideParentheses } from './positionHelpers.js';
|
|
3
|
+
import { findPositionAfterEmoji, hasTextContent, isParentheticalClarification, smartTrim, startsWithEmoji, startsWithLowercase, } from './textHelpers.js';
|
|
4
|
+
/** Exclusive upper bound on the trimmed length of the text between the first and last valid question marks for them to be treated as one contiguous run of questions */
|
|
5
|
+
const CONTIGUOUS_QUESTIONS_TEXT_THRESHOLD = 50;
|
|
6
|
+
/** Zero: start index for substring calls and the "empty" bound in length comparisons */
|
|
7
|
+
const ZERO = 0;
|
|
8
|
+
/** Sentinel that indexOf returns when the searched text is absent */
|
|
9
|
+
const NOT_FOUND = -1;
|
|
10
|
+
/** Length of the '\n\n' separator; added to its index to step past it */
|
|
11
|
+
const DOUBLE_NEWLINE_OFFSET = 2;
|
|
12
|
+
/** Step by which the character index advances when scanning text */
|
|
13
|
+
const INCREMENT = 1;
|
|
14
|
+
/**
 * Returns the first element of a regex match array (the full matched text).
 *
 * @param match - Array-like result of `RegExp.prototype.exec`.
 * @returns The first element, or `undefined` when the array is empty.
 */
const getFirstMatch = ([fullMatch]) => fullMatch;
|
|
19
|
+
/**
 * Returns the last element of an array.
 *
 * @param arr - Array to read from.
 * @returns The final element, or `undefined` when the array is empty.
 */
const getLastElement = (arr) => {
    if (arr.length <= ZERO) {
        return undefined;
    }
    return arr[arr.length - INDEX_OFFSET];
};
|
|
24
|
+
/**
 * Decides whether to split right after a question whose text exceeds
 * LONG_QUESTION_THRESHOLD (the caller makes that length check).
 *
 * @param remainingText - Full text still to be split; returned untouched when no split happens.
 * @param chunks - Output array; the question chunk is appended on a split.
 * @param questionPart - Text up to and including the question mark.
 * @param afterQuestion - Trimmed text following the question mark.
 * @returns `{ splitFound, newRemainingText }` describing the outcome.
 */
function handleLongQuestion(remainingText, chunks, questionPart, afterQuestion) {
    const noSplit = { splitFound: false, newRemainingText: remainingText };
    // Text that continues in lowercase is treated as part of the same sentence.
    if (startsWithLowercase(afterQuestion)) {
        return noSplit;
    }
    if (!startsWithEmoji(afterQuestion)) {
        chunks.push(questionPart);
        return { splitFound: true, newRemainingText: afterQuestion };
    }
    // A leading emoji stays attached to the question it follows.
    const emojiEnd = findPositionAfterEmoji(afterQuestion);
    const rest = afterQuestion.substring(emojiEnd).trim();
    if (rest.length <= ZERO) {
        // Nothing but the emoji follows, so there is nothing to split off.
        return noSplit;
    }
    const leadingEmoji = afterQuestion.substring(ZERO, emojiEnd);
    chunks.push(`${questionPart} ${leadingEmoji}`);
    return { splitFound: true, newRemainingText: rest };
}
|
|
46
|
+
/**
 * Decides whether to split right after a question whose text does not exceed
 * LONG_QUESTION_THRESHOLD (the caller makes that length check).
 *
 * The question is kept together with the following sentence when the two
 * combined fit within COMBINED_LENGTH_THRESHOLD.
 *
 * @param remainingText - Full text still to be split; returned untouched when no split happens.
 * @param chunks - Output array; the question chunk is appended on a split.
 * @param questionPart - Text up to and including the question mark.
 * @param afterQuestion - Trimmed text following the question mark.
 * @returns `{ splitFound, newRemainingText }` describing the outcome.
 */
function handleShortQuestion(remainingText, chunks, questionPart, afterQuestion) {
    const noSplit = { splitFound: false, newRemainingText: remainingText };
    const periodAt = afterQuestion.indexOf('.');
    // Only a period that is not the very last character counts here.
    const hasInnerPeriod = periodAt !== NOT_FOUND && periodAt < afterQuestion.length - INDEX_OFFSET;
    if (hasInnerPeriod) {
        const firstSentence = afterQuestion.substring(ZERO, periodAt + INDEX_OFFSET);
        const combinedLength = questionPart.length + INDEX_OFFSET + firstSentence.length;
        if (combinedLength <= COMBINED_LENGTH_THRESHOLD) {
            return noSplit;
        }
    }
    // Text that continues in lowercase is treated as part of the same sentence.
    if (startsWithLowercase(afterQuestion)) {
        return noSplit;
    }
    if (!startsWithEmoji(afterQuestion)) {
        chunks.push(questionPart);
        return { splitFound: true, newRemainingText: afterQuestion };
    }
    // A leading emoji stays attached to the question it follows.
    const emojiEnd = findPositionAfterEmoji(afterQuestion);
    const rest = afterQuestion.substring(emojiEnd).trim();
    if (rest.length <= ZERO) {
        return noSplit;
    }
    const leadingEmoji = afterQuestion.substring(ZERO, emojiEnd);
    chunks.push(`${questionPart} ${leadingEmoji}`);
    return { splitFound: true, newRemainingText: rest };
}
|
|
76
|
+
/** Handle parenthetical clarification split */
|
|
77
|
+
function handleParentheticalSplit(ctx, chunks) {
|
|
78
|
+
const parentheticalMatch = /^\([^\)]+\)\?/v.exec(ctx.afterQuestion);
|
|
79
|
+
if (parentheticalMatch === null)
|
|
80
|
+
return null;
|
|
81
|
+
const firstMatch = getFirstMatch(parentheticalMatch);
|
|
82
|
+
const { length: parentheticalLength } = firstMatch;
|
|
83
|
+
const afterParenthetical = smartTrim(ctx.afterQuestion.substring(parentheticalLength));
|
|
84
|
+
if (!hasTextContent(afterParenthetical))
|
|
85
|
+
return null;
|
|
86
|
+
const beforePart = ctx.remainingText.substring(ZERO, ctx.questionIndex + INDEX_OFFSET + ctx.afterQuestionRaw.indexOf(firstMatch) + parentheticalLength);
|
|
87
|
+
chunks.push(beforePart);
|
|
88
|
+
return { splitFound: true, newRemainingText: afterParenthetical };
|
|
89
|
+
}
|
|
90
|
+
/**
 * Processes a single question mark as a potential split point.
 *
 * @param remainingText - Full text still to be split.
 * @param chunks - Output array; a chunk is appended when a split is made.
 * @param questionIndex - Index of the question mark within `remainingText`.
 * @returns `{ splitFound, newRemainingText }`; `newRemainingText` is the
 *   unchanged input when no split is made.
 */
function processSingleQuestion(remainingText, chunks, questionIndex) {
    const noSplit = { splitFound: false, newRemainingText: remainingText };
    // A question mark at the very end has nothing after it to split off.
    if (questionIndex >= remainingText.length - INDEX_OFFSET) {
        return noSplit;
    }
    const afterQuestionRaw = remainingText.substring(questionIndex + INDEX_OFFSET);
    const afterQuestion = smartTrim(afterQuestionRaw);
    // Evaluate the parenthetical check once: a parenthetical either yields its
    // own split or no split at all; it never falls through to the generic path.
    if (isParentheticalClarification(afterQuestion)) {
        const ctx = { remainingText, questionIndex, afterQuestionRaw, afterQuestion };
        return handleParentheticalSplit(ctx, chunks) ?? noSplit;
    }
    if (!hasTextContent(afterQuestion)) {
        return noSplit;
    }
    const questionPart = remainingText.substring(ZERO, questionIndex + INDEX_OFFSET);
    return questionPart.length > LONG_QUESTION_THRESHOLD
        ? handleLongQuestion(remainingText, chunks, questionPart, afterQuestion)
        : handleShortQuestion(remainingText, chunks, questionPart, afterQuestion);
}
|
|
114
|
+
/**
 * Splits after the last question of a run of contiguous questions, keeping the
 * whole run together in one chunk.
 *
 * @param remainingText - Full text still to be split.
 * @param chunks - Output array; receives the run of questions on a split.
 * @param lastQuestionIdx - Index of the final question mark of the run.
 * @returns `{ splitFound, newRemainingText }` describing the outcome.
 */
function processContiguousQuestions(remainingText, chunks, lastQuestionIdx) {
    const noSplit = { splitFound: false, newRemainingText: remainingText };
    if (lastQuestionIdx >= remainingText.length - INDEX_OFFSET) {
        return noSplit;
    }
    const afterIdx = lastQuestionIdx + INDEX_OFFSET;
    const afterQuestion = smartTrim(remainingText.substring(afterIdx));
    if (!hasTextContent(afterQuestion)) {
        return noSplit;
    }
    const questionsPart = remainingText.substring(ZERO, afterIdx);
    // Carry over the single space that followed the last question mark, if any.
    const separator = remainingText.charAt(afterIdx) === ' ' ? ' ' : '';
    if (!startsWithEmoji(afterQuestion)) {
        // NOTE(review): the pushed chunk keeps a trailing space when one followed
        // the question mark; presumably trimmed by a later stage - confirm.
        chunks.push(questionsPart + separator);
        return { splitFound: true, newRemainingText: afterQuestion };
    }
    // A leading emoji stays attached to the questions it follows.
    const emojiEnd = findPositionAfterEmoji(afterQuestion);
    const rest = afterQuestion.substring(emojiEnd).trim();
    if (rest.length <= ZERO) {
        return noSplit;
    }
    chunks.push(questionsPart + separator + afterQuestion.substring(ZERO, emojiEnd));
    return { splitFound: true, newRemainingText: rest };
}
|
|
145
|
+
/** Check if character is a question mark */
|
|
146
|
+
const isQuestionMark = (text, index) => {
    // An out-of-range index reads as undefined, which never equals '?'.
    const candidate = text[index];
    return candidate === '?';
};
|
|
147
|
+
/**
 * Returns the text up to (not including) the first newline.
 *
 * @param text - Text to read the first line from.
 * @returns The first line, or the whole text when it contains no newline.
 */
const getFirstLineFromText = (text) => {
    const newlineAt = text.indexOf('\n');
    if (newlineAt === NOT_FOUND) {
        return text;
    }
    return text.substring(ZERO, newlineAt);
};
|
|
152
|
+
/**
 * Checks whether the text after a question is followed, after a nearby blank
 * line, by a short intro line ending in ':' and a '-' bullet list.
 *
 * @param afterQuestion - Text following the question mark.
 * @returns `true` when that response-options pattern is present.
 */
const hasResponseOptionsAfterDoubleNewline = (afterQuestion) => {
    const gapStart = afterQuestion.indexOf('\n\n');
    const gapIsNearby = gapStart !== NOT_FOUND && gapStart < DOUBLE_NEWLINE_DISTANCE_THRESHOLD;
    if (!gapIsNearby) {
        return false;
    }
    const followingText = afterQuestion.substring(gapStart + DOUBLE_NEWLINE_OFFSET);
    const introLine = getFirstLineFromText(followingText).trim();
    if (introLine.length >= SHORT_INTRO_THRESHOLD) {
        return false;
    }
    if (!introLine.endsWith(':')) {
        return false;
    }
    // A '-' at the start of a later line marks the list of options.
    return followingText.includes('\n-');
};
|
|
166
|
+
/**
 * Checks whether the character at `position` is a question mark that may be
 * used as a split point.
 *
 * @param remainingText - Text being scanned.
 * @param position - Character index to test.
 * @returns `false` for non-question-marks, for marks inside a bullet line or
 *   parentheses, and for marks followed by a response-options block.
 */
const isValidQuestionPosition = (remainingText, position) => {
    const isSplittableMark = isQuestionMark(remainingText, position) &&
        !isPositionInBulletLine(remainingText, position) &&
        !isPositionInsideParentheses(remainingText, position);
    if (!isSplittableMark) {
        return false;
    }
    const afterQuestion = remainingText.substring(position + INDEX_OFFSET);
    return !hasResponseOptionsAfterDoubleNewline(afterQuestion);
};
|
|
177
|
+
/**
 * Collects the indices of all question marks that are valid split points.
 *
 * @param remainingText - Text to scan.
 * @returns Indices in ascending order; empty when there are none.
 */
function findValidQuestionIndices(remainingText) {
    const validIndices = [];
    let position = ZERO;
    while (position < remainingText.length) {
        if (isValidQuestionPosition(remainingText, position)) {
            validIndices.push(position);
        }
        position += INCREMENT;
    }
    return validIndices;
}
|
|
190
|
+
/**
 * Looks for a question-mark split point in the remaining text.
 *
 * When the text has several valid question marks close together (no period
 * between the first and last, and only a short stretch of text), the run is
 * split after its last question; otherwise the first question is processed
 * on its own.
 *
 * @param remainingText - Text still to be split.
 * @param chunks - Output array; a chunk is appended when a split is made.
 * @returns `{ splitFound, newRemainingText }` describing the outcome.
 */
export function processQuestionMarks(remainingText, chunks) {
    const noSplit = { splitFound: false, newRemainingText: remainingText };
    const validIndices = findValidQuestionIndices(remainingText);
    if (validIndices.length === ZERO) {
        return noSplit;
    }
    const [firstQuestionIdx] = validIndices;
    const lastQuestionIdx = getLastElement(validIndices);
    if (firstQuestionIdx === undefined || lastQuestionIdx === undefined) {
        return noSplit;
    }
    let areContiguous = false;
    if (validIndices.length > INDEX_OFFSET) {
        const between = remainingText.substring(firstQuestionIdx + INDEX_OFFSET, lastQuestionIdx);
        areContiguous = !between.includes('.') &&
            smartTrim(between).length < CONTIGUOUS_QUESTIONS_TEXT_THRESHOLD;
    }
    return areContiguous
        ? processContiguousQuestions(remainingText, chunks, lastQuestionIdx)
        : processSingleQuestion(remainingText, chunks, firstQuestionIdx);
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/** Result of finding a markdown section at the start of a text */
|
|
2
|
+
export interface MarkdownSectionResult {
|
|
3
|
+
/** The header text including its `*…*` or `_…_` markers */
header: string;
|
|
4
|
+
/** The text that follows the header line, up to the end of the section */
content: string;
|
|
5
|
+
/** The matched header line (with its trailing whitespace/newline) followed by `content` */
fullSection: string;
|
|
6
|
+
}
|
|
7
|
+
/** Result of finding a list section at the start of a text */
|
|
8
|
+
export interface ListSectionResult {
|
|
9
|
+
/** Start offset of the list; the implementation in sections.js always reports 0 */
start: number;
|
|
10
|
+
/** Character length of the lines up to and including the last list item, joined with '\n' (0 when no item was found) */
end: number;
|
|
11
|
+
/** Which marker the text starts with: a numbered item or a bullet */
type: 'numbered' | 'bullet';
|
|
12
|
+
}
|
|
13
|
+
/**
 * Detects if text starts with a markdown section header
 * (a first line wrapped in `*…*` or `_…_`).
 * Returns the header and its content if found.
 *
 * @param text - Text to inspect; the header must be at the very start.
 * @returns The section parts, or `null` when the text does not start with a header line.
 */
|
|
17
|
+
export declare const findMarkdownSection: (text: string) => MarkdownSectionResult | null;
|
|
18
|
+
/**
 * Detects if text starts with a list section (numbered or bulleted).
 * Returns the boundaries of the list if found.
 *
 * @param text - Text to inspect; leading whitespace is ignored when looking for the first marker.
 * @returns The list boundaries and type, or `null` when the trimmed text starts with neither a numbered item nor a bullet.
 */
|
|
22
|
+
export declare const findListSection: (text: string) => ListSectionResult | null;
|
|
23
|
+
//# sourceMappingURL=sections.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sections.d.ts","sourceRoot":"","sources":["../../src/chatSplit/sections.ts"],"names":[],"mappings":"AAAA,2CAA2C;AAC3C,MAAM,WAAW,qBAAqB;IACpC,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;CACrB;AAED,uCAAuC;AACvC,MAAM,WAAW,iBAAiB;IAChC,KAAK,EAAE,MAAM,CAAC;IACd,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,EAAE,UAAU,GAAG,QAAQ,CAAC;CAC7B;AAiCD;;;GAGG;AACH,eAAO,MAAM,mBAAmB,GAAI,MAAM,MAAM,KAAG,qBAAqB,GAAG,IA4B1E,CAAC;AAgJF;;;GAGG;AACH,eAAO,MAAM,eAAe,GAAI,MAAM,MAAM,KAAG,iBAAiB,GAAG,IAiBlE,CAAC"}
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
/** Constants for section processing */
|
|
2
|
+
/** Sentinel for "no index": what indexOf returns on a miss, and the initial "no list item seen yet" line index */
const NOT_FOUND = -1;
|
|
3
|
+
/** Index of the first capture group in a regex match array (index 0 holds the full match) */
const FIRST_ELEMENT = 1;
|
|
4
|
+
/** Length of the '\n\n' separator; added to its index to step past it */
const DOUBLE_NEWLINE_LENGTH = 2;
|
|
5
|
+
/** Zero: start index for substring/slice calls and loop start */
const ZERO = 0;
|
|
6
|
+
/** Step by which line indices advance */
const INCREMENT = 1;
|
|
7
|
+
/** Check if text after double newline starts with a markdown header */
|
|
8
|
+
const startsWithMarkdownHeader = (text) => /^(?:\*[^*\n]+\*|_[^_\n]+_)\s*\n/v.test(text);
|
|
9
|
+
/** Check if text starts with a bullet list item */
|
|
10
|
+
const startsWithBullet = (text) => /^[\-•]\s+/v.test(text.trim());
|
|
11
|
+
/**
 * Chooses where a markdown section ends, based on what follows its first
 * blank line.
 *
 * @param afterDoubleNewline - Text following the first '\n\n' after the header.
 * @param doubleNewlineIndex - Index of that '\n\n' within the text after the header.
 * @param defaultEndIndex - Length of the text after the header (i.e. "to the end").
 * @returns `defaultEndIndex` when the text after the blank line starts with a
 *   bullet (and is not itself a header); otherwise `doubleNewlineIndex`.
 */
const determineEndIndex = (afterDoubleNewline, doubleNewlineIndex, defaultEndIndex) => {
    const continuesWithBullets = !startsWithMarkdownHeader(afterDoubleNewline) &&
        startsWithBullet(afterDoubleNewline);
    return continuesWithBullets ? defaultEndIndex : doubleNewlineIndex;
};
|
|
21
|
+
/** Get line at index safely */
|
|
22
|
+
const getLineAt = (lines, index) => {
    // Reads as undefined when the index is out of range.
    const line = lines[index];
    return line;
};
|
|
23
|
+
/**
|
|
24
|
+
* Detects if text starts with a markdown section header
|
|
25
|
+
* Returns the header and its content if found
|
|
26
|
+
*/
|
|
27
|
+
export const findMarkdownSection = (text) => {
|
|
28
|
+
const headerMatch = /^(?<header>\*[^*\n]+\*|_[^_\n]+_)\s*\n/v.exec(text);
|
|
29
|
+
if (headerMatch === null) {
|
|
30
|
+
return null;
|
|
31
|
+
}
|
|
32
|
+
const { groups } = headerMatch;
|
|
33
|
+
const header = groups?.header ?? headerMatch[FIRST_ELEMENT] ?? '';
|
|
34
|
+
const [matchedText = ''] = headerMatch;
|
|
35
|
+
const afterHeader = text.substring(matchedText.length);
|
|
36
|
+
const { length: defaultEndIndex } = afterHeader;
|
|
37
|
+
const doubleNewlineIndex = afterHeader.indexOf('\n\n');
|
|
38
|
+
const endIndex = doubleNewlineIndex === NOT_FOUND
|
|
39
|
+
? defaultEndIndex
|
|
40
|
+
: determineEndIndex(afterHeader.substring(doubleNewlineIndex + DOUBLE_NEWLINE_LENGTH), doubleNewlineIndex, defaultEndIndex);
|
|
41
|
+
const content = afterHeader.substring(ZERO, endIndex);
|
|
42
|
+
const fullSection = matchedText + content;
|
|
43
|
+
return { header, content, fullSection };
|
|
44
|
+
};
|
|
45
|
+
/** Test if line is a numbered list item */
|
|
46
|
+
const isNumberedListItem = (line) => /^\s*\d{1,2}\.\s+/v.test(line);
|
|
47
|
+
/** Test if line is a bullet list item */
|
|
48
|
+
const isBulletListItem = (line) => /^\s*[\-•]\s+/v.test(line);
|
|
49
|
+
/**
 * Finds the index of the first line at or after `startIndex` that contains
 * something other than whitespace.
 *
 * @param lines - Lines of the text.
 * @param startIndex - Index to start searching from (inclusive).
 * @returns The index of that line, or NOT_FOUND when every remaining line is blank.
 */
const findNextNonEmptyIndex = (lines, startIndex) => {
    let cursor = startIndex;
    while (cursor < lines.length) {
        const candidate = getLineAt(lines, cursor);
        const isBlank = candidate === undefined || candidate.trim() === '';
        if (!isBlank) {
            return cursor;
        }
        cursor += INCREMENT;
    }
    return NOT_FOUND;
};
|
|
59
|
+
/**
 * Checks whether the line at `nextNonEmptyIndex` continues a list, i.e. is a
 * numbered item or a bullet item.
 *
 * @param lines - Lines of the text.
 * @param nextNonEmptyIndex - Index of the next non-empty line, or NOT_FOUND.
 * @returns `false` when there is no such line or it is not a list item.
 */
const isListContinuation = (lines, nextNonEmptyIndex) => {
    if (nextNonEmptyIndex === NOT_FOUND) {
        return false;
    }
    const candidate = getLineAt(lines, nextNonEmptyIndex) ?? '';
    if (isNumberedListItem(candidate)) {
        return true;
    }
    return isBulletListItem(candidate);
};
|
|
66
|
+
/**
 * Processes one line while scanning a numbered list.
 *
 * @param line - The current line.
 * @param currentIndex - Index of `line` within `lines`.
 * @param lines - All lines of the text.
 * @param currentState - `{ inList, endLineIndex }` accumulated so far.
 * @returns `{ action, newState }`; an `action` of 'break' stops the scan, while
 *   'next' and 'continue' both let it move on to the following line.
 */
const processNumberedListLine = (line, currentIndex, lines, currentState) => {
    // A numbered item starts or extends the list.
    if (isNumberedListItem(line)) {
        return { action: 'next', newState: { inList: true, endLineIndex: currentIndex } };
    }
    // Anything before the first numbered item is skipped.
    if (!currentState.inList) {
        return { action: 'next', newState: currentState };
    }
    // Bullets nested inside a numbered list extend it.
    if (isBulletListItem(line)) {
        return { action: 'next', newState: { ...currentState, endLineIndex: currentIndex } };
    }
    // Any other non-blank line ends the list.
    if (line.trim() !== '') {
        return { action: 'break', newState: currentState };
    }
    // A blank line keeps the list open only if the next non-empty line is a list item.
    const upcoming = findNextNonEmptyIndex(lines, currentIndex + INCREMENT);
    const action = isListContinuation(lines, upcoming) ? 'continue' : 'break';
    return { action, newState: currentState };
};
|
|
86
|
+
/**
 * Scans the lines of a numbered list to find where the list ends.
 *
 * @param lines - The text split on '\n'.
 * @returns Character length of the lines up to and including the last list
 *   item, joined with '\n'; ZERO when no list item was found.
 */
const processNumberedList = (lines) => {
    let state = { endLineIndex: NOT_FOUND, inList: false };
    for (const [index, line] of lines.entries()) {
        if (line === undefined) {
            continue;
        }
        const step = processNumberedListLine(line, index, lines, state);
        state = step.newState;
        if (step.action === 'break') {
            break;
        }
    }
    if (state.endLineIndex < ZERO) {
        return ZERO;
    }
    return lines.slice(ZERO, state.endLineIndex + INCREMENT).join('\n').length;
};
|
|
102
|
+
/**
 * Processes one line while scanning a bullet list.
 *
 * @param line - The current line.
 * @param currentIndex - Index of `line` within `lines`.
 * @param lines - All lines of the text.
 * @param currentState - `{ inList, endLineIndex }` accumulated so far.
 * @returns `{ action, newState }`; an `action` of 'break' stops the scan, while
 *   'next' and 'continue' both let it move on to the following line.
 */
const processBulletListLine = (line, currentIndex, lines, currentState) => {
    // A bullet starts or extends the list. Only the line index is recorded;
    // the character offset is computed once, after the scan.
    if (startsWithBullet(line)) {
        return { action: 'next', newState: { inList: true, endLineIndex: currentIndex } };
    }
    // Anything before the first bullet is skipped.
    if (!currentState.inList) {
        return { action: 'next', newState: currentState };
    }
    // Any non-blank, non-bullet line ends the list.
    if (line.trim() !== '') {
        return { action: 'break', newState: currentState };
    }
    // A blank line keeps the list open only when the line immediately after it is a bullet.
    const nextLine = getLineAt(lines, currentIndex + INCREMENT);
    const continues = nextLine !== undefined && startsWithBullet(nextLine);
    return { action: continues ? 'continue' : 'break', newState: currentState };
};
/**
 * Scans the lines of a bullet list to find where the list ends.
 *
 * @param lines - The text split on '\n'.
 * @returns Character length of the lines up to and including the last bullet,
 *   joined with '\n'; ZERO when no bullet was found.
 */
const processBulletList = (lines) => {
    let state = { endLineIndex: NOT_FOUND, inList: false };
    for (let i = ZERO; i < lines.length; i += INCREMENT) {
        const line = getLineAt(lines, i);
        if (line === undefined)
            continue;
        const { action, newState } = processBulletListLine(line, i, lines, state);
        state = newState;
        if (action === 'break')
            break;
    }
    // Join once at the end instead of re-joining the prefix for every bullet line.
    return state.endLineIndex >= ZERO
        ? lines.slice(ZERO, state.endLineIndex + INCREMENT).join('\n').length
        : ZERO;
};
|
|
135
|
+
/**
|
|
136
|
+
* Detects if text is within a list section (numbered or bulleted)
|
|
137
|
+
* Returns the boundaries of the list if found
|
|
138
|
+
*/
|
|
139
|
+
export const findListSection = (text) => {
|
|
140
|
+
const numberedListStart = /^\d{1,2}\.\s+/v;
|
|
141
|
+
const bulletListStart = /^[\-•]\s+/v;
|
|
142
|
+
if (numberedListStart.test(text.trim())) {
|
|
143
|
+
const lines = text.split('\n');
|
|
144
|
+
const endIndex = processNumberedList(lines);
|
|
145
|
+
return { start: ZERO, end: endIndex, type: 'numbered' };
|
|
146
|
+
}
|
|
147
|
+
if (bulletListStart.test(text.trim())) {
|
|
148
|
+
const lines = text.split('\n');
|
|
149
|
+
const endIndex = processBulletList(lines);
|
|
150
|
+
return { start: ZERO, end: endIndex, type: 'bullet' };
|
|
151
|
+
}
|
|
152
|
+
return null;
|
|
153
|
+
};
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
/**
 * Splits chat text into smaller chunks for better readability.
 * Handles various patterns like lists, questions, markdown sections, etc.
 *
 * @param text - The text to split; `null` and `undefined` are accepted.
 * @returns The chunks as an array of strings.
 *   NOTE(review): the result for `null`/`undefined`/empty input is presumably an
 *   empty array - confirm against the implementation.
 */
|
|
5
|
+
export declare const splitChatText: (text: string | null | undefined) => string[];
|
|
6
|
+
//# sourceMappingURL=splitChatText.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"splitChatText.d.ts","sourceRoot":"","sources":["../../src/chatSplit/splitChatText.ts"],"names":[],"mappings":"AAoHA;;;GAGG;AACH,eAAO,MAAM,aAAa,GAAI,MAAM,MAAM,GAAG,IAAI,GAAG,SAAS,KAAG,MAAM,EAmCrE,CAAC"}
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import { processSectionBreaks } from './breakProcessor.js';
|
|
2
|
+
import { ZERO } from './constants.js';
|
|
3
|
+
import { normalizeInlineNumberedList, normalizeInlineProductCardList } from './listNormalization.js';
|
|
4
|
+
import { processListSection } from './listProcessor.js';
|
|
5
|
+
import { mergeSmallChunks } from './mergeProcessor.js';
|
|
6
|
+
import { processLongParagraphSequence, processLongParagraphsAfterIntro, processMarkdownSection, } from './paragraphProcessor.js';
|
|
7
|
+
import { processPeriodSplits } from './periodProcessor.js';
|
|
8
|
+
import { processProductCardLists } from './productCardProcessor.js';
|
|
9
|
+
import { normalizeSpanishPunctuation } from './punctuationNormalization.js';
|
|
10
|
+
import { processQuestionMarks } from './questionProcessor.js';
|
|
11
|
+
import { PERIOD_SPLIT_TEXT_THRESHOLD } from './splitConstants.js';
|
|
12
|
+
import { processIntroWithList, processIntroWithLongParagraphs, processQuestionWithList, } from './splitProcessors.js';
|
|
13
|
+
import { removePeriodsAfterURLs } from './urlNormalization.js';
|
|
14
|
+
/**
 * Runs the intro and list processors in order and stops at the first one that
 * reports a split.
 *
 * @param remainingText - Text still to be split.
 * @param chunks - Output array passed through to each processor.
 * @returns The first result with `splitFound` set, or `null` when none split.
 */
const runIntroAndListProcessors = (remainingText, chunks) => {
    const processors = [
        processIntroWithList,
        processQuestionWithList,
        processIntroWithLongParagraphs,
    ];
    for (const runProcessor of processors) {
        const outcome = runProcessor(remainingText, chunks);
        if (outcome.splitFound) {
            return outcome;
        }
    }
    return null;
};
|
|
27
|
+
/**
 * Tries the content-structure processors in priority order: product card
 * lists, list sections, long paragraphs after an intro, then long paragraph
 * sequences.
 *
 * @param remainingText - Text that has not been split yet.
 * @param chunks - Output array handed to each processor.
 * @returns The first processor result whose `splitFound` is truthy, or `null`
 *          when none of them reports a split.
 */
const runContentStructureProcessors = (remainingText, chunks) => {
    const orderedProcessors = [
        processProductCardLists,
        processListSection,
        processLongParagraphsAfterIntro,
        processLongParagraphSequence,
    ];
    for (const attempt of orderedProcessors) {
        const outcome = attempt(remainingText, chunks);
        if (outcome.splitFound) {
            return outcome;
        }
    }
    return null;
};
|
|
43
|
+
/**
 * Tries the formatting processors: markdown sections first, then section
 * breaks.
 *
 * @param remainingText - Text that has not been split yet.
 * @param chunks - Output array handed to each processor.
 * @returns The first processor result whose `splitFound` is truthy, or `null`
 *          when neither reports a split.
 */
const runFormattingProcessors = (remainingText, chunks) => {
    const markdownOutcome = processMarkdownSection(remainingText, chunks);
    if (markdownOutcome.splitFound) {
        return markdownOutcome;
    }
    const breakOutcome = processSectionBreaks(remainingText, chunks);
    return breakOutcome.splitFound ? breakOutcome : null;
};
|
|
53
|
+
/**
 * Runs the three processor stages (intro/list, content structure, formatting)
 * in order and returns the first split any of them reports.
 *
 * Each stage yields either a result object or `null`, so nullish coalescing
 * falls through to the next stage exactly when the previous one found nothing.
 *
 * @param remainingText - Text that has not been split yet.
 * @param chunks - Output array handed to each processor.
 * @returns The winning stage result, or `{ splitFound: false }` carrying the
 *          untouched `remainingText` when no stage matched.
 */
const runSplitProcessors = (remainingText, chunks) =>
    runIntroAndListProcessors(remainingText, chunks) ??
    runContentStructureProcessors(remainingText, chunks) ??
    runFormattingProcessors(remainingText, chunks) ?? {
        splitFound: false,
        newRemainingText: remainingText,
    };
|
|
68
|
+
/**
 * Sentence-level fallback: tries splitting on question marks and, only when
 * the text is longer than PERIOD_SPLIT_TEXT_THRESHOLD, on periods.
 *
 * @param remainingText - Text that has not been split yet.
 * @param chunks - Output array handed to each processor.
 * @returns The first result whose `splitFound` is truthy, otherwise
 *          `{ splitFound: false }` carrying the untouched `remainingText`.
 */
const runQuestionAndPeriodProcessors = (remainingText, chunks) => {
    const noSplit = { splitFound: false, newRemainingText: remainingText };
    const questionOutcome = processQuestionMarks(remainingText, chunks);
    if (questionOutcome.splitFound) {
        return questionOutcome;
    }
    const isLongEnoughForPeriods = remainingText.length > PERIOD_SPLIT_TEXT_THRESHOLD;
    if (!isLongEnoughForPeriods) {
        return noSplit;
    }
    const periodOutcome = processPeriodSplits(remainingText, chunks);
    return periodOutcome.splitFound ? periodOutcome : noSplit;
};
|
|
84
|
+
/**
 * Normalizes text before splitting by piping it through, in this order:
 * `removePeriodsAfterURLs`, `normalizeInlineNumberedList`,
 * `normalizeInlineProductCardList`.
 *
 * @param text - Raw input text.
 * @returns The text after all three normalizers have run.
 */
const preProcessText = (text) => {
    const normalizers = [
        removePeriodsAfterURLs,
        normalizeInlineNumberedList,
        normalizeInlineProductCardList,
    ];
    // Call each normalizer with a single argument so reduce's extra callback
    // parameters (index, array) are never forwarded.
    return normalizers.reduce((current, normalize) => normalize(current), text);
};
|
|
91
|
+
/**
 * Splits chat text into smaller chunks for better readability.
 *
 * The text is pre-processed, then consumed in a loop: structural processors
 * are tried first, question/period processors second, and whatever is left
 * when nothing matches becomes the final chunk. Small chunks are merged and
 * Spanish punctuation is normalized on every chunk before returning.
 *
 * @param text - Raw chat text; may be `null` or `undefined`.
 * @returns The chunks in order; an empty array for `null`, `undefined` or
 *          empty input.
 */
export const splitChatText = (text) => {
    if (text === undefined || text === null || text.length === ZERO) {
        return [];
    }
    const chunks = [];
    let pending = preProcessText(text);
    while (pending !== '') {
        const structural = runSplitProcessors(pending, chunks);
        if (structural.splitFound) {
            pending = structural.newRemainingText;
            continue;
        }
        const sentenceLevel = runQuestionAndPeriodProcessors(pending, chunks);
        if (!sentenceLevel.splitFound) {
            // Nothing can split the rest: emit it as the last chunk.
            chunks.push(pending);
            break;
        }
        pending = sentenceLevel.newRemainingText;
    }
    return mergeSmallChunks(chunks).map((chunk) => normalizeSpanishPunctuation(chunk));
};
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"splitConstants.d.ts","sourceRoot":"","sources":["../../src/chatSplit/splitConstants.ts"],"names":[],"mappings":"AAAA,gDAAgD;AAChD,eAAO,MAAM,2BAA2B,MAAM,CAAC"}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
 * Result object returned by the split processors declared in this file.
 */
|
|
2
|
+
export interface SplitResult {
|
|
3
|
+
/** Whether the processor found a split point. */
splitFound: boolean;
|
|
4
|
+
/**
 * Text still left to process. The three processors declared in this file
 * return their input unchanged here when `splitFound` is false.
 */
newRemainingText: string;
|
|
5
|
+
}
|
|
6
|
+
/**
 * Checks if text has a question with response options pattern: text up to a
 * first `?`, one or more line breaks, a later "Puedes responder con:" or
 * "puedes responder con:" prompt, and after that a line starting with `-`.
 *
 * @param text - Text to test, matched from its very beginning.
 * @returns `true` when the pattern is present.
 */
|
|
9
|
+
export declare const hasQuestionWithOptionsPattern: (text: string) => boolean;
|
|
10
|
+
/**
 * Processes intro with list pattern and returns split info.
 *
 * The implementation looks for a first line containing `:` that is followed
 * by a list item (1-2 digits + `.`, or `-` / `•`). When the intro is shorter
 * than MAX_INTRO_LENGTH it is pushed onto `chunks` and the trimmed rest of the
 * text is returned as `newRemainingText`.
 *
 * @param remainingText - Text that has not been split yet.
 * @param chunks - Output array; receives the intro on a successful split.
 */
|
|
13
|
+
export declare const processIntroWithList: (remainingText: string, chunks: string[]) => SplitResult;
|
|
14
|
+
/**
 * Processes question followed by numbered list (answer options).
 *
 * The implementation keeps question and options together: when every line
 * after the question is blank or a numbered item, there are at least two
 * items, and the whole text is shorter than MAX_QUESTION_WITH_OPTIONS_LENGTH,
 * the entire text is pushed as one chunk and `newRemainingText` is empty.
 *
 * @param remainingText - Text that has not been split yet.
 * @param chunks - Output array; receives the whole text on a successful split.
 */
|
|
17
|
+
export declare const processQuestionWithList: (remainingText: string, chunks: string[]) => SplitResult;
|
|
18
|
+
/**
 * Processes intro followed by long paragraphs.
 *
 * The implementation splits when the first line (its newline found before
 * FIRST_NEWLINE_SEARCH_LIMIT) ends with `:` and the following line is longer
 * than 150 characters after trimming. The trimmed first line is pushed onto
 * `chunks` and everything after the first newline is returned untrimmed.
 *
 * @param remainingText - Text that has not been split yet.
 * @param chunks - Output array; receives the intro line on a successful split.
 */
|
|
21
|
+
export declare const processIntroWithLongParagraphs: (remainingText: string, chunks: string[]) => SplitResult;
|
|
22
|
+
//# sourceMappingURL=splitProcessors.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"splitProcessors.d.ts","sourceRoot":"","sources":["../../src/chatSplit/splitProcessors.ts"],"names":[],"mappings":"AAmBA,wBAAwB;AACxB,MAAM,WAAW,WAAW;IAC1B,UAAU,EAAE,OAAO,CAAC;IACpB,gBAAgB,EAAE,MAAM,CAAC;CAC1B;AAED;;GAEG;AACH,eAAO,MAAM,6BAA6B,GAAI,MAAM,MAAM,KAAG,OACmC,CAAC;AAuCjG;;GAEG;AACH,eAAO,MAAM,oBAAoB,GAAI,eAAe,MAAM,EAAE,QAAQ,MAAM,EAAE,KAAG,WAkB9E,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,uBAAuB,GAAI,eAAe,MAAM,EAAE,QAAQ,MAAM,EAAE,KAAG,WA0BjF,CAAC;AAMF;;GAEG;AACH,eAAO,MAAM,8BAA8B,GAAI,eAAe,MAAM,EAAE,QAAQ,MAAM,EAAE,KAAG,WAuBxF,CAAC"}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import { FIRST_NEWLINE_SEARCH_LIMIT, INDEX_OFFSET, MAX_INTRO_LENGTH, MAX_QUESTION_WITH_OPTIONS_LENGTH, } from './constants.js';
|
|
2
|
+
/** A trimmed first paragraph must be longer than this many characters to count as "long" */
|
|
3
|
+
const LONG_PARAGRAPH_THRESHOLD = 150;
|
|
4
|
+
/** Minimum list items for question with options */
|
|
5
|
+
const MIN_LIST_ITEMS_FOR_OPTIONS = 2;
|
|
6
|
+
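/** Sentinel and index constants: NOT_FOUND is the "no match" value (-1, as returned by indexOf); FIRST_MATCH and SECOND_MATCH are capture-group positions in a regex match array */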
|
|
7
|
+
const NOT_FOUND = -1;
|
|
8
|
+
const ZERO = 0;
|
|
9
|
+
const FIRST_MATCH = 1;
|
|
10
|
+
const SECOND_MATCH = 2;
|
|
11
|
+
/**
|
|
12
|
+
* Checks if text has a question with response options pattern
|
|
13
|
+
*/
|
|
14
|
+
export const hasQuestionWithOptionsPattern = (text) => /^[^?]+\?\s*\n+[\s\S]*?(?:Puedes responder con|puedes responder con):[\s\S]*?\n+-/v.test(text);
|
|
15
|
+
/**
 * Reports whether a line, ignoring surrounding whitespace, ends with `:`.
 *
 * @param line - The line to inspect; `null` and `undefined` yield `false`.
 * @returns `true` only when the trimmed line ends with a colon.
 */
const lineEndsWithColon = (line) => {
    if (line === undefined || line === null) {
        return false;
    }
    return line.trim().endsWith(':');
};
|
|
17
|
+
/**
 * Scans `lines` from the end towards the start for a line ending with `:`.
 *
 * @param lines - Lines to search.
 * @returns The index of the last line that ends with a colon, or NOT_FOUND
 *          when no line does.
 */
const findLastColonLineIndex = (lines) => {
    let cursor = lines.length - INDEX_OFFSET;
    while (cursor >= ZERO) {
        if (lineEndsWithColon(lines[cursor])) {
            return cursor;
        }
        cursor -= INDEX_OFFSET;
    }
    return NOT_FOUND;
};
|
|
26
|
+
/**
 * Rebuilds the intro text from a regex match by joining the part up to the
 * colon with whatever follows it on the same line.
 *
 * Named groups (`intro`, `afterColon`) are preferred; the positional captures
 * and finally the empty string serve as fallbacks.
 *
 * @param match - Match array produced by the intro-with-list regex.
 * @returns The concatenated intro text.
 */
const extractIntro = (match) => {
    const named = match.groups;
    const upToColon = named?.intro ?? match[FIRST_MATCH] ?? '';
    const restOfLine = named?.afterColon ?? match[SECOND_MATCH] ?? '';
    return upToColon + restOfLine;
};
|
|
33
|
+
/**
 * When the intro mentions a "Puedes responder con:" / "puedes responder con:"
 * prompt, cuts it so that it ends on its last line that ends with a colon.
 *
 * @param intro - Intro text extracted from the match.
 * @returns The intro truncated after its last colon-terminated line, or the
 *          intro unchanged when there is no prompt or no such line.
 */
const adjustIntroForResponsePrompt = (intro) => {
    const promptVariants = ['Puedes responder con:', 'puedes responder con:'];
    const mentionsPrompt = promptVariants.some((variant) => intro.includes(variant));
    if (!mentionsPrompt) {
        return intro;
    }
    const introLines = intro.split('\n');
    const cutoff = findLastColonLineIndex(introLines);
    return cutoff >= ZERO
        ? introLines.slice(ZERO, cutoff + INDEX_OFFSET).join('\n')
        : intro;
};
|
|
45
|
+
/**
|
|
46
|
+
* Processes intro with list pattern and returns split info
|
|
47
|
+
*/
|
|
48
|
+
export const processIntroWithList = (remainingText, chunks) => {
|
|
49
|
+
const match = /^(?<intro>.+?:)(?<afterColon>[^\n]*?)\n+(?<listStart>\d{1,2}\.\s+|[\-•]\s+)/v.exec(remainingText);
|
|
50
|
+
if (match === null) {
|
|
51
|
+
return { splitFound: false, newRemainingText: remainingText };
|
|
52
|
+
}
|
|
53
|
+
const intro = extractIntro(match);
|
|
54
|
+
const finalIntro = adjustIntroForResponsePrompt(intro);
|
|
55
|
+
if (finalIntro.length < MAX_INTRO_LENGTH) {
|
|
56
|
+
chunks.push(finalIntro);
|
|
57
|
+
return { splitFound: true, newRemainingText: remainingText.substring(finalIntro.length).trim() };
|
|
58
|
+
}
|
|
59
|
+
return { splitFound: false, newRemainingText: remainingText };
|
|
60
|
+
};
|
|
61
|
+
/**
|
|
62
|
+
* Processes question followed by numbered list (answer options)
|
|
63
|
+
*/
|
|
64
|
+
export const processQuestionWithList = (remainingText, chunks) => {
|
|
65
|
+
const match = /^(?<question>[\s\S]*?\?[^\n]*?)\n(?<list>\d{1,2}\.\s+[\s\S]*)/v.exec(remainingText);
|
|
66
|
+
if (match === null) {
|
|
67
|
+
return { splitFound: false, newRemainingText: remainingText };
|
|
68
|
+
}
|
|
69
|
+
const { groups } = match;
|
|
70
|
+
const listPart = groups?.list ?? match[SECOND_MATCH] ?? '';
|
|
71
|
+
const listLines = listPart.split('\n');
|
|
72
|
+
const allLinesAreListItems = listLines.every((line) => line.trim() === '' || /^\d{1,2}\.\s+/v.test(line.trim()));
|
|
73
|
+
const { length: listItemCount } = listLines.filter((l) => /^\d{1,2}\.\s+/v.test(l.trim()));
|
|
74
|
+
const shouldProcess = allLinesAreListItems &&
|
|
75
|
+
remainingText.length < MAX_QUESTION_WITH_OPTIONS_LENGTH &&
|
|
76
|
+
listItemCount >= MIN_LIST_ITEMS_FOR_OPTIONS;
|
|
77
|
+
if (shouldProcess) {
|
|
78
|
+
chunks.push(remainingText);
|
|
79
|
+
return { splitFound: true, newRemainingText: '' };
|
|
80
|
+
}
|
|
81
|
+
return { splitFound: false, newRemainingText: remainingText };
|
|
82
|
+
};
|
|
83
|
+
/** Check if intro ends with list pattern */
|
|
84
|
+
const endsWithListPattern = (intro) => /\d{1,2}\.\s*$/v.exec(intro) !== null || /[\-•]\s*$/v.exec(intro) !== null;
|
|
85
|
+
/**
 * Splits off a colon-terminated intro line when the line right after it is a
 * long paragraph.
 *
 * @param remainingText - Text that has not been split yet.
 * @param chunks - Output array; the trimmed intro is pushed onto it on success.
 * @returns `splitFound: true` with everything after the first newline
 *          (untrimmed), or `splitFound: false` with `remainingText` untouched.
 */
export const processIntroWithLongParagraphs = (remainingText, chunks) => {
    const noSplit = { splitFound: false, newRemainingText: remainingText };
    const newlineAt = remainingText.indexOf('\n');
    const newlineMissingOrTooFar = newlineAt === NOT_FOUND || newlineAt >= FIRST_NEWLINE_SEARCH_LIMIT;
    if (newlineMissingOrTooFar) {
        return noSplit;
    }
    const intro = remainingText.substring(ZERO, newlineAt).trim();
    // NOTE(review): a trimmed line ending in ':' cannot also end with a list
    // marker, so the second check looks purely defensive.
    const isPlainIntro = intro.endsWith(':') && !endsWithListPattern(intro);
    if (!isPlainIntro) {
        return noSplit;
    }
    const afterIntro = remainingText.substring(newlineAt + INDEX_OFFSET);
    // Only the line immediately after the intro is measured.
    const leadingParagraph = (afterIntro.split('\n')[ZERO] ?? '').trim();
    if (leadingParagraph.length > LONG_PARAGRAPH_THRESHOLD) {
        chunks.push(intro);
        return { splitFound: true, newRemainingText: afterIntro };
    }
    return noSplit;
};
|