@daviddh/llm-markdown-whatsapp 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. package/.prettierrc +17 -0
  2. package/CLAUDE.md +155 -0
  3. package/README.md +304 -0
  4. package/eslint.config.mjs +28 -0
  5. package/jest.config.js +40 -0
  6. package/package.json +61 -0
  7. package/packages/core/dist/__tests__/splitChatText.basic.test.d.ts +2 -0
  8. package/packages/core/dist/__tests__/splitChatText.basic.test.d.ts.map +1 -0
  9. package/packages/core/dist/__tests__/splitChatText.basic.test.js +100 -0
  10. package/packages/core/dist/__tests__/splitChatText.coverageLists.test.d.ts +2 -0
  11. package/packages/core/dist/__tests__/splitChatText.coverageLists.test.d.ts.map +1 -0
  12. package/packages/core/dist/__tests__/splitChatText.coverageLists.test.js +88 -0
  13. package/packages/core/dist/__tests__/splitChatText.coverageProcessors.test.d.ts +2 -0
  14. package/packages/core/dist/__tests__/splitChatText.coverageProcessors.test.d.ts.map +1 -0
  15. package/packages/core/dist/__tests__/splitChatText.coverageProcessors.test.js +108 -0
  16. package/packages/core/dist/__tests__/splitChatText.coverageQuestions.test.d.ts +2 -0
  17. package/packages/core/dist/__tests__/splitChatText.coverageQuestions.test.d.ts.map +1 -0
  18. package/packages/core/dist/__tests__/splitChatText.coverageQuestions.test.js +74 -0
  19. package/packages/core/dist/__tests__/splitChatText.dataProtection.test.d.ts +2 -0
  20. package/packages/core/dist/__tests__/splitChatText.dataProtection.test.d.ts.map +1 -0
  21. package/packages/core/dist/__tests__/splitChatText.dataProtection.test.js +80 -0
  22. package/packages/core/dist/__tests__/splitChatText.dataTests1.test.d.ts +2 -0
  23. package/packages/core/dist/__tests__/splitChatText.dataTests1.test.d.ts.map +1 -0
  24. package/packages/core/dist/__tests__/splitChatText.dataTests1.test.js +124 -0
  25. package/packages/core/dist/__tests__/splitChatText.dataTests2.test.d.ts +2 -0
  26. package/packages/core/dist/__tests__/splitChatText.dataTests2.test.d.ts.map +1 -0
  27. package/packages/core/dist/__tests__/splitChatText.dataTests2.test.js +122 -0
  28. package/packages/core/dist/__tests__/splitChatText.edgeCases.test.d.ts +2 -0
  29. package/packages/core/dist/__tests__/splitChatText.edgeCases.test.d.ts.map +1 -0
  30. package/packages/core/dist/__tests__/splitChatText.edgeCases.test.js +132 -0
  31. package/packages/core/dist/__tests__/splitChatText.helpers.d.ts +2 -0
  32. package/packages/core/dist/__tests__/splitChatText.helpers.d.ts.map +1 -0
  33. package/packages/core/dist/__tests__/splitChatText.helpers.js +5 -0
  34. package/packages/core/dist/__tests__/splitChatText.punctuation.test.d.ts +2 -0
  35. package/packages/core/dist/__tests__/splitChatText.punctuation.test.d.ts.map +1 -0
  36. package/packages/core/dist/__tests__/splitChatText.punctuation.test.js +98 -0
  37. package/packages/core/dist/__tests__/splitChatText.realWorld.test.d.ts +2 -0
  38. package/packages/core/dist/__tests__/splitChatText.realWorld.test.d.ts.map +1 -0
  39. package/packages/core/dist/__tests__/splitChatText.realWorld.test.js +104 -0
  40. package/packages/core/dist/__tests__/splitChatText.urlProtection.test.d.ts +2 -0
  41. package/packages/core/dist/__tests__/splitChatText.urlProtection.test.d.ts.map +1 -0
  42. package/packages/core/dist/__tests__/splitChatText.urlProtection.test.js +82 -0
  43. package/packages/core/dist/__tests__/strs.splitChatText.test.d.ts +2 -0
  44. package/packages/core/dist/__tests__/strs.splitChatText.test.d.ts.map +1 -0
  45. package/packages/core/dist/__tests__/strs.splitChatText.test.js +992 -0
  46. package/packages/core/dist/chatSplit/breakProcessor.d.ts +4 -0
  47. package/packages/core/dist/chatSplit/breakProcessor.d.ts.map +1 -0
  48. package/packages/core/dist/chatSplit/breakProcessor.js +67 -0
  49. package/packages/core/dist/chatSplit/constants.d.ts +35 -0
  50. package/packages/core/dist/chatSplit/constants.d.ts.map +1 -0
  51. package/packages/core/dist/chatSplit/constants.js +34 -0
  52. package/packages/core/dist/chatSplit/index.d.ts +2 -0
  53. package/packages/core/dist/chatSplit/index.d.ts.map +1 -0
  54. package/packages/core/dist/chatSplit/index.js +1 -0
  55. package/packages/core/dist/chatSplit/listNormalization.d.ts +13 -0
  56. package/packages/core/dist/chatSplit/listNormalization.d.ts.map +1 -0
  57. package/packages/core/dist/chatSplit/listNormalization.js +140 -0
  58. package/packages/core/dist/chatSplit/listProcessor.d.ts +6 -0
  59. package/packages/core/dist/chatSplit/listProcessor.d.ts.map +1 -0
  60. package/packages/core/dist/chatSplit/listProcessor.js +61 -0
  61. package/packages/core/dist/chatSplit/mergeProcessor.d.ts +3 -0
  62. package/packages/core/dist/chatSplit/mergeProcessor.d.ts.map +1 -0
  63. package/packages/core/dist/chatSplit/mergeProcessor.js +88 -0
  64. package/packages/core/dist/chatSplit/paragraphProcessor.d.ts +14 -0
  65. package/packages/core/dist/chatSplit/paragraphProcessor.d.ts.map +1 -0
  66. package/packages/core/dist/chatSplit/paragraphProcessor.js +66 -0
  67. package/packages/core/dist/chatSplit/periodProcessor.d.ts +4 -0
  68. package/packages/core/dist/chatSplit/periodProcessor.d.ts.map +1 -0
  69. package/packages/core/dist/chatSplit/periodProcessor.js +110 -0
  70. package/packages/core/dist/chatSplit/positionHelpers.d.ts +12 -0
  71. package/packages/core/dist/chatSplit/positionHelpers.d.ts.map +1 -0
  72. package/packages/core/dist/chatSplit/positionHelpers.js +57 -0
  73. package/packages/core/dist/chatSplit/productCardProcessor.d.ts +12 -0
  74. package/packages/core/dist/chatSplit/productCardProcessor.d.ts.map +1 -0
  75. package/packages/core/dist/chatSplit/productCardProcessor.js +138 -0
  76. package/packages/core/dist/chatSplit/punctuationNormalization.d.ts +5 -0
  77. package/packages/core/dist/chatSplit/punctuationNormalization.d.ts.map +1 -0
  78. package/packages/core/dist/chatSplit/punctuationNormalization.js +103 -0
  79. package/packages/core/dist/chatSplit/questionProcessor.d.ts +6 -0
  80. package/packages/core/dist/chatSplit/questionProcessor.d.ts.map +1 -0
  81. package/packages/core/dist/chatSplit/questionProcessor.js +212 -0
  82. package/packages/core/dist/chatSplit/sections.d.ts +23 -0
  83. package/packages/core/dist/chatSplit/sections.d.ts.map +1 -0
  84. package/packages/core/dist/chatSplit/sections.js +153 -0
  85. package/packages/core/dist/chatSplit/splitChatText.d.ts +6 -0
  86. package/packages/core/dist/chatSplit/splitChatText.d.ts.map +1 -0
  87. package/packages/core/dist/chatSplit/splitChatText.js +119 -0
  88. package/packages/core/dist/chatSplit/splitConstants.d.ts +3 -0
  89. package/packages/core/dist/chatSplit/splitConstants.d.ts.map +1 -0
  90. package/packages/core/dist/chatSplit/splitConstants.js +2 -0
  91. package/packages/core/dist/chatSplit/splitProcessors.d.ts +22 -0
  92. package/packages/core/dist/chatSplit/splitProcessors.d.ts.map +1 -0
  93. package/packages/core/dist/chatSplit/splitProcessors.js +105 -0
  94. package/packages/core/dist/chatSplit/textHelpers.d.ts +27 -0
  95. package/packages/core/dist/chatSplit/textHelpers.d.ts.map +1 -0
  96. package/packages/core/dist/chatSplit/textHelpers.js +77 -0
  97. package/packages/core/dist/chatSplit/urlNormalization.d.ts +7 -0
  98. package/packages/core/dist/chatSplit/urlNormalization.d.ts.map +1 -0
  99. package/packages/core/dist/chatSplit/urlNormalization.js +13 -0
  100. package/packages/core/dist/index.d.ts +2 -0
  101. package/packages/core/dist/index.d.ts.map +1 -0
  102. package/packages/core/dist/index.js +1 -0
  103. package/packages/core/jest.config.js +23 -0
  104. package/packages/core/package.json +38 -0
  105. package/packages/core/src/__tests__/splitChatText.basic.test.ts +123 -0
  106. package/packages/core/src/__tests__/splitChatText.coverageLists.test.ts +108 -0
  107. package/packages/core/src/__tests__/splitChatText.coverageProcessors.test.ts +172 -0
  108. package/packages/core/src/__tests__/splitChatText.coverageQuestions.test.ts +95 -0
  109. package/packages/core/src/__tests__/splitChatText.dataProtection.test.ts +96 -0
  110. package/packages/core/src/__tests__/splitChatText.dataTests1.test.ts +137 -0
  111. package/packages/core/src/__tests__/splitChatText.dataTests2.test.ts +134 -0
  112. package/packages/core/src/__tests__/splitChatText.edgeCases.test.ts +157 -0
  113. package/packages/core/src/__tests__/splitChatText.helpers.ts +6 -0
  114. package/packages/core/src/__tests__/splitChatText.punctuation.test.ts +113 -0
  115. package/packages/core/src/__tests__/splitChatText.realWorld.test.ts +118 -0
  116. package/packages/core/src/__tests__/splitChatText.urlProtection.test.ts +102 -0
  117. package/packages/core/src/chatSplit/breakProcessor.ts +103 -0
  118. package/packages/core/src/chatSplit/constants.ts +50 -0
  119. package/packages/core/src/chatSplit/index.ts +1 -0
  120. package/packages/core/src/chatSplit/listNormalization.ts +189 -0
  121. package/packages/core/src/chatSplit/listProcessor.ts +74 -0
  122. package/packages/core/src/chatSplit/mergeProcessor.ts +124 -0
  123. package/packages/core/src/chatSplit/paragraphProcessor.ts +86 -0
  124. package/packages/core/src/chatSplit/periodProcessor.ts +148 -0
  125. package/packages/core/src/chatSplit/positionHelpers.ts +66 -0
  126. package/packages/core/src/chatSplit/productCardProcessor.ts +184 -0
  127. package/packages/core/src/chatSplit/punctuationNormalization.ts +142 -0
  128. package/packages/core/src/chatSplit/questionProcessor.ts +298 -0
  129. package/packages/core/src/chatSplit/sections.ts +243 -0
  130. package/packages/core/src/chatSplit/splitChatText.ts +156 -0
  131. package/packages/core/src/chatSplit/splitConstants.ts +2 -0
  132. package/packages/core/src/chatSplit/splitProcessors.ts +153 -0
  133. package/packages/core/src/chatSplit/textHelpers.ts +86 -0
  134. package/packages/core/src/chatSplit/urlNormalization.ts +17 -0
  135. package/packages/core/src/index.ts +1 -0
  136. package/packages/core/tsconfig.build.json +4 -0
  137. package/packages/core/tsconfig.json +25 -0
  138. package/tsconfig.json +19 -0
@@ -0,0 +1,243 @@
1
+ /** Result of finding a markdown section */
2
+ export interface MarkdownSectionResult {
3
+ header: string;
4
+ content: string;
5
+ fullSection: string;
6
+ }
7
+
8
+ /** Result of finding a list section */
9
+ export interface ListSectionResult {
10
+ start: number;
11
+ end: number;
12
+ type: 'numbered' | 'bullet';
13
+ }
14
+
15
+ /** Constants for section processing */
16
+ const NOT_FOUND = -1;
17
+ const FIRST_ELEMENT = 1;
18
+ const DOUBLE_NEWLINE_LENGTH = 2;
19
+ const ZERO = 0;
20
+ const INCREMENT = 1;
21
+
22
+ /** Check if text after double newline starts with a markdown header */
23
+ const startsWithMarkdownHeader = (text: string): boolean => /^(?:\*[^*\n]+\*|_[^_\n]+_)\s*\n/v.test(text);
24
+
25
+ /** Check if text starts with a bullet list item */
26
+ const startsWithBullet = (text: string): boolean => /^[\-•]\s+/v.test(text.trim());
27
+
28
/**
 * Chooses where a markdown section ends once a blank line has been found.
 *
 * @param afterDoubleNewline - Text that follows the blank line.
 * @param doubleNewlineIndex - Offset of the blank line inside the section body.
 * @param defaultEndIndex - Offset to use when the section keeps going.
 * @returns `defaultEndIndex` only when the text after the blank line is a bullet
 *          item (and not a new markdown header); otherwise `doubleNewlineIndex`.
 */
const determineEndIndex = (
  afterDoubleNewline: string,
  doubleNewlineIndex: number,
  defaultEndIndex: number
): number => {
  // The header check runs first so a new header always closes the section.
  const continuesAsBulletList =
    !startsWithMarkdownHeader(afterDoubleNewline) && startsWithBullet(afterDoubleNewline);

  return continuesAsBulletList ? defaultEndIndex : doubleNewlineIndex;
};
42
+
43
+ /** Get line at index safely */
44
+ const getLineAt = (lines: string[], index: number): string | undefined => lines[index];
45
+
46
/**
 * Extracts a markdown section that starts at the very beginning of `text`.
 *
 * A section starts with a header line wrapped in `*…*` or `_…_` followed by a
 * newline. Its content runs up to the first blank line, unless the text after
 * that blank line is a bullet item, in which case the content runs to the end
 * of `text`.
 *
 * @param text - Text to inspect.
 * @returns The header, its content and both concatenated, or `null` when `text`
 *          does not start with a header line.
 */
export const findMarkdownSection = (text: string): MarkdownSectionResult | null => {
  const headerMatch = /^(?<header>\*[^*\n]+\*|_[^_\n]+_)\s*\n/v.exec(text);
  if (headerMatch === null) {
    return null;
  }

  const header = headerMatch.groups?.header ?? headerMatch[FIRST_ELEMENT] ?? '';
  // Full match: the header plus the whitespace/newline that terminates it.
  const matchedText = headerMatch[ZERO] ?? '';
  const body = text.slice(matchedText.length);

  const blankLineAt = body.indexOf('\n\n');
  let sectionEnd = body.length;
  if (blankLineAt !== NOT_FOUND) {
    sectionEnd = determineEndIndex(
      body.slice(blankLineAt + DOUBLE_NEWLINE_LENGTH),
      blankLineAt,
      body.length
    );
  }

  const content = body.slice(ZERO, sectionEnd);
  return { header, content, fullSection: matchedText + content };
};
79
+
80
+ /** Test if line is a numbered list item */
81
+ const isNumberedListItem = (line: string): boolean => /^\s*\d{1,2}\.\s+/v.test(line);
82
+
83
+ /** Test if line is a bullet list item */
84
+ const isBulletListItem = (line: string): boolean => /^\s*[\-•]\s+/v.test(line);
85
+
86
/**
 * Locates the first line at or after `startIndex` that is not blank.
 *
 * @param lines - Lines to scan.
 * @param startIndex - First index to consider.
 * @returns The index of that line, or -1 (the not-found sentinel) when every
 *          remaining line is empty or whitespace-only.
 */
const findNextNonEmptyIndex = (lines: string[], startIndex: number): number =>
  lines.findIndex(
    (candidate, position) =>
      position >= startIndex && candidate !== undefined && candidate.trim() !== ''
  );
96
+
97
+ /** Check if next line is a list continuation */
98
+ const isListContinuation = (lines: string[], nextNonEmptyIndex: number): boolean => {
99
+ if (nextNonEmptyIndex === NOT_FOUND) return false;
100
+ const nextLine = getLineAt(lines, nextNonEmptyIndex) ?? '';
101
+ return isNumberedListItem(nextLine) || isBulletListItem(nextLine);
102
+ };
103
+
104
+ /** Numbered list state */
105
+ interface NumberedListState {
106
+ endLineIndex: number;
107
+ inList: boolean;
108
+ }
109
+
110
+ /** Process numbered list line result */
111
+ interface NumberedListLineResult {
112
+ action: 'continue' | 'break' | 'next';
113
+ newState: NumberedListState;
114
+ }
115
+
116
/**
 * Classifies one line while scanning for the end of a numbered list.
 *
 * @param line - The line being examined.
 * @param currentIndex - Index of `line` inside `lines`.
 * @param lines - All lines of the text being scanned.
 * @param currentState - State accumulated from the previous lines.
 * @returns The action for the scanning loop plus the state to carry forward:
 *          - numbered item: `'next'`, list is active and ends at this line;
 *          - anything before the first numbered item: `'next'`, state untouched;
 *          - bullet item inside the list: `'next'`, list end moves to this line;
 *          - blank line inside the list: `'continue'` when the next non-empty
 *            line is a list item, otherwise `'break'`;
 *          - any other line inside the list: `'break'`.
 */
const processNumberedListLine = (
  line: string,
  currentIndex: number,
  lines: string[],
  currentState: NumberedListState
): NumberedListLineResult => {
  if (isNumberedListItem(line)) {
    return { action: 'next', newState: { inList: true, endLineIndex: currentIndex } };
  }

  // Nothing below applies until the first numbered item has been seen.
  if (!currentState.inList) {
    return { action: 'next', newState: currentState };
  }

  if (isBulletListItem(line)) {
    return { action: 'next', newState: { ...currentState, endLineIndex: currentIndex } };
  }

  if (line.trim() !== '') {
    return { action: 'break', newState: currentState };
  }

  // Blank line: the list survives only if a list item follows the gap.
  const upcomingIndex = findNextNonEmptyIndex(lines, currentIndex + INCREMENT);
  return {
    action: isListContinuation(lines, upcomingIndex) ? 'continue' : 'break',
    newState: currentState,
  };
};
146
+
147
/**
 * Scans `lines` and measures how much text the leading numbered list covers.
 *
 * @param lines - The text split on `'\n'`.
 * @returns The character length of the lines up to and including the last line
 *          accepted into the list (joined with `'\n'`), or 0 when no numbered
 *          item was found.
 */
const processNumberedList = (lines: string[]): number => {
  let state: NumberedListState = { endLineIndex: NOT_FOUND, inList: false };

  for (const [index, line] of lines.entries()) {
    if (line === undefined) continue;

    const step = processNumberedListLine(line, index, lines, state);
    state = step.newState;
    if (step.action === 'break') break;
  }

  if (state.endLineIndex < ZERO) {
    return ZERO;
  }
  return lines.slice(ZERO, state.endLineIndex + INCREMENT).join('\n').length;
};
164
+
165
+ /** Bullet list state */
166
+ interface BulletListState {
167
+ endIndex: number;
168
+ inList: boolean;
169
+ }
170
+
171
+ /** Process bullet list line result */
172
+ interface BulletListLineResult {
173
+ action: 'continue' | 'break' | 'next';
174
+ newState: BulletListState;
175
+ }
176
+
177
/**
 * Classifies one line while scanning for the end of a bullet list.
 *
 * @param line - The line being examined.
 * @param currentIndex - Index of `line` inside `lines`.
 * @param lines - All lines of the text being scanned.
 * @param currentState - State accumulated from the previous lines.
 * @returns The action for the scanning loop plus the state to carry forward:
 *          - bullet item: `'next'`, list is active and `endIndex` becomes the
 *            length of all lines through this one joined with `'\n'`;
 *          - anything before the first bullet: `'next'`, state untouched;
 *          - blank line inside the list: `'continue'` only when the line
 *            immediately after it is a bullet, otherwise `'break'`;
 *          - any other line inside the list: `'break'`.
 */
const processBulletListLine = (
  line: string,
  currentIndex: number,
  lines: string[],
  currentState: BulletListState
): BulletListLineResult => {
  if (startsWithBullet(line)) {
    const textThroughLine = lines.slice(ZERO, currentIndex + INCREMENT).join('\n');
    return { action: 'next', newState: { inList: true, endIndex: textThroughLine.length } };
  }

  if (!currentState.inList) {
    return { action: 'next', newState: currentState };
  }

  // Only a blank line may bridge two bullets, and only the very next line is checked.
  const following = line.trim() === '' ? getLineAt(lines, currentIndex + INCREMENT) : undefined;
  const listResumes = following !== undefined && startsWithBullet(following);

  return { action: listResumes ? 'continue' : 'break', newState: currentState };
};
205
+
206
/**
 * Scans `lines` and measures how much text the leading bullet list covers.
 *
 * @param lines - The text split on `'\n'`.
 * @returns The `endIndex` recorded for the last bullet accepted into the list,
 *          or 0 when no bullet was found.
 */
const processBulletList = (lines: string[]): number => {
  let state: BulletListState = { endIndex: ZERO, inList: false };

  for (const [index, line] of lines.entries()) {
    if (line === undefined) continue;

    const step = processBulletListLine(line, index, lines, state);
    state = step.newState;
    if (step.action === 'break') break;
  }

  return state.endIndex;
};
221
+
222
/**
 * Detects a numbered or bullet list at the start of `text`.
 *
 * The list type is decided on the trimmed text; the end offset is measured on
 * the untrimmed text, so leading whitespace counts towards `end`.
 *
 * @param text - Text to inspect.
 * @returns `{ start: 0, end, type }` for the detected list, or `null` when the
 *          trimmed text starts with neither `N. ` (one or two digits) nor a
 *          `-`/`•` bullet followed by whitespace.
 */
export const findListSection = (text: string): ListSectionResult | null => {
  const trimmed = text.trim();

  if (/^\d{1,2}\.\s+/v.test(trimmed)) {
    return { start: ZERO, end: processNumberedList(text.split('\n')), type: 'numbered' };
  }

  if (/^[\-•]\s+/v.test(trimmed)) {
    return { start: ZERO, end: processBulletList(text.split('\n')), type: 'bullet' };
  }

  return null;
};
@@ -0,0 +1,156 @@
1
+ import { processSectionBreaks } from './breakProcessor.js';
2
+ import { ZERO } from './constants.js';
3
+ import { normalizeInlineNumberedList, normalizeInlineProductCardList } from './listNormalization.js';
4
+ import { processListSection } from './listProcessor.js';
5
+ import { mergeSmallChunks } from './mergeProcessor.js';
6
+ import {
7
+ processLongParagraphSequence,
8
+ processLongParagraphsAfterIntro,
9
+ processMarkdownSection,
10
+ } from './paragraphProcessor.js';
11
+ import { processPeriodSplits } from './periodProcessor.js';
12
+ import { processProductCardLists } from './productCardProcessor.js';
13
+ import { normalizeSpanishPunctuation } from './punctuationNormalization.js';
14
+ import { processQuestionMarks } from './questionProcessor.js';
15
+ import { PERIOD_SPLIT_TEXT_THRESHOLD } from './splitConstants.js';
16
+ import {
17
+ processIntroWithList,
18
+ processIntroWithLongParagraphs,
19
+ processQuestionWithList,
20
+ } from './splitProcessors.js';
21
+ import { removePeriodsAfterURLs } from './urlNormalization.js';
22
+
23
+ /** Split result type for processor functions */
24
+ interface SplitProcessorResult {
25
+ splitFound: boolean;
26
+ newRemainingText: string;
27
+ }
28
+
29
/**
 * Tries the intro/list processors in priority order and stops at the first hit.
 *
 * Order: `processIntroWithList`, `processQuestionWithList`,
 * `processIntroWithLongParagraphs`.
 *
 * @param remainingText - Text still to be split.
 * @param chunks - Output chunk list, handed to each processor.
 * @returns The first result whose `splitFound` is true, or `null` when none matched.
 */
const runIntroAndListProcessors = (remainingText: string, chunks: string[]): SplitProcessorResult | null => {
  const processors = [processIntroWithList, processQuestionWithList, processIntroWithLongParagraphs];

  for (const processor of processors) {
    const result = processor(remainingText, chunks);
    if (result.splitFound) return result;
  }

  return null;
};
42
+
43
/**
 * Tries the content-structure processors in priority order and stops at the first hit.
 *
 * Order: `processProductCardLists`, `processListSection`,
 * `processLongParagraphsAfterIntro`, `processLongParagraphSequence`.
 *
 * @param remainingText - Text still to be split.
 * @param chunks - Output chunk list, handed to each processor.
 * @returns The first result whose `splitFound` is true, or `null` when none matched.
 */
const runContentStructureProcessors = (
  remainingText: string,
  chunks: string[]
): SplitProcessorResult | null => {
  const processors = [
    processProductCardLists,
    processListSection,
    processLongParagraphsAfterIntro,
    processLongParagraphSequence,
  ];

  for (const processor of processors) {
    const result = processor(remainingText, chunks);
    if (result.splitFound) return result;
  }

  return null;
};
62
+
63
/**
 * Tries the formatting processors in priority order and stops at the first hit.
 *
 * Order: `processMarkdownSection`, then `processSectionBreaks`.
 *
 * @param remainingText - Text still to be split.
 * @param chunks - Output chunk list, handed to each processor.
 * @returns The first result whose `splitFound` is true, or `null` when none matched.
 */
const runFormattingProcessors = (remainingText: string, chunks: string[]): SplitProcessorResult | null => {
  for (const processor of [processMarkdownSection, processSectionBreaks]) {
    const result = processor(remainingText, chunks);
    if (result.splitFound) return result;
  }

  return null;
};
73
+
74
/**
 * Runs the three processor groups in order and returns the first split found.
 *
 * Groups are tried as intro/list, then content structure, then formatting;
 * a later group runs only when the earlier one returned `null`.
 *
 * @param remainingText - Text still to be split.
 * @param chunks - Output chunk list, handed to each group.
 * @returns The first non-null group result, or a "no split" result that
 *          carries `remainingText` through unchanged.
 */
const runSplitProcessors = (remainingText: string, chunks: string[]): SplitProcessorResult =>
  runIntroAndListProcessors(remainingText, chunks) ??
  runContentStructureProcessors(remainingText, chunks) ??
  runFormattingProcessors(remainingText, chunks) ?? {
    splitFound: false,
    newRemainingText: remainingText,
  };
89
+
90
/**
 * Falls back to sentence-level splitting: question marks first, then periods.
 *
 * Period splitting is attempted only when `remainingText` is longer than
 * `PERIOD_SPLIT_TEXT_THRESHOLD`.
 *
 * @param remainingText - Text still to be split.
 * @param chunks - Output chunk list, handed to each processor.
 * @returns The first successful split, or a "no split" result that carries
 *          `remainingText` through unchanged.
 */
const runQuestionAndPeriodProcessors = (remainingText: string, chunks: string[]): SplitProcessorResult => {
  const noSplit: SplitProcessorResult = { splitFound: false, newRemainingText: remainingText };

  const questionResult = processQuestionMarks(remainingText, chunks);
  if (questionResult.splitFound) {
    return questionResult;
  }

  if (remainingText.length <= PERIOD_SPLIT_TEXT_THRESHOLD) {
    return noSplit;
  }

  const periodResult = processPeriodSplits(remainingText, chunks);
  return periodResult.splitFound ? periodResult : noSplit;
};
108
+
109
/**
 * Applies the normalization passes that must run before any splitting.
 *
 * Passes run in this order, each receiving the previous pass's output:
 * `removePeriodsAfterURLs`, `normalizeInlineNumberedList`,
 * `normalizeInlineProductCardList`.
 *
 * @param text - Raw input text.
 * @returns The text after all three passes.
 */
const preProcessText = (text: string): string =>
  [removePeriodsAfterURLs, normalizeInlineNumberedList, normalizeInlineProductCardList].reduce(
    (current, applyPass) => applyPass(current),
    text
  );
116
+
117
/**
 * Splits chat text into an ordered list of smaller chunks.
 *
 * The text is pre-processed, then repeatedly handed to the structural
 * processors and, when those find nothing, to the question/period processors.
 * Each successful pass appends to the chunk list and shortens the remaining
 * text. When no processor matches, whatever is left becomes the final chunk.
 * The chunks are then passed through `mergeSmallChunks` and each one through
 * `normalizeSpanishPunctuation`.
 *
 * @param text - Text to split; `null`, `undefined` and `''` are accepted.
 * @returns The resulting chunks, or an empty array for empty/missing input.
 */
export const splitChatText = (text: string | null | undefined): string[] => {
  if (text === null || text === undefined || text.length === ZERO) {
    return [];
  }

  const chunks: string[] = [];
  let remainingText = preProcessText(text);

  while (remainingText !== '') {
    const structural = runSplitProcessors(remainingText, chunks);
    // Sentence-level splitting is only consulted when no structural split exists.
    const outcome = structural.splitFound
      ? structural
      : runQuestionAndPeriodProcessors(remainingText, chunks);

    if (!outcome.splitFound) {
      chunks.push(remainingText);
      break;
    }

    remainingText = outcome.newRemainingText;
  }

  return mergeSmallChunks(chunks).map((chunk) => normalizeSpanishPunctuation(chunk));
};
@@ -0,0 +1,2 @@
1
+ /** Short text threshold for period splitting */
2
+ export const PERIOD_SPLIT_TEXT_THRESHOLD = 100;
@@ -0,0 +1,153 @@
1
+ import {
2
+ FIRST_NEWLINE_SEARCH_LIMIT,
3
+ INDEX_OFFSET,
4
+ MAX_INTRO_LENGTH,
5
+ MAX_QUESTION_WITH_OPTIONS_LENGTH,
6
+ } from './constants.js';
7
+
8
+ /** Long paragraph threshold */
9
+ const LONG_PARAGRAPH_THRESHOLD = 150;
10
+
11
+ /** Minimum list items for question with options */
12
+ const MIN_LIST_ITEMS_FOR_OPTIONS = 2;
13
+
14
+ /** Constants */
15
+ const NOT_FOUND = -1;
16
+ const ZERO = 0;
17
+ const FIRST_MATCH = 1;
18
+ const SECOND_MATCH = 2;
19
+
20
+ /** Split result type */
21
+ export interface SplitResult {
22
+ splitFound: boolean;
23
+ newRemainingText: string;
24
+ }
25
+
26
/**
 * Tells whether `text` is a question followed by a "Puedes responder con:"
 * prompt and a dash-bulleted option list.
 *
 * The match requires, in order: text ending in `?` at the start, at least one
 * newline, the phrase "Puedes responder con" / "puedes responder con" followed
 * by a colon, and later a line that begins with `-`.
 *
 * @param text - Text to inspect.
 * @returns `true` when the whole pattern is present.
 */
export const hasQuestionWithOptionsPattern = (text: string): boolean => {
  const questionWithOptions =
    /^[^?]+\?\s*\n+[\s\S]*?(?:Puedes responder con|puedes responder con):[\s\S]*?\n+-/v;
  return questionWithOptions.test(text);
};
31
+
32
+ /** Check if line ends with colon */
33
+ const lineEndsWithColon = (line: string | undefined): boolean => line?.trim().endsWith(':') === true;
34
+
35
+ /** Find the last line ending with colon */
36
+ const findLastColonLineIndex = (lines: string[]): number => {
37
+ for (let i = lines.length - INDEX_OFFSET; i >= ZERO; i -= INDEX_OFFSET) {
38
+ if (lineEndsWithColon(lines[i])) {
39
+ return i;
40
+ }
41
+ }
42
+ return NOT_FOUND;
43
+ };
44
+
45
+ /** Extract intro from match groups */
46
+ const extractIntro = (match: RegExpExecArray): string => {
47
+ const { groups } = match;
48
+ const part1 = groups?.intro ?? match[FIRST_MATCH] ?? '';
49
+ const part2 = groups?.afterColon ?? match[SECOND_MATCH] ?? '';
50
+ return part1 + part2;
51
+ };
52
+
53
+ /** Adjust intro for response prompts */
54
+ const adjustIntroForResponsePrompt = (intro: string): string => {
55
+ if (!intro.includes('Puedes responder con:') && !intro.includes('puedes responder con:')) {
56
+ return intro;
57
+ }
58
+
59
+ const lines = intro.split('\n');
60
+ const lastColonIndex = findLastColonLineIndex(lines);
61
+
62
+ if (lastColonIndex >= ZERO) {
63
+ return lines.slice(ZERO, lastColonIndex + INDEX_OFFSET).join('\n');
64
+ }
65
+
66
+ return intro;
67
+ };
68
+
69
/**
 * Splits off a one-line intro that ends in a colon and is followed by a list.
 *
 * The intro must sit on the first line, contain a colon, and be followed (after
 * one or more newlines) by a numbered item or a `-`/`•` bullet. The intro is
 * emitted as its own chunk only when it is shorter than `MAX_INTRO_LENGTH`.
 *
 * @param remainingText - Text still to be split.
 * @param chunks - Output chunk list; receives the intro on success.
 * @returns On success, `splitFound: true` and the trimmed text after the intro;
 *          otherwise `splitFound: false` with `remainingText` unchanged.
 */
export const processIntroWithList = (remainingText: string, chunks: string[]): SplitResult => {
  const noSplit: SplitResult = { splitFound: false, newRemainingText: remainingText };

  const match = /^(?<intro>.+?:)(?<afterColon>[^\n]*?)\n+(?<listStart>\d{1,2}\.\s+|[\-•]\s+)/v.exec(
    remainingText
  );
  if (match === null) {
    return noSplit;
  }

  const finalIntro = adjustIntroForResponsePrompt(extractIntro(match));
  if (finalIntro.length >= MAX_INTRO_LENGTH) {
    return noSplit;
  }

  chunks.push(finalIntro);
  return { splitFound: true, newRemainingText: remainingText.substring(finalIntro.length).trim() };
};
91
+
92
/**
 * Keeps a question and its numbered answer options together as one chunk.
 *
 * The split applies when all of these hold:
 * - the text is a question line followed, on the next line, by a numbered item;
 * - every non-blank line from there on is a numbered item;
 * - there are at least `MIN_LIST_ITEMS_FOR_OPTIONS` items;
 * - the whole text is shorter than `MAX_QUESTION_WITH_OPTIONS_LENGTH`.
 *
 * @param remainingText - Text still to be split.
 * @param chunks - Output chunk list; receives the whole text on success.
 * @returns On success, `splitFound: true` with nothing left to process;
 *          otherwise `splitFound: false` with `remainingText` unchanged.
 */
export const processQuestionWithList = (remainingText: string, chunks: string[]): SplitResult => {
  const noSplit: SplitResult = { splitFound: false, newRemainingText: remainingText };

  const match = /^(?<question>[\s\S]*?\?[^\n]*?)\n(?<list>\d{1,2}\.\s+[\s\S]*)/v.exec(remainingText);
  if (match === null) {
    return noSplit;
  }

  const isNumberedItem = (line: string): boolean => /^\d{1,2}\.\s+/v.test(line.trim());

  // Blank lines between options are tolerated, so only non-blank lines are judged.
  const nonBlankLines = (match.groups?.list ?? match[SECOND_MATCH] ?? '')
    .split('\n')
    .filter((line) => line.trim() !== '');
  const optionCount = nonBlankLines.filter(isNumberedItem).length;

  const isOptionsBlock =
    optionCount === nonBlankLines.length &&
    remainingText.length < MAX_QUESTION_WITH_OPTIONS_LENGTH &&
    optionCount >= MIN_LIST_ITEMS_FOR_OPTIONS;

  if (!isOptionsBlock) {
    return noSplit;
  }

  chunks.push(remainingText);
  return { splitFound: true, newRemainingText: '' };
};
122
+
123
/**
 * Tells whether `intro` ends with a list marker: one or two digits and a
 * period, or a `-`/`•` bullet, each optionally followed by whitespace.
 */
const endsWithListPattern = (intro: string): boolean =>
  [/\d{1,2}\.\s*$/v, /[\-•]\s*$/v].some((marker) => marker.test(intro));
126
+
127
/**
 * Splits off a colon-terminated first line when a long paragraph follows it.
 *
 * The first newline must exist and sit before `FIRST_NEWLINE_SEARCH_LIMIT`.
 * The trimmed first line must end with `:` and must not end with a list marker.
 * The next line, trimmed, must be longer than `LONG_PARAGRAPH_THRESHOLD`.
 *
 * @param remainingText - Text still to be split.
 * @param chunks - Output chunk list; receives the trimmed intro on success.
 * @returns On success, `splitFound: true` and everything after the first
 *          newline (untrimmed); otherwise `splitFound: false` with
 *          `remainingText` unchanged.
 */
export const processIntroWithLongParagraphs = (remainingText: string, chunks: string[]): SplitResult => {
  const noSplit: SplitResult = { splitFound: false, newRemainingText: remainingText };

  const firstNewline = remainingText.indexOf('\n');
  const newlineInRange = firstNewline !== NOT_FOUND && firstNewline < FIRST_NEWLINE_SEARCH_LIMIT;
  if (!newlineInRange) {
    return noSplit;
  }

  const intro = remainingText.substring(ZERO, firstNewline).trim();
  if (!intro.endsWith(':') || endsWithListPattern(intro)) {
    return noSplit;
  }

  const afterIntro = remainingText.substring(firstNewline + INDEX_OFFSET);
  const firstParagraph = (afterIntro.split('\n')[ZERO] ?? '').trim();
  if (firstParagraph.length <= LONG_PARAGRAPH_THRESHOLD) {
    return noSplit;
  }

  chunks.push(intro);
  return { splitFound: true, newRemainingText: afterIntro };
};
@@ -0,0 +1,86 @@
1
+ /** Empty length constant */
2
+ const EMPTY_LENGTH = 0;
3
+
4
+ /** First character index */
5
+ const FIRST_INDEX = 0;
6
+
7
/**
 * Strips leading and trailing whitespace, including the zero-width space
 * (U+200B) and the Unicode space characters listed in the patterns below.
 * Everything else, emojis included, is left in place.
 *
 * @param str - Text to trim.
 * @returns `str` without the leading and trailing whitespace run.
 */
export const smartTrim = (str: string): string => {
  const withoutLeading = str.replace(/^[\s\u00A0\u1680\u2000-\u200B\u202F\u205F\u3000]+/v, '');
  return withoutLeading.replace(/[\s\u00A0\u1680\u2000-\u200B\u202F\u205F\u3000]+$/v, '');
};
15
+
16
/**
 * Tells whether a string carries real text, i.e. at least one letter or digit.
 *
 * Strings made only of whitespace, emojis, symbols or punctuation yield
 * `false`. Letters and numbers of any script count (`\p{L}`, `\p{N}`), so
 * Cyrillic, Greek or CJK text is recognised as text. The original Latin ranges
 * are kept in the class so every string that used to return `true` still does.
 *
 * @param str - Text to inspect.
 * @returns `true` when `str` contains a letter or a digit, otherwise `false`.
 */
export const hasTextContent = (str: string): boolean =>
  // No trimming needed: whitespace can never satisfy the class below.
  /[a-zA-Z0-9\u00C0-\u024F\u1E00-\u1EFF\p{L}\p{N}]/v.test(str);
28
+
29
/**
 * Tells whether `str`, once trimmed, opens with a parenthetical that is closed
 * by a question mark, e.g. `(XS, S o M)?`. Such text is a clarification that
 * belongs to the preceding question.
 *
 * @param str - Text that follows a question mark.
 * @returns `true` when the trimmed text starts with `(…)?` and the parentheses
 *          are not empty.
 */
export const isParentheticalClarification = (str: string): boolean => {
  const candidate = smartTrim(str);
  return candidate.length !== EMPTY_LENGTH && /^\([^\)]+\)\?/v.test(candidate);
};
42
+
43
/**
 * Matches one emoji at the very start of a string.
 *
 * `\p{Emoji}` is deliberately not used: that property also contains the ASCII
 * digits, `#` and `*`, so "1. item" or "*bold*" would count as an emoji.
 * `\p{RGI_Emoji}` covers complete emoji sequences (keycaps, flags, ZWJ and
 * skin-tone sequences); `\p{Extended_Pictographic}` additionally accepts a
 * lone pictograph written without its variation selector.
 */
const EMOJI_PATTERN = /^(?:\p{RGI_Emoji}|\p{Extended_Pictographic})/v;

/**
 * Tells whether `str` begins with an emoji.
 *
 * @param str - Text to inspect; leading whitespace is not skipped.
 * @returns `true` when the first character(s) form an emoji; `false` for an
 *          empty string or for text starting with anything else, including
 *          plain digits, `#` and `*`.
 */
export const startsWithEmoji = (str: string): boolean => EMOJI_PATTERN.test(str);
54
+
55
/**
 * Tells whether the first cased letter of `str` is lowercase, which signals
 * that the text continues a sentence.
 *
 * Whitespace, digits, punctuation, symbols and caseless letters (such as the
 * ordinal indicators `º`/`ª`) are skipped. The first letter is looked up by
 * Unicode category rather than a hand-written list, so a leading letter like
 * `Ç` is judged on its own case instead of being skipped.
 *
 * @param str - Text to inspect.
 * @returns `true` when the first cased letter is lowercase; `false` when it is
 *          uppercase/titlecase or when `str` has no cased letter at all.
 */
export const startsWithLowercase = (str: string): boolean => {
  const firstCasedLetter = /[\p{Lu}\p{Ll}\p{Lt}]/v.exec(str);
  if (firstCasedLetter === null) return false;

  const [letter] = firstCasedLetter;
  return /^\p{Ll}$/v.test(letter);
};
69
+
70
/**
 * Finds where the text starts after a leading run of emojis.
 *
 * Whole emoji sequences are consumed (ZWJ families, skin-tone modifiers,
 * keycaps, flags and a pictograph followed by U+FE0F), so the returned offset
 * never lands inside a sequence. Plain digits, `#` and `*` are not treated as
 * emojis. Whitespace right after the emoji run is consumed as well.
 *
 * @param str - Text to inspect.
 * @returns The UTF-16 offset just past the leading emojis and the whitespace
 *          that follows them, or 0 when `str` does not start with an emoji.
 */
export const findPositionAfterEmoji = (str: string): number => {
  const leadingEmojiRun = /^(?:\p{RGI_Emoji}|\p{Extended_Pictographic}\uFE0F?)+\s*/v.exec(str);

  return leadingEmojiRun === null ? FIRST_INDEX : leadingEmojiRun[FIRST_INDEX].length;
};
@@ -0,0 +1,17 @@
1
/**
 * Replaces the sentence-ending period that directly follows a URL with a line
 * break, so the period is never read as part of the link.
 *
 * A URL here is anything starting with `http://`, `https://` or `www.` and
 * running up to a period that is followed by whitespace or the end of the text.
 * Periods inside the URL are kept. The whitespace after the removed period is
 * kept as well, so it ends up right after the inserted newline.
 *
 * Example: "Visit https://nike.com.co. We have products"
 *       → "Visit https://nike.com.co\n We have products"
 *
 * @param text - Text that may contain URLs.
 * @returns The text with each such trailing period replaced by `'\n'`.
 */
export const removePeriodsAfterURLs = (text: string): string => {
  const urlWithPeriodPattern = /(?<url>https?:\/\/[^\s]+?|www\.[^\s]+?)\.(?<after>\s|$)/gv;

  // `$<url>` and `$<after>` re-insert the named captures around the new line break.
  return text.replace(urlWithPeriodPattern, '$<url>\n$<after>');
};
@@ -0,0 +1 @@
1
+ export { splitChatText } from './chatSplit/index.js';
@@ -0,0 +1,4 @@
1
+ {
2
+ "extends": "./tsconfig.json",
3
+ "exclude": ["node_modules", "dist", "src/__tests__"]
4
+ }
@@ -0,0 +1,25 @@
1
+ {
2
+ "$schema": "https://json.schemastore.org/tsconfig",
3
+ "compilerOptions": {
4
+ "target": "ES2024",
5
+ "lib": ["ES2024"],
6
+ "module": "NodeNext",
7
+ "moduleResolution": "NodeNext",
8
+ "rootDir": "./src",
9
+ "outDir": "./dist",
10
+ "strict": true,
11
+ "noUncheckedIndexedAccess": true,
12
+ "esModuleInterop": true,
13
+ "forceConsistentCasingInFileNames": true,
14
+ "skipLibCheck": true,
15
+ "resolveJsonModule": true,
16
+ "isolatedModules": true,
17
+ "incremental": true,
18
+ "declaration": true,
19
+ "declarationMap": true,
20
+ "composite": true,
21
+ "types": ["node", "jest"]
22
+ },
23
+ "include": ["src/**/*.ts"],
24
+ "exclude": ["node_modules", "dist"]
25
+ }