@daviddh/llm-markdown-whatsapp 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. package/.prettierrc +17 -0
  2. package/CLAUDE.md +155 -0
  3. package/README.md +304 -0
  4. package/eslint.config.mjs +28 -0
  5. package/jest.config.js +40 -0
  6. package/package.json +61 -0
  7. package/packages/core/dist/__tests__/splitChatText.basic.test.d.ts +2 -0
  8. package/packages/core/dist/__tests__/splitChatText.basic.test.d.ts.map +1 -0
  9. package/packages/core/dist/__tests__/splitChatText.basic.test.js +100 -0
  10. package/packages/core/dist/__tests__/splitChatText.coverageLists.test.d.ts +2 -0
  11. package/packages/core/dist/__tests__/splitChatText.coverageLists.test.d.ts.map +1 -0
  12. package/packages/core/dist/__tests__/splitChatText.coverageLists.test.js +88 -0
  13. package/packages/core/dist/__tests__/splitChatText.coverageProcessors.test.d.ts +2 -0
  14. package/packages/core/dist/__tests__/splitChatText.coverageProcessors.test.d.ts.map +1 -0
  15. package/packages/core/dist/__tests__/splitChatText.coverageProcessors.test.js +108 -0
  16. package/packages/core/dist/__tests__/splitChatText.coverageQuestions.test.d.ts +2 -0
  17. package/packages/core/dist/__tests__/splitChatText.coverageQuestions.test.d.ts.map +1 -0
  18. package/packages/core/dist/__tests__/splitChatText.coverageQuestions.test.js +74 -0
  19. package/packages/core/dist/__tests__/splitChatText.dataProtection.test.d.ts +2 -0
  20. package/packages/core/dist/__tests__/splitChatText.dataProtection.test.d.ts.map +1 -0
  21. package/packages/core/dist/__tests__/splitChatText.dataProtection.test.js +80 -0
  22. package/packages/core/dist/__tests__/splitChatText.dataTests1.test.d.ts +2 -0
  23. package/packages/core/dist/__tests__/splitChatText.dataTests1.test.d.ts.map +1 -0
  24. package/packages/core/dist/__tests__/splitChatText.dataTests1.test.js +124 -0
  25. package/packages/core/dist/__tests__/splitChatText.dataTests2.test.d.ts +2 -0
  26. package/packages/core/dist/__tests__/splitChatText.dataTests2.test.d.ts.map +1 -0
  27. package/packages/core/dist/__tests__/splitChatText.dataTests2.test.js +122 -0
  28. package/packages/core/dist/__tests__/splitChatText.edgeCases.test.d.ts +2 -0
  29. package/packages/core/dist/__tests__/splitChatText.edgeCases.test.d.ts.map +1 -0
  30. package/packages/core/dist/__tests__/splitChatText.edgeCases.test.js +132 -0
  31. package/packages/core/dist/__tests__/splitChatText.helpers.d.ts +2 -0
  32. package/packages/core/dist/__tests__/splitChatText.helpers.d.ts.map +1 -0
  33. package/packages/core/dist/__tests__/splitChatText.helpers.js +5 -0
  34. package/packages/core/dist/__tests__/splitChatText.punctuation.test.d.ts +2 -0
  35. package/packages/core/dist/__tests__/splitChatText.punctuation.test.d.ts.map +1 -0
  36. package/packages/core/dist/__tests__/splitChatText.punctuation.test.js +98 -0
  37. package/packages/core/dist/__tests__/splitChatText.realWorld.test.d.ts +2 -0
  38. package/packages/core/dist/__tests__/splitChatText.realWorld.test.d.ts.map +1 -0
  39. package/packages/core/dist/__tests__/splitChatText.realWorld.test.js +104 -0
  40. package/packages/core/dist/__tests__/splitChatText.urlProtection.test.d.ts +2 -0
  41. package/packages/core/dist/__tests__/splitChatText.urlProtection.test.d.ts.map +1 -0
  42. package/packages/core/dist/__tests__/splitChatText.urlProtection.test.js +82 -0
  43. package/packages/core/dist/__tests__/strs.splitChatText.test.d.ts +2 -0
  44. package/packages/core/dist/__tests__/strs.splitChatText.test.d.ts.map +1 -0
  45. package/packages/core/dist/__tests__/strs.splitChatText.test.js +992 -0
  46. package/packages/core/dist/chatSplit/breakProcessor.d.ts +4 -0
  47. package/packages/core/dist/chatSplit/breakProcessor.d.ts.map +1 -0
  48. package/packages/core/dist/chatSplit/breakProcessor.js +67 -0
  49. package/packages/core/dist/chatSplit/constants.d.ts +35 -0
  50. package/packages/core/dist/chatSplit/constants.d.ts.map +1 -0
  51. package/packages/core/dist/chatSplit/constants.js +34 -0
  52. package/packages/core/dist/chatSplit/index.d.ts +2 -0
  53. package/packages/core/dist/chatSplit/index.d.ts.map +1 -0
  54. package/packages/core/dist/chatSplit/index.js +1 -0
  55. package/packages/core/dist/chatSplit/listNormalization.d.ts +13 -0
  56. package/packages/core/dist/chatSplit/listNormalization.d.ts.map +1 -0
  57. package/packages/core/dist/chatSplit/listNormalization.js +140 -0
  58. package/packages/core/dist/chatSplit/listProcessor.d.ts +6 -0
  59. package/packages/core/dist/chatSplit/listProcessor.d.ts.map +1 -0
  60. package/packages/core/dist/chatSplit/listProcessor.js +61 -0
  61. package/packages/core/dist/chatSplit/mergeProcessor.d.ts +3 -0
  62. package/packages/core/dist/chatSplit/mergeProcessor.d.ts.map +1 -0
  63. package/packages/core/dist/chatSplit/mergeProcessor.js +88 -0
  64. package/packages/core/dist/chatSplit/paragraphProcessor.d.ts +14 -0
  65. package/packages/core/dist/chatSplit/paragraphProcessor.d.ts.map +1 -0
  66. package/packages/core/dist/chatSplit/paragraphProcessor.js +66 -0
  67. package/packages/core/dist/chatSplit/periodProcessor.d.ts +4 -0
  68. package/packages/core/dist/chatSplit/periodProcessor.d.ts.map +1 -0
  69. package/packages/core/dist/chatSplit/periodProcessor.js +110 -0
  70. package/packages/core/dist/chatSplit/positionHelpers.d.ts +12 -0
  71. package/packages/core/dist/chatSplit/positionHelpers.d.ts.map +1 -0
  72. package/packages/core/dist/chatSplit/positionHelpers.js +57 -0
  73. package/packages/core/dist/chatSplit/productCardProcessor.d.ts +12 -0
  74. package/packages/core/dist/chatSplit/productCardProcessor.d.ts.map +1 -0
  75. package/packages/core/dist/chatSplit/productCardProcessor.js +138 -0
  76. package/packages/core/dist/chatSplit/punctuationNormalization.d.ts +5 -0
  77. package/packages/core/dist/chatSplit/punctuationNormalization.d.ts.map +1 -0
  78. package/packages/core/dist/chatSplit/punctuationNormalization.js +103 -0
  79. package/packages/core/dist/chatSplit/questionProcessor.d.ts +6 -0
  80. package/packages/core/dist/chatSplit/questionProcessor.d.ts.map +1 -0
  81. package/packages/core/dist/chatSplit/questionProcessor.js +212 -0
  82. package/packages/core/dist/chatSplit/sections.d.ts +23 -0
  83. package/packages/core/dist/chatSplit/sections.d.ts.map +1 -0
  84. package/packages/core/dist/chatSplit/sections.js +153 -0
  85. package/packages/core/dist/chatSplit/splitChatText.d.ts +6 -0
  86. package/packages/core/dist/chatSplit/splitChatText.d.ts.map +1 -0
  87. package/packages/core/dist/chatSplit/splitChatText.js +119 -0
  88. package/packages/core/dist/chatSplit/splitConstants.d.ts +3 -0
  89. package/packages/core/dist/chatSplit/splitConstants.d.ts.map +1 -0
  90. package/packages/core/dist/chatSplit/splitConstants.js +2 -0
  91. package/packages/core/dist/chatSplit/splitProcessors.d.ts +22 -0
  92. package/packages/core/dist/chatSplit/splitProcessors.d.ts.map +1 -0
  93. package/packages/core/dist/chatSplit/splitProcessors.js +105 -0
  94. package/packages/core/dist/chatSplit/textHelpers.d.ts +27 -0
  95. package/packages/core/dist/chatSplit/textHelpers.d.ts.map +1 -0
  96. package/packages/core/dist/chatSplit/textHelpers.js +77 -0
  97. package/packages/core/dist/chatSplit/urlNormalization.d.ts +7 -0
  98. package/packages/core/dist/chatSplit/urlNormalization.d.ts.map +1 -0
  99. package/packages/core/dist/chatSplit/urlNormalization.js +13 -0
  100. package/packages/core/dist/index.d.ts +2 -0
  101. package/packages/core/dist/index.d.ts.map +1 -0
  102. package/packages/core/dist/index.js +1 -0
  103. package/packages/core/jest.config.js +23 -0
  104. package/packages/core/package.json +38 -0
  105. package/packages/core/src/__tests__/splitChatText.basic.test.ts +123 -0
  106. package/packages/core/src/__tests__/splitChatText.coverageLists.test.ts +108 -0
  107. package/packages/core/src/__tests__/splitChatText.coverageProcessors.test.ts +172 -0
  108. package/packages/core/src/__tests__/splitChatText.coverageQuestions.test.ts +95 -0
  109. package/packages/core/src/__tests__/splitChatText.dataProtection.test.ts +96 -0
  110. package/packages/core/src/__tests__/splitChatText.dataTests1.test.ts +137 -0
  111. package/packages/core/src/__tests__/splitChatText.dataTests2.test.ts +134 -0
  112. package/packages/core/src/__tests__/splitChatText.edgeCases.test.ts +157 -0
  113. package/packages/core/src/__tests__/splitChatText.helpers.ts +6 -0
  114. package/packages/core/src/__tests__/splitChatText.punctuation.test.ts +113 -0
  115. package/packages/core/src/__tests__/splitChatText.realWorld.test.ts +118 -0
  116. package/packages/core/src/__tests__/splitChatText.urlProtection.test.ts +102 -0
  117. package/packages/core/src/chatSplit/breakProcessor.ts +103 -0
  118. package/packages/core/src/chatSplit/constants.ts +50 -0
  119. package/packages/core/src/chatSplit/index.ts +1 -0
  120. package/packages/core/src/chatSplit/listNormalization.ts +189 -0
  121. package/packages/core/src/chatSplit/listProcessor.ts +74 -0
  122. package/packages/core/src/chatSplit/mergeProcessor.ts +124 -0
  123. package/packages/core/src/chatSplit/paragraphProcessor.ts +86 -0
  124. package/packages/core/src/chatSplit/periodProcessor.ts +148 -0
  125. package/packages/core/src/chatSplit/positionHelpers.ts +66 -0
  126. package/packages/core/src/chatSplit/productCardProcessor.ts +184 -0
  127. package/packages/core/src/chatSplit/punctuationNormalization.ts +142 -0
  128. package/packages/core/src/chatSplit/questionProcessor.ts +298 -0
  129. package/packages/core/src/chatSplit/sections.ts +243 -0
  130. package/packages/core/src/chatSplit/splitChatText.ts +156 -0
  131. package/packages/core/src/chatSplit/splitConstants.ts +2 -0
  132. package/packages/core/src/chatSplit/splitProcessors.ts +153 -0
  133. package/packages/core/src/chatSplit/textHelpers.ts +86 -0
  134. package/packages/core/src/chatSplit/urlNormalization.ts +17 -0
  135. package/packages/core/src/index.ts +1 -0
  136. package/packages/core/tsconfig.build.json +4 -0
  137. package/packages/core/tsconfig.json +25 -0
  138. package/tsconfig.json +19 -0
@@ -0,0 +1,189 @@
1
+ import { INDEX_OFFSET, MAX_LIST_NUMBER } from './constants.js';
2
+
3
/** Numeric zero, named so index arithmetic and comparisons avoid a bare literal. */
const ZERO = 0;

/** Value `indexOf` returns when the searched text is absent. */
const NOT_FOUND = -1;

/** Pieces captured from one candidate list marker found by the list-item regex. */
interface ListMatchInfo {
  /** Full text of the regex match. */
  match: string;
  /** Character captured immediately before the number. */
  before: string;
  /** Digits of the list number, kept as text. */
  num: string;
  /** First character of the item text that follows the marker. */
  after: string;
  /** Index in the scanned text at which the match begins. */
  offset: number;
}

/** One pending edit: the span from `start` up to (not including) `end` becomes `replacement`. */
interface ReplacementInfo {
  start: number;
  end: number;
  replacement: string;
}

/**
 * Reads one entry from a replacement list.
 * @returns the entry, or `undefined` when `index` is outside the array
 */
function getReplacementAt(replacements: ReplacementInfo[], index: number): ReplacementInfo | undefined {
  return replacements[index];
}
28
+
29
/**
 * Reports whether the text already holds a numbered item that is followed,
 * after a line break, by another numbered item.
 */
function isAlreadyFormatted(text: string): boolean {
  const multiLineListPattern = /\d{1,2}\.\s+[^\n]+\n\s*\d{1,2}\.\s+/v;
  return multiLineListPattern.test(text);
}
31
+
32
/**
 * Detects two numbered markers sharing one line, either anywhere after a colon
 * or as "1." ... "2." after a question or exclamation mark.
 */
function hasInlineListPatterns(text: string): boolean {
  if (/:[^\n]*\d{1,2}\.\s+[^\n]+[ ]+\d{1,2}\.\s+/v.test(text)) {
    return true;
  }
  return /[?!][^\n]*\s+1\.\s+[^\n]+[ ]+2\.\s+/v.test(text);
}
38
+
39
/**
 * Tells whether the character just before `offset` is a digit, which the
 * caller treats as a sign of a version-like number rather than a list marker.
 * Offsets at or before the start of the text always yield `false`.
 */
const isPrecededByDigit = (text: string, offset: number): boolean =>
  offset <= ZERO ? false : /\d/v.test(text.charAt(offset - INDEX_OFFSET));
45
+
46
/**
 * Decides how one candidate list marker should be rewritten.
 *
 * The rewrite only ever inserts a line break in front of the marker; it never
 * removes visible characters. A colon that introduces the list is kept for
 * every item number, not just for item 1.
 *
 * @param info - captured pieces of the candidate marker
 * @param text - the full text being scanned (used to inspect the character before the match)
 * @returns the replacement for `info.match`, or `info.match` itself when nothing should change
 */
const processListMatch = (info: ListMatchInfo, text: string): string => {
  const numValue = parseInt(info.num, 10);

  // Numbers above the configured maximum are not treated as list markers.
  if (numValue > MAX_LIST_NUMBER) return info.match;
  // A digit right before the match suggests a version-like number, so leave it untouched.
  if (isPrecededByDigit(text, info.offset)) return info.match;

  const item = `${info.num}. ${info.after}`;

  // Colon directly before the number: keep the colon and start the item on a new line.
  if (info.before === ':') {
    return `:\n${item}`;
  }

  // Whitespace before the number is replaced by the line break.
  if (info.before.trim() === '') {
    return `\n${item}`;
  }

  // "?" or "!" glued to the number: not rewritten.
  return info.match;
};
63
+
64
/** Returns the length of `str` in UTF-16 code units. */
function getStringLength(str: string): number {
  const { length } = str;
  return length;
}
66
+
67
/**
 * Converts a regex exec result into a `ListMatchInfo`.
 * Named groups `before`, `num` and `after` that are missing fall back to the
 * empty string; `offset` is the index at which the match starts.
 */
function createListMatchInfo(match: RegExpExecArray): ListMatchInfo {
  const named = match.groups;
  const [wholeMatch = ''] = match;

  const before = named?.before ?? '';
  const num = named?.num ?? '';
  const after = named?.after ?? '';

  return { match: wholeMatch, before, num, after, offset: match.index };
}
79
+
80
/**
 * Scans `text` for candidate list markers and records an edit for every marker
 * whose rewritten form differs from the matched text.
 * Edits are returned in the order the markers appear.
 */
const collectReplacements = (text: string): ReplacementInfo[] => {
  const pattern = /(?<before>[:\s?!])(?<num>\d{1,2})\.\s+(?<after>[^\n])/gv;
  const edits: ReplacementInfo[] = [];

  for (let hit = pattern.exec(text); hit !== null; hit = pattern.exec(text)) {
    const info = createListMatchInfo(hit);
    const rewritten = processListMatch(info, text);
    if (rewritten === info.match) continue;

    const start = hit.index;
    edits.push({ start, end: start + getStringLength(info.match), replacement: rewritten });
  }

  return edits;
};
98
+
99
/**
 * Applies the edits to `text`, walking the list from its last entry to its
 * first so that earlier `start`/`end` positions stay valid while later spans
 * change length.
 */
function applyReplacements(text: string, replacements: ReplacementInfo[]): string {
  let output = text;
  let cursor = replacements.length - INDEX_OFFSET;

  while (cursor >= ZERO) {
    const edit = getReplacementAt(replacements, cursor);
    cursor -= INDEX_OFFSET;
    if (edit === undefined) continue;
    output = `${output.substring(ZERO, edit.start)}${edit.replacement}${output.substring(edit.end)}`;
  }

  return output;
}
113
+
114
/**
 * Breaks an inline numbered list such as "1. X 2. Y 3. Z" onto separate lines.
 *
 * Text is returned untouched when it already contains a list laid out across
 * lines, or when no inline list pattern is detected.
 *
 * @param text - message text to inspect
 * @returns the text with line breaks inserted before inline list items
 */
export const normalizeInlineNumberedList = (text: string): string => {
  const needsNormalization = !isAlreadyFormatted(text) && hasInlineListPatterns(text);
  return needsNormalization ? applyReplacements(text, collectReplacements(text)) : text;
};
126
+
127
/**
 * Normalizes inline product card lists by adding line breaks.
 *
 * Detects text such as "**1. 🛍️ Product** 💵 Price... **2. 🛍️ Product** 💵 Price..."
 * and inserts a line break before each following numbered card, before each
 * emoji indicator inside the cards, and before a trailing "¿...?" question.
 * Text without inline product cards is returned unchanged.
 *
 * @param text - message text to inspect
 * @returns the text with product cards laid out one element per line
 */
export const normalizeInlineProductCardList = (text: string): string => {
  // Three detection shapes:
  //  1. several cards on one line: "1. 🛍️ ... 2. 🛍️"
  //  2. a markdown-titled card followed by an emoji and another numbered item
  //  3. a single card whose line carries at least two further emoji indicators
  const hasInlineProductCards =
    /(?:\*{1,2})?\d+\.\s*🛍️[^\n]*\s+(?:\*{1,2})?\d+\.\s*🛍️/v.test(text) ||
    /\d+\.\s+\*{1,2}[^*\n]+\*{1,2}\s+\p{Extended_Pictographic}[^\n]+\s+(?:\*{1,2})?\d+\.\s+/v.test(text) ||
    /(?:\*{1,2})?\d+\.\s*🛍️[^\n]+\p{Extended_Pictographic}[^\n]+\p{Extended_Pictographic}/v.test(text);

  if (!hasInlineProductCards) {
    return text;
  }

  // Step 1: start every numbered card after the first on its own line. The card
  // must follow a period, exclamation mark or ✅ that closes the previous card.
  let result = text.replace(
    /(?<punct>[.!✅])\s+(?<card>\*{0,2}\d+\.\s+(?:🛍️|\*{1,2}))/gv,
    '$<punct>\n$<card>'
  );

  // Step 2: put each emoji indicator on its own line. Only the text from the
  // first 🛍️ marker onward is rewritten, so intro text is left alone. The
  // negative lookbehind keeps "<digit>. 🛍" together, so a card number is
  // never separated from its shopping-bag marker; other emoji after
  // "<digit>." (e.g. "$10. ✅") are still moved to a new line.
  result = result.replace(/🛍️[\s\S]*$/v, (productsContent) =>
    productsContent.replace(
      /(?<before>[^\n])\s+(?<!\d\.\s+(?=\u{1F6CD}))(?<emoji>\p{Extended_Pictographic})/gv,
      '$<before>\n$<emoji>'
    )
  );

  // Step 3: move a trailing "¿...?" question (plus one optional token after it,
  // such as an emoji) onto its own line after the closing period/exclamation.
  result = result.replace(
    /(?<punctuation>[.!])\s+(?<question>¿[^\n?]*\?(?:\s*[^\s\n]+)?)$/mv,
    (_match, punctuation: string, question: string) => `${punctuation}\n${question.trim()}`
  );

  return result;
};
@@ -0,0 +1,74 @@
1
+ import { AVG_ITEM_LENGTH_THRESHOLD, MAX_ITEMS_FOR_LONG_SPLIT } from './constants.js';
2
+ import { findListSection } from './sections.js';
3
+ import type { SplitResult } from './splitProcessors.js';
4
+
5
/** A list item longer than this many characters makes the list split item by item. */
const HUGE_ITEM_LENGTH = 150;

/** Numeric zero, used for length comparisons and as the sum seed. */
const ZERO = 0;
10
+
11
/**
 * Emits a numbered list into `chunks`.
 *
 * The list is cut before every line that starts with a 1-2 digit marker. Each
 * item becomes its own chunk when any item exceeds `HUGE_ITEM_LENGTH`, or when
 * the average item length exceeds `AVG_ITEM_LENGTH_THRESHOLD` and the list has
 * at most `MAX_ITEMS_FOR_LONG_SPLIT` items; otherwise the whole list is pushed
 * as a single chunk.
 *
 * @param listText - text of the list section
 * @param afterList - text following the list section
 * @param chunks - output array, mutated in place
 * @returns a result with `splitFound: true` and the trimmed text after the list
 */
function processNumberedList(listText: string, afterList: string, chunks: string[]): SplitResult {
  const items = listText.split(/\n(?=\d{1,2}\.\s+)/v).filter((item) => item.trim().length > ZERO);

  let totalLength = ZERO;
  let hasHugeItems = false;
  for (const item of items) {
    totalLength += item.length;
    if (item.length > HUGE_ITEM_LENGTH) {
      hasHugeItems = true;
    }
  }

  const hasLongItems = totalLength / items.length > AVG_ITEM_LENGTH_THRESHOLD;
  const splitPerItem = hasHugeItems || (hasLongItems && items.length <= MAX_ITEMS_FOR_LONG_SPLIT);

  if (splitPerItem) {
    const trimmedItems = items.map((item) => item.trim()).filter((item) => item.length > ZERO);
    chunks.push(...trimmedItems);
  } else {
    chunks.push(listText.trim());
  }

  return { splitFound: true, newRemainingText: afterList.trim() };
}
34
+
35
/**
 * Emits a bullet list into `chunks`.
 *
 * The list is cut before every line that starts with "-" or "•". Each item
 * becomes its own chunk when any item exceeds `HUGE_ITEM_LENGTH`; otherwise
 * the whole list is pushed as a single chunk.
 *
 * @param listText - text of the list section
 * @param afterList - text following the list section
 * @param chunks - output array, mutated in place
 * @returns a result with `splitFound: true` and the trimmed text after the list
 */
function processBulletList(listText: string, afterList: string, chunks: string[]): SplitResult {
  const items = listText.split(/\n(?=[\-•]\s+)/v).filter((item) => item.trim().length > ZERO);
  const splitPerItem = items.some((item) => item.length > HUGE_ITEM_LENGTH);

  if (splitPerItem) {
    for (const item of items) {
      const trimmedItem = item.trim();
      if (trimmedItem.length > ZERO) {
        chunks.push(trimmedItem);
      }
    }
  } else {
    chunks.push(listText.trim());
  }

  return { splitFound: true, newRemainingText: afterList.trim() };
}
55
+
56
/**
 * Looks for a list section in `remainingText` and, when one exists, hands it to
 * the numbered-list or bullet-list processor according to the section type.
 *
 * @param remainingText - text still to be split
 * @param chunks - output array, mutated by the list processors
 * @returns the processor's result, or `splitFound: false` with the text
 *          unchanged when no list section is found
 */
export function processListSection(remainingText: string, chunks: string[]): SplitResult {
  const listSection = findListSection(remainingText);
  if (listSection === null) {
    return { splitFound: false, newRemainingText: remainingText };
  }

  const { start, end, type } = listSection;
  const listText = remainingText.substring(start, end);
  const afterList = remainingText.substring(end);
  const handler = type === 'numbered' ? processNumberedList : processBulletList;

  return handler(listText, afterList, chunks);
}
@@ -0,0 +1,124 @@
1
+ import { INDEX_OFFSET, MIN_CHUNK_SIZE } from './constants.js';
2
+
3
/** A trimmed chunk longer than this many characters counts as a long paragraph. */
const LONG_PARAGRAPH_THRESHOLD = 150;

/** Numeric zero, used as the loop start and for length comparisons. */
const ZERO = 0;

/** Reads one chunk; yields `undefined` when `index` is outside the array. */
function getChunkAt(chunks: string[], index: number): string | undefined {
  return chunks[index];
}

/** True when the chunk, ignoring leading whitespace, opens with "¿". */
function startsWithQuestion(chunk: string): boolean {
  return chunk.trimStart().startsWith('¿');
}

/** True when the trimmed chunk opens with a 1-2 digit list marker or a "-"/"•" bullet. */
function startsWithList(chunk: string): boolean {
  const trimmed = chunk.trim();
  return /^(?:\d{1,2}\.\s+|[\-•]\s+)/v.test(trimmed);
}

/** True when the chunk, ignoring trailing whitespace, closes with ":". */
function endsWithColon(chunk: string): boolean {
  return chunk.trimEnd().endsWith(':');
}

/** True when the trimmed chunk is longer than `LONG_PARAGRAPH_THRESHOLD`. */
function isLongParagraph(chunk: string): boolean {
  const { length } = chunk.trim();
  return length > LONG_PARAGRAPH_THRESHOLD;
}
23
+
24
/**
 * Decides whether `chunk` should be joined with the chunk that follows it.
 *
 * No merge happens when `chunk` ends with a colon and introduces a list or a
 * long paragraph, or when `nextChunk` itself ends with a colon. Otherwise the
 * chunk is merged when it is shorter than `MIN_CHUNK_SIZE`, the next chunk
 * does not open with "¿", and `isLastChunk` is false.
 *
 * @returns both flags carry the same value
 */
const shouldMergeWithNext = (
  chunk: string,
  nextChunk: string,
  isLastChunk: boolean
): { shouldMerge: boolean; skipToNext: boolean } => {
  const introducesBlock = endsWithColon(chunk) && (startsWithList(nextChunk) || isLongParagraph(nextChunk));
  if (introducesBlock || endsWithColon(nextChunk)) {
    return { shouldMerge: false, skipToNext: false };
  }

  const isSmall = chunk.trim().length < MIN_CHUNK_SIZE;
  const shouldMerge = isSmall && !startsWithQuestion(nextChunk) && !isLastChunk;
  return { shouldMerge, skipToNext: shouldMerge };
};
41
+
42
/**
 * Appends the trimmed `chunk` to the last entry of `mergedChunks`, separated by
 * a single space. Does nothing when `mergedChunks` has no entry to extend.
 */
function handleLastChunkMerge(chunk: string, mergedChunks: string[]): void {
  const previousChunk = mergedChunks.pop();
  if (previousChunk === undefined) {
    return;
  }
  mergedChunks.push([previousChunk, chunk.trim()].join(' '));
}
49
+
50
/**
 * True when the final chunk is shorter than `MIN_CHUNK_SIZE`, at least one
 * chunk has already been emitted, and the final chunk does not open with "¿".
 */
function shouldMergeLastWithPrevious(chunk: string, mergedChunksLength: number): boolean {
  const isSmall = chunk.trim().length < MIN_CHUNK_SIZE;
  const hasPrevious = mergedChunksLength > ZERO;
  return isSmall && hasPrevious && !startsWithQuestion(chunk);
}
53
+
54
/**
 * Picks the chunk to process on this iteration: the carried-over merge result
 * when one exists, otherwise the chunk stored at `index`.
 */
const getCurrentChunk = (
  pendingMerge: string | null,
  workingChunks: string[],
  index: number
): string | undefined => (pendingMerge === null ? getChunkAt(workingChunks, index) : pendingMerge);
63
+
64
/**
 * Emits the final chunk: folded into the previously emitted chunk when it
 * qualifies for a merge, pushed as its own entry otherwise.
 */
function processLastChunk(chunk: string, mergedChunks: string[]): void {
  if (!shouldMergeLastWithPrevious(chunk, mergedChunks.length)) {
    mergedChunks.push(chunk);
    return;
  }
  handleLastChunkMerge(chunk, mergedChunks);
}
72
+
73
/**
 * Handles a chunk that is not the final one.
 *
 * @returns the chunk joined with `nextChunk` (space separated) when the two
 *          should merge, so the caller can carry it forward; otherwise `null`
 *          after pushing `currentChunk` to `mergedChunks`
 */
function processNonLastChunk(
  currentChunk: string,
  nextChunk: string | undefined,
  mergedChunks: string[]
): string | null {
  if (nextChunk !== undefined) {
    const decision = shouldMergeWithNext(currentChunk, nextChunk, false);
    if (decision.shouldMerge && decision.skipToNext) {
      return `${currentChunk} ${nextChunk}`;
    }
  }

  mergedChunks.push(currentChunk);
  return null;
}
93
+
94
/**
 * Merges small chunks with adjacent chunks.
 *
 * Walks the chunks in order. A chunk that should merge with its successor is
 * carried forward as a joined string and takes the successor's place on the
 * next iteration; the final chunk may instead be folded into the previously
 * emitted chunk. The input array is not modified.
 *
 * @param chunks - chunks produced by the splitter
 * @returns a new array of merged chunks
 */
export function mergeSmallChunks(chunks: string[]): string[] {
  const workingChunks = [...chunks];
  const mergedChunks: string[] = [];
  let carried: string | null = null;

  for (let index = ZERO; index < workingChunks.length; index += INDEX_OFFSET) {
    const current = getCurrentChunk(carried, workingChunks, index);
    carried = null;

    if (current === undefined) continue;

    if (index === workingChunks.length - INDEX_OFFSET) {
      processLastChunk(current, mergedChunks);
      continue;
    }

    const upcoming = getChunkAt(workingChunks, index + INDEX_OFFSET);
    carried = processNonLastChunk(current, upcoming, mergedChunks);
  }

  return mergedChunks;
}
@@ -0,0 +1,86 @@
1
+ import { FIRST_NEWLINE_SEARCH_LIMIT, INDEX_OFFSET } from './constants.js';
2
+ import { findMarkdownSection } from './sections.js';
3
+ import { type SplitResult, hasQuestionWithOptionsPattern } from './splitProcessors.js';
4
+
5
/** A paragraph longer than this many characters counts as long. */
const LONG_PARAGRAPH_THRESHOLD = 150;

/** Minimum number of lines/paragraphs required before a split is considered. */
const MIN_LIST_ITEMS_FOR_OPTIONS = 2;

/** Numeric zero, used for substring starts and length comparisons. */
const ZERO = 0;
13
+
14
/**
 * Splits a short intro line ending in ":" from the paragraphs that follow it.
 *
 * The split happens only when the first newline occurs before
 * `FIRST_NEWLINE_SEARCH_LIMIT`, the first line (trimmed) ends with a colon, and
 * the remaining text has at least `MIN_LIST_ITEMS_FOR_OPTIONS` non-blank lines
 * of which at least one exceeds `LONG_PARAGRAPH_THRESHOLD` characters.
 *
 * @param remainingText - text still to be split
 * @param chunks - output array; receives the trimmed intro line on a split
 * @returns on a split, the untrimmed text after the first newline as the new
 *          remaining text; otherwise `splitFound: false` with the text unchanged
 */
export function processLongParagraphsAfterIntro(remainingText: string, chunks: string[]): SplitResult {
  const NOT_FOUND = -1;
  const unchanged: SplitResult = { splitFound: false, newRemainingText: remainingText };

  const firstNewline = remainingText.indexOf('\n');
  if (firstNewline === NOT_FOUND || firstNewline >= FIRST_NEWLINE_SEARCH_LIMIT) {
    return unchanged;
  }

  const intro = remainingText.substring(ZERO, firstNewline).trim();
  if (!intro.endsWith(':')) {
    return unchanged;
  }

  const afterIntro = remainingText.substring(firstNewline + INDEX_OFFSET);
  const paragraphs = afterIntro.split('\n').filter((p) => p.trim().length > ZERO);
  const hasEnoughParagraphs = paragraphs.length >= MIN_LIST_ITEMS_FOR_OPTIONS;
  const hasLongParagraph = paragraphs.some((p) => p.length > LONG_PARAGRAPH_THRESHOLD);

  if (!(hasEnoughParagraphs && hasLongParagraph)) {
    return unchanged;
  }

  chunks.push(intro);
  return { splitFound: true, newRemainingText: afterIntro };
}
43
+
44
/**
 * Splits off a long first paragraph from the lines that follow it.
 *
 * Requires at least `MIN_LIST_ITEMS_FOR_OPTIONS` lines and a first line whose
 * trimmed length exceeds `LONG_PARAGRAPH_THRESHOLD`. No split is made when the
 * text after the first line matches the question-with-options pattern.
 *
 * @param remainingText - text still to be split
 * @param chunks - output array; receives the trimmed first paragraph on a split
 * @returns on a split, the trimmed text after the first line as the new
 *          remaining text; otherwise `splitFound: false` with the text unchanged
 */
export function processLongParagraphSequence(remainingText: string, chunks: string[]): SplitResult {
  const unchanged: SplitResult = { splitFound: false, newRemainingText: remainingText };

  const lines = remainingText.split('\n');
  const [firstLine] = lines;
  if (firstLine === undefined || lines.length < MIN_LIST_ITEMS_FOR_OPTIONS) {
    return unchanged;
  }

  const firstParagraph = firstLine.trim();
  const isLong = firstParagraph.length > LONG_PARAGRAPH_THRESHOLD;
  if (!isLong) {
    return unchanged;
  }

  const afterFirstParagraph = lines.slice(INDEX_OFFSET).join('\n').trim();
  if (hasQuestionWithOptionsPattern(afterFirstParagraph)) {
    return unchanged;
  }

  chunks.push(firstParagraph);
  return { splitFound: true, newRemainingText: afterFirstParagraph };
}
70
+
71
/**
 * Emits a markdown section as one chunk when `findMarkdownSection` reports one.
 *
 * The new remaining text is everything past the first `fullSection.length`
 * characters of `remainingText`, trimmed.
 * NOTE(review): this presumes the section starts at index 0 of `remainingText`;
 * confirm against `findMarkdownSection`.
 *
 * @param remainingText - text still to be split
 * @param chunks - output array; receives the trimmed section on a split
 * @returns the split result, or `splitFound: false` with the text unchanged
 */
export function processMarkdownSection(remainingText: string, chunks: string[]): SplitResult {
  const markdownSection = findMarkdownSection(remainingText);
  if (markdownSection === null) {
    return { splitFound: false, newRemainingText: remainingText };
  }

  const { fullSection } = markdownSection;
  chunks.push(fullSection.trim());

  const rest = remainingText.substring(fullSection.length).trim();
  return { splitFound: true, newRemainingText: rest };
}
@@ -0,0 +1,148 @@
1
+ import {
2
+ CURRENT_TEXT_SHORT_THRESHOLD,
3
+ INDEX_OFFSET,
4
+ SHORT_CHUNK_THRESHOLD,
5
+ SHORT_QUESTION_FRAGMENT_THRESHOLD,
6
+ } from './constants.js';
7
+ import { isPositionInsideParentheses } from './positionHelpers.js';
8
+ import type { SplitResult } from './splitProcessors.js';
9
+ import { smartTrim } from './textHelpers.js';
10
+
11
/** Span of text in which periods must not be used as split points. */
interface ProtectedRange {
  /** Index of the first protected character. */
  start: number;
  /** Index just past the last protected character. */
  end: number;
}

/** Numeric zero, used as the scan start and for comparisons. */
const ZERO = 0;

/** Step by which the character scan advances. */
const INCREMENT = 1;
20
+
21
/**
 * Appends one protected range per match of `pattern`, covering the whole match.
 * `pattern` is passed to `matchAll`, so it must be a global regex.
 */
function addPatternMatches(text: string, pattern: RegExp, ranges: ProtectedRange[]): void {
  for (const { 0: matchedText, index: start } of text.matchAll(pattern)) {
    ranges.push({ start, end: start + matchedText.length });
  }
}
29
+
30
/**
 * Appends one protected range per match of `pattern`, covering only the final
 * character of the match (the period that closes it).
 * `pattern` is passed to `matchAll`, so it must be a global regex.
 */
function addPeriodEndingMatches(text: string, pattern: RegExp, ranges: ProtectedRange[]): void {
  for (const { 0: matchedText, index } of text.matchAll(pattern)) {
    const periodIndex = index + matchedText.length - INDEX_OFFSET;
    ranges.push({ start: periodIndex, end: periodIndex + INDEX_OFFSET });
  }
}
39
+
40
/** "http(s)://" and "www." URLs; the match stops before trailing sentence punctuation. */
const URL_PATTERN = /https?:\/\/[^\s]*[^\s.!?,;:]|www\.[^\s]*[^\s.!?,;:]/gv;

/** Bare domain names ending in one of the listed TLDs, with an optional 2-3 letter suffix. */
const PLAIN_DOMAIN_PATTERN =
  /\b[a-zA-Z0-9][a-zA-Z0-9\-]*(?:\.[a-zA-Z0-9][a-zA-Z0-9\-]*)*\.(?:com|co|net|org|edu|gov|io|ai|app|dev|ly|me|tv|info|biz|tech|store|shop|online|site|web|blog|news|uk|ca|au|de|fr|es|it|nl|mx|ar|br|cl|pe|ve|uy|py|bo|gt|hn|sv|cr|pa|ni|do|cu|pr)(?:\.[a-z]{2,3})?\b/giv;

/** Email addresses. */
const EMAIL_PATTERN = /[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}/gv;

/** Numbers containing periods: dot-grouped thousands (optionally "$"-prefixed) or dotted digit runs. */
const NUMBER_PATTERN = /\$?\d{1,3}(?:\.\d{3})+(?:\.\d+)?|\d+(?:\.\d+)+/gv;
46
+
47
/** Numbered list marker ("1.") at the start of the text or right after a newline. */
const NUMBERED_LIST_PATTERN = /(?:^|\n)\s*\d+\./gv;

/** Common abbreviations written with a trailing period (case-insensitive). */
const ABBREVIATIONS_PATTERN = /\b(?:etc|e\.g|i\.e|dr|mr|mrs|ms|prof|sr|jr|inc|ltd|co|corp)\./giv;

/** Dotted capital-letter abbreviations such as "U.S.A.". */
const LOCATION_ABBR_PATTERN = /\b[A-Z]\.(?:[A-Z]\.)+/gv;

/** A "-" or "•" bullet line, matched up to the last period on that line. */
const BULLET_POINT_PATTERN = /(?:^|\n)\s*[\-•]\s+[^\n]+\./gmv;
52
+
53
/**
 * Collects protected ranges covering every URL, plain domain, email address and
 * dotted number in `text`, grouped by pattern in that order.
 */
function findUrlEmailRanges(text: string): ProtectedRange[] {
  const ranges: ProtectedRange[] = [];
  const wholeMatchPatterns = [URL_PATTERN, PLAIN_DOMAIN_PATTERN, EMAIL_PATTERN, NUMBER_PATTERN];

  for (const pattern of wholeMatchPatterns) {
    addPatternMatches(text, pattern, ranges);
  }

  return ranges;
}
62
+
63
/**
 * Collects protected ranges for periods that belong to list markers,
 * abbreviations and bullet lines.
 *
 * Numbered markers, abbreviations and bullet lines protect only their closing
 * period; dotted capital-letter abbreviations protect the whole match.
 */
function findListAbbreviationRanges(text: string): ProtectedRange[] {
  const ranges: ProtectedRange[] = [];

  for (const pattern of [NUMBERED_LIST_PATTERN, ABBREVIATIONS_PATTERN]) {
    addPeriodEndingMatches(text, pattern, ranges);
  }
  addPatternMatches(text, LOCATION_ABBR_PATTERN, ranges);
  addPeriodEndingMatches(text, BULLET_POINT_PATTERN, ranges);

  return ranges;
}
72
+
73
/**
 * Gathers every protected range in `text`: URL/domain/email/number ranges
 * first, then list-marker and abbreviation ranges.
 */
function findProtectedRanges(text: string): ProtectedRange[] {
  const urlLikeRanges = findUrlEmailRanges(text);
  const periodRanges = findListAbbreviationRanges(text);
  return urlLikeRanges.concat(periodRanges);
}
78
+
79
/**
 * True when `position` lies inside any range, counting `start` as inside and
 * `end` as outside.
 */
function isPositionProtected(position: number, ranges: ProtectedRange[]): boolean {
  for (const { start, end } of ranges) {
    if (position >= start && position < end) {
      return true;
    }
  }
  return false;
}
82
+
83
/**
 * Lists, in ascending order, the index of every "." in `text` that is neither
 * inside a protected range nor inside parentheses.
 */
function findValidPeriodIndices(text: string, protectedRanges: ProtectedRange[]): number[] {
  const periodIndices: number[] = [];

  for (let index = ZERO; index < text.length; index += INCREMENT) {
    const isSplittablePeriod =
      text[index] === '.' &&
      !isPositionProtected(index, protectedRanges) &&
      !isPositionInsideParentheses(text, index);

    if (isSplittablePeriod) {
      periodIndices.push(index);
    }
  }

  return periodIndices;
}
96
+
97
/**
 * True when the text after the period contains a "?" and is shorter than
 * `SHORT_QUESTION_FRAGMENT_THRESHOLD`, i.e. splitting would strand a short question.
 */
function shouldSkipForShortQuestion(afterPeriod: string): boolean {
  if (!afterPeriod.includes('?')) {
    return false;
  }
  return afterPeriod.length < SHORT_QUESTION_FRAGMENT_THRESHOLD;
}
100
+
101
/**
 * True when a split here would pile up short pieces: the last emitted chunk is
 * shorter than `SHORT_CHUNK_THRESHOLD`, and both the remaining text and the
 * text after the period are shorter than `CURRENT_TEXT_SHORT_THRESHOLD`.
 * With no emitted chunk yet, the result is always `false`.
 */
function shouldSkipForShortChunks(
  chunks: string[],
  remainingTextLength: number,
  afterPeriodLength: number
): boolean {
  const lastChunkIndex = chunks.length - INDEX_OFFSET;
  if (!(lastChunkIndex >= ZERO)) {
    return false;
  }

  const lastChunk = chunks[lastChunkIndex];
  if (lastChunk === undefined) {
    return false;
  }

  const lastChunkWasShort = lastChunk.trim().length < SHORT_CHUNK_THRESHOLD;
  if (!lastChunkWasShort) {
    return false;
  }

  return remainingTextLength < CURRENT_TEXT_SHORT_THRESHOLD && afterPeriodLength < CURRENT_TEXT_SHORT_THRESHOLD;
}
117
+
118
/**
 * Attempts a split right after the period at `periodIndex`.
 *
 * The attempt is abandoned when the period is the final character, when nothing
 * remains after it once trimmed by `smartTrim`, or when either short-fragment
 * rule applies. On success the text up to and including the period is pushed to
 * `chunks` untrimmed.
 *
 * @returns the split result, or `null` when this period is not a split point
 */
function processSinglePeriod(remainingText: string, periodIndex: number, chunks: string[]): SplitResult | null {
  if (periodIndex >= remainingText.length - INDEX_OFFSET) {
    return null;
  }

  const cutPoint = periodIndex + INDEX_OFFSET;
  const afterPeriod = smartTrim(remainingText.substring(cutPoint));

  const cannotSplit =
    afterPeriod.length === ZERO ||
    shouldSkipForShortQuestion(afterPeriod) ||
    shouldSkipForShortChunks(chunks, remainingText.length, afterPeriod.length);
  if (cannotSplit) {
    return null;
  }

  chunks.push(remainingText.substring(ZERO, cutPoint));
  return { splitFound: true, newRemainingText: afterPeriod };
}
136
+
137
/**
 * Splits `remainingText` at the first period that qualifies as a sentence
 * boundary, trying unprotected periods from left to right.
 *
 * @param remainingText - text still to be split
 * @param chunks - output array; receives the text up to the chosen period
 * @returns the first successful split, or `splitFound: false` with the text
 *          unchanged when no period qualifies
 */
export function processPeriodSplits(remainingText: string, chunks: string[]): SplitResult {
  const candidates = findValidPeriodIndices(remainingText, findProtectedRanges(remainingText));

  for (const candidate of candidates) {
    const outcome = processSinglePeriod(remainingText, candidate, chunks);
    if (outcome !== null) {
      return outcome;
    }
  }

  return { splitFound: false, newRemainingText: remainingText };
}
@@ -0,0 +1,66 @@
1
/** Named numeric constants used by the position helpers below. */

/** Lower bound for indices and counters. */
const ZERO = 0;
/** Step added when advancing an index or raising a counter. */
const INCREMENT = 1;
/** Step subtracted when moving an index back or lowering a counter. */
const DECREMENT = 1;
5
+
6
/**
 * Read the character stored at `position` in `text`.
 *
 * @returns The character, or `undefined` when `position` is not a valid index.
 */
const getCharAt = (text: string, position: number): string | undefined => {
  return text[position];
};
8
+
9
/**
 * Fold one character into a running count of currently open parentheses.
 *
 * @param currentCount - Number of parentheses open before this character.
 * @param char - The character being examined.
 * @returns The count after the character: one higher for `(`, one lower for
 *   `)` (never below zero), unchanged for anything else.
 */
const updateOpenCount = (currentCount: number, char: string): number => {
  switch (char) {
    case '(':
      return currentCount + INCREMENT;
    case ')':
      // A closing parenthesis with nothing open must not push the count negative.
      return Math.max(ZERO, currentCount - DECREMENT);
    default:
      return currentCount;
  }
};
20
+
21
/**
 * Tell whether the character at `position` sits on a bullet-point line.
 *
 * A bullet line is one whose trimmed text starts with `-` or `•` followed by
 * at least one whitespace character. Callers use this to avoid splitting a
 * bullet list at punctuation inside one of its items.
 *
 * @param text - Full text being inspected.
 * @param position - Index into `text`.
 * @returns `true` when the line containing `position` is a bullet line;
 *   `false` otherwise, including when `position` is outside the text.
 */
export const isPositionInBulletLine = (text: string, position: number): boolean => {
  const isOutOfRange = position < ZERO || position >= text.length;
  if (isOutOfRange) return false;

  // Walk left until the previous character is a newline or the text begins.
  let startOfLine = position;
  for (; startOfLine > ZERO; startOfLine -= DECREMENT) {
    if (getCharAt(text, startOfLine - DECREMENT) === '\n') break;
  }

  // Walk right until a newline (excluded from the line) or the text ends.
  let endOfLine = position;
  for (; endOfLine < text.length; endOfLine += INCREMENT) {
    if (getCharAt(text, endOfLine) === '\n') break;
  }

  const bulletPrefix = /^[\-•]\s+/v;
  return bulletPrefix.test(text.substring(startOfLine, endOfLine).trim());
};
43
+
44
/**
 * Count the parentheses still open just before `position`.
 *
 * Characters at indices `0` up to, but not including, `position` are fed
 * through `updateOpenCount` one at a time.
 *
 * @param text - Text to scan.
 * @param position - Exclusive end index of the scan.
 * @returns The number of parentheses opened and not yet closed before `position`.
 */
const countOpenParentheses = (text: string, position: number): number => {
  let depth = ZERO;
  let cursor = ZERO;

  while (cursor < position) {
    // Indices past the end of the text yield no character; treat them as ''.
    depth = updateOpenCount(depth, getCharAt(text, cursor) ?? '');
    cursor += INCREMENT;
  }

  return depth;
};
57
+
58
/**
 * Tell whether `position` lies inside a parenthetical expression.
 *
 * The position counts as inside when at least one `(` before it has not been
 * closed by a matching `)`. Callers use this to avoid splitting text in the
 * middle of a parenthetical.
 *
 * @param text - Full text being inspected.
 * @param position - Index into `text`.
 * @returns `true` when an unclosed `(` precedes `position`; `false` otherwise,
 *   including when `position` is outside the text.
 */
export const isPositionInsideParentheses = (text: string, position: number): boolean => {
  const isWithinText = position >= ZERO && position < text.length;
  return isWithinText && countOpenParentheses(text, position) > ZERO;
};