@daviddh/llm-markdown-whatsapp 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. package/.prettierrc +17 -0
  2. package/CLAUDE.md +155 -0
  3. package/README.md +304 -0
  4. package/eslint.config.mjs +28 -0
  5. package/jest.config.js +40 -0
  6. package/package.json +61 -0
  7. package/packages/core/dist/__tests__/splitChatText.basic.test.d.ts +2 -0
  8. package/packages/core/dist/__tests__/splitChatText.basic.test.d.ts.map +1 -0
  9. package/packages/core/dist/__tests__/splitChatText.basic.test.js +100 -0
  10. package/packages/core/dist/__tests__/splitChatText.coverageLists.test.d.ts +2 -0
  11. package/packages/core/dist/__tests__/splitChatText.coverageLists.test.d.ts.map +1 -0
  12. package/packages/core/dist/__tests__/splitChatText.coverageLists.test.js +88 -0
  13. package/packages/core/dist/__tests__/splitChatText.coverageProcessors.test.d.ts +2 -0
  14. package/packages/core/dist/__tests__/splitChatText.coverageProcessors.test.d.ts.map +1 -0
  15. package/packages/core/dist/__tests__/splitChatText.coverageProcessors.test.js +108 -0
  16. package/packages/core/dist/__tests__/splitChatText.coverageQuestions.test.d.ts +2 -0
  17. package/packages/core/dist/__tests__/splitChatText.coverageQuestions.test.d.ts.map +1 -0
  18. package/packages/core/dist/__tests__/splitChatText.coverageQuestions.test.js +74 -0
  19. package/packages/core/dist/__tests__/splitChatText.dataProtection.test.d.ts +2 -0
  20. package/packages/core/dist/__tests__/splitChatText.dataProtection.test.d.ts.map +1 -0
  21. package/packages/core/dist/__tests__/splitChatText.dataProtection.test.js +80 -0
  22. package/packages/core/dist/__tests__/splitChatText.dataTests1.test.d.ts +2 -0
  23. package/packages/core/dist/__tests__/splitChatText.dataTests1.test.d.ts.map +1 -0
  24. package/packages/core/dist/__tests__/splitChatText.dataTests1.test.js +124 -0
  25. package/packages/core/dist/__tests__/splitChatText.dataTests2.test.d.ts +2 -0
  26. package/packages/core/dist/__tests__/splitChatText.dataTests2.test.d.ts.map +1 -0
  27. package/packages/core/dist/__tests__/splitChatText.dataTests2.test.js +122 -0
  28. package/packages/core/dist/__tests__/splitChatText.edgeCases.test.d.ts +2 -0
  29. package/packages/core/dist/__tests__/splitChatText.edgeCases.test.d.ts.map +1 -0
  30. package/packages/core/dist/__tests__/splitChatText.edgeCases.test.js +132 -0
  31. package/packages/core/dist/__tests__/splitChatText.helpers.d.ts +2 -0
  32. package/packages/core/dist/__tests__/splitChatText.helpers.d.ts.map +1 -0
  33. package/packages/core/dist/__tests__/splitChatText.helpers.js +5 -0
  34. package/packages/core/dist/__tests__/splitChatText.punctuation.test.d.ts +2 -0
  35. package/packages/core/dist/__tests__/splitChatText.punctuation.test.d.ts.map +1 -0
  36. package/packages/core/dist/__tests__/splitChatText.punctuation.test.js +98 -0
  37. package/packages/core/dist/__tests__/splitChatText.realWorld.test.d.ts +2 -0
  38. package/packages/core/dist/__tests__/splitChatText.realWorld.test.d.ts.map +1 -0
  39. package/packages/core/dist/__tests__/splitChatText.realWorld.test.js +104 -0
  40. package/packages/core/dist/__tests__/splitChatText.urlProtection.test.d.ts +2 -0
  41. package/packages/core/dist/__tests__/splitChatText.urlProtection.test.d.ts.map +1 -0
  42. package/packages/core/dist/__tests__/splitChatText.urlProtection.test.js +82 -0
  43. package/packages/core/dist/__tests__/strs.splitChatText.test.d.ts +2 -0
  44. package/packages/core/dist/__tests__/strs.splitChatText.test.d.ts.map +1 -0
  45. package/packages/core/dist/__tests__/strs.splitChatText.test.js +992 -0
  46. package/packages/core/dist/chatSplit/breakProcessor.d.ts +4 -0
  47. package/packages/core/dist/chatSplit/breakProcessor.d.ts.map +1 -0
  48. package/packages/core/dist/chatSplit/breakProcessor.js +67 -0
  49. package/packages/core/dist/chatSplit/constants.d.ts +35 -0
  50. package/packages/core/dist/chatSplit/constants.d.ts.map +1 -0
  51. package/packages/core/dist/chatSplit/constants.js +34 -0
  52. package/packages/core/dist/chatSplit/index.d.ts +2 -0
  53. package/packages/core/dist/chatSplit/index.d.ts.map +1 -0
  54. package/packages/core/dist/chatSplit/index.js +1 -0
  55. package/packages/core/dist/chatSplit/listNormalization.d.ts +13 -0
  56. package/packages/core/dist/chatSplit/listNormalization.d.ts.map +1 -0
  57. package/packages/core/dist/chatSplit/listNormalization.js +140 -0
  58. package/packages/core/dist/chatSplit/listProcessor.d.ts +6 -0
  59. package/packages/core/dist/chatSplit/listProcessor.d.ts.map +1 -0
  60. package/packages/core/dist/chatSplit/listProcessor.js +61 -0
  61. package/packages/core/dist/chatSplit/mergeProcessor.d.ts +3 -0
  62. package/packages/core/dist/chatSplit/mergeProcessor.d.ts.map +1 -0
  63. package/packages/core/dist/chatSplit/mergeProcessor.js +88 -0
  64. package/packages/core/dist/chatSplit/paragraphProcessor.d.ts +14 -0
  65. package/packages/core/dist/chatSplit/paragraphProcessor.d.ts.map +1 -0
  66. package/packages/core/dist/chatSplit/paragraphProcessor.js +66 -0
  67. package/packages/core/dist/chatSplit/periodProcessor.d.ts +4 -0
  68. package/packages/core/dist/chatSplit/periodProcessor.d.ts.map +1 -0
  69. package/packages/core/dist/chatSplit/periodProcessor.js +110 -0
  70. package/packages/core/dist/chatSplit/positionHelpers.d.ts +12 -0
  71. package/packages/core/dist/chatSplit/positionHelpers.d.ts.map +1 -0
  72. package/packages/core/dist/chatSplit/positionHelpers.js +57 -0
  73. package/packages/core/dist/chatSplit/productCardProcessor.d.ts +12 -0
  74. package/packages/core/dist/chatSplit/productCardProcessor.d.ts.map +1 -0
  75. package/packages/core/dist/chatSplit/productCardProcessor.js +138 -0
  76. package/packages/core/dist/chatSplit/punctuationNormalization.d.ts +5 -0
  77. package/packages/core/dist/chatSplit/punctuationNormalization.d.ts.map +1 -0
  78. package/packages/core/dist/chatSplit/punctuationNormalization.js +103 -0
  79. package/packages/core/dist/chatSplit/questionProcessor.d.ts +6 -0
  80. package/packages/core/dist/chatSplit/questionProcessor.d.ts.map +1 -0
  81. package/packages/core/dist/chatSplit/questionProcessor.js +212 -0
  82. package/packages/core/dist/chatSplit/sections.d.ts +23 -0
  83. package/packages/core/dist/chatSplit/sections.d.ts.map +1 -0
  84. package/packages/core/dist/chatSplit/sections.js +153 -0
  85. package/packages/core/dist/chatSplit/splitChatText.d.ts +6 -0
  86. package/packages/core/dist/chatSplit/splitChatText.d.ts.map +1 -0
  87. package/packages/core/dist/chatSplit/splitChatText.js +119 -0
  88. package/packages/core/dist/chatSplit/splitConstants.d.ts +3 -0
  89. package/packages/core/dist/chatSplit/splitConstants.d.ts.map +1 -0
  90. package/packages/core/dist/chatSplit/splitConstants.js +2 -0
  91. package/packages/core/dist/chatSplit/splitProcessors.d.ts +22 -0
  92. package/packages/core/dist/chatSplit/splitProcessors.d.ts.map +1 -0
  93. package/packages/core/dist/chatSplit/splitProcessors.js +105 -0
  94. package/packages/core/dist/chatSplit/textHelpers.d.ts +27 -0
  95. package/packages/core/dist/chatSplit/textHelpers.d.ts.map +1 -0
  96. package/packages/core/dist/chatSplit/textHelpers.js +77 -0
  97. package/packages/core/dist/chatSplit/urlNormalization.d.ts +7 -0
  98. package/packages/core/dist/chatSplit/urlNormalization.d.ts.map +1 -0
  99. package/packages/core/dist/chatSplit/urlNormalization.js +13 -0
  100. package/packages/core/dist/index.d.ts +2 -0
  101. package/packages/core/dist/index.d.ts.map +1 -0
  102. package/packages/core/dist/index.js +1 -0
  103. package/packages/core/jest.config.js +23 -0
  104. package/packages/core/package.json +38 -0
  105. package/packages/core/src/__tests__/splitChatText.basic.test.ts +123 -0
  106. package/packages/core/src/__tests__/splitChatText.coverageLists.test.ts +108 -0
  107. package/packages/core/src/__tests__/splitChatText.coverageProcessors.test.ts +172 -0
  108. package/packages/core/src/__tests__/splitChatText.coverageQuestions.test.ts +95 -0
  109. package/packages/core/src/__tests__/splitChatText.dataProtection.test.ts +96 -0
  110. package/packages/core/src/__tests__/splitChatText.dataTests1.test.ts +137 -0
  111. package/packages/core/src/__tests__/splitChatText.dataTests2.test.ts +134 -0
  112. package/packages/core/src/__tests__/splitChatText.edgeCases.test.ts +157 -0
  113. package/packages/core/src/__tests__/splitChatText.helpers.ts +6 -0
  114. package/packages/core/src/__tests__/splitChatText.punctuation.test.ts +113 -0
  115. package/packages/core/src/__tests__/splitChatText.realWorld.test.ts +118 -0
  116. package/packages/core/src/__tests__/splitChatText.urlProtection.test.ts +102 -0
  117. package/packages/core/src/chatSplit/breakProcessor.ts +103 -0
  118. package/packages/core/src/chatSplit/constants.ts +50 -0
  119. package/packages/core/src/chatSplit/index.ts +1 -0
  120. package/packages/core/src/chatSplit/listNormalization.ts +189 -0
  121. package/packages/core/src/chatSplit/listProcessor.ts +74 -0
  122. package/packages/core/src/chatSplit/mergeProcessor.ts +124 -0
  123. package/packages/core/src/chatSplit/paragraphProcessor.ts +86 -0
  124. package/packages/core/src/chatSplit/periodProcessor.ts +148 -0
  125. package/packages/core/src/chatSplit/positionHelpers.ts +66 -0
  126. package/packages/core/src/chatSplit/productCardProcessor.ts +184 -0
  127. package/packages/core/src/chatSplit/punctuationNormalization.ts +142 -0
  128. package/packages/core/src/chatSplit/questionProcessor.ts +298 -0
  129. package/packages/core/src/chatSplit/sections.ts +243 -0
  130. package/packages/core/src/chatSplit/splitChatText.ts +156 -0
  131. package/packages/core/src/chatSplit/splitConstants.ts +2 -0
  132. package/packages/core/src/chatSplit/splitProcessors.ts +153 -0
  133. package/packages/core/src/chatSplit/textHelpers.ts +86 -0
  134. package/packages/core/src/chatSplit/urlNormalization.ts +17 -0
  135. package/packages/core/src/index.ts +1 -0
  136. package/packages/core/tsconfig.build.json +4 -0
  137. package/packages/core/tsconfig.json +25 -0
  138. package/tsconfig.json +19 -0
package/.prettierrc ADDED
@@ -0,0 +1,17 @@
1
+ {
2
+ "plugins": ["@trivago/prettier-plugin-sort-imports"],
3
+ "useTabs": false,
4
+ "tabWidth": 2,
5
+ "singleQuote": true,
6
+ "printWidth": 110,
7
+ "trailingComma": "es5",
8
+ "importOrder": [
9
+ "<THIRD_PARTY_MODULES>",
10
+ "^@globalUtils/(.*)$",
11
+ "^@src/(.*)$",
12
+ "^@globalTypes/(.*)$",
13
+ "^[./]"
14
+ ],
15
+ "importOrderSeparation": true,
16
+ "importOrderSortSpecifiers": true
17
+ }
package/CLAUDE.md ADDED
@@ -0,0 +1,155 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ TypeScript monorepo that splits LLM-generated markdown text into WhatsApp-friendly chat message chunks. The core algorithm intelligently breaks long text at natural boundaries (questions, periods, lists, markdown sections) while preserving URLs, numbers, emails, abbreviations, parenthetical expressions, and Spanish punctuation.
8
+
9
+ The primary use case is Latin American e-commerce customer service over WhatsApp, where LLMs generate long Spanish responses about products (Nike shoes, clothing, etc.) that need to be split into readable chat messages.
10
+
11
+ ## Commands
12
+
13
+ ```bash
14
+ npm install # Install all workspace dependencies
15
+ npm run build # Build all packages
16
+ npm run build:core # Build core package only
17
+ npm test # Run all tests
18
+ npm run test:core # Run core tests only
19
+ npm run typecheck # Type check all packages (tsc -b)
20
+ npm run lint # ESLint
21
+ npm run format # Prettier
22
+ npm run check # Format + lint + typecheck
23
+
24
+ # Run a single test file
25
+ cd packages/core && NODE_OPTIONS='--experimental-vm-modules' npx jest --testPathPattern="splitChatText"
26
+
27
+ # Watch mode for core tests
28
+ cd packages/core && NODE_OPTIONS='--experimental-vm-modules' npx jest --watch
29
+ ```
30
+
31
+ Note: `NODE_OPTIONS='--experimental-vm-modules'` is required because the project uses ESM modules with ts-jest.
32
+
33
+ ## Architecture
34
+
35
+ **Monorepo structure:** npm workspaces with `packages/*`. Currently only `packages/core` (`@llm-markdown-whatsapp/core`) exists. The root `tsconfig.json` references additional packages (redis, e2e/*) that are not yet present.
36
+
37
+ **Core package entry point:** `packages/core/src/index.ts` re-exports `splitChatText` from `packages/core/src/chatSplit/index.ts`, which re-exports from `splitChatText.ts`. This is the single public API function.
38
+
39
+ ### splitChatText Pipeline
40
+
41
+ `splitChatText(text)` in `packages/core/src/chatSplit/splitChatText.ts` is the orchestrator. It accepts `string | null | undefined` and returns `string[]`.
42
+
43
+ **1. Pre-processing** (`preProcessText`):
44
+ - `removePeriodsAfterURLs` (`urlNormalization.ts`): Replaces `.` after URLs with `\n` (URLs never end with periods)
45
+ - `normalizeInlineNumberedList` (`listNormalization.ts`): Detects inline patterns like `1. X 2. Y 3. Z` and adds line breaks between items. Skips already-formatted lists. Handles both colon-preceding and question-preceding patterns.
46
+ - `normalizeInlineProductCardList` (`listNormalization.ts`): Detects inline product cards (with `🛍️` or markdown formatting + emoji indicators) and adds line breaks before each card, before emoji indicators within cards, and before trailing questions.
47
+
48
+ **2. Main loop** — iterates while `remainingText !== ''`, trying processors in priority order. The first match wins, and the remaining text is then re-evaluated from the top.
49
+
50
+ Processor groups (in `splitChatText.ts`):
51
+
52
+ - **`runIntroAndListProcessors`** (highest priority):
53
+ - `processIntroWithList` (`splitProcessors.ts`): Matches `intro:` + newline + list start (`\d. ` or `- `). If intro < 150 chars, splits after intro. Handles "Puedes responder con:" pattern specially.
54
+ - `processQuestionWithList` (`splitProcessors.ts`): Matches `question?\n` + numbered list. Keeps together as one chunk if total < 250 chars and >= 2 list items.
55
+ - `processIntroWithLongParagraphs` (`splitProcessors.ts`): Matches `intro:\n` + paragraph > 150 chars. Splits after intro.
56
+
57
+ - **`runContentStructureProcessors`**:
58
+ - `processProductCardLists` (`productCardProcessor.ts`): Detects product cards by emoji pattern (`\d. 🛍️`) or markdown pattern (`\d. *Title*` + emoji indicators). Extracts intro, splits each card into its own chunk (removing the `\d.` prefix), and separates trailing questions from the last card via `extractTrailingQuestion`.
59
+ - `processListSection` (`listProcessor.ts`): Uses `findListSection` (`sections.ts`) to detect numbered or bullet lists. Splits numbered lists per-item if items > 150 chars or average > 70 chars with <= 3 items. Splits bullet lists per-item only if items > 150 chars. Otherwise keeps the entire list as one chunk.
60
+ - `processLongParagraphsAfterIntro` (`paragraphProcessor.ts`): `intro:\n` followed by multiple paragraphs where at least one > 150 chars. Splits after intro.
61
+ - `processLongParagraphSequence` (`paragraphProcessor.ts`): First paragraph > 150 chars with multiple lines. Splits after first paragraph (unless followed by "question with options" pattern).
62
+
63
+ - **`runFormattingProcessors`**:
64
+ - `processMarkdownSection` (`paragraphProcessor.ts`): Uses `findMarkdownSection` (`sections.ts`) to detect `*Header*` or `_Header_` followed by content until next `\n\n` or end. Splits at section boundaries.
65
+ - `processSectionBreaks` (`breakProcessor.ts`): Splits at `\n\n` (double newline) if > 50 chars before the break. Does NOT split if: before ends with `?` and after has short intro + bullets, or before ends with "Puedes responder con:" + bullets, or after has question-with-options pattern. Also checks for markdown headers after break and long paragraphs before break.
66
+
67
+ - **`runQuestionAndPeriodProcessors`** (fallback):
68
+ - `processQuestionMarks` (`questionProcessor.ts`): Finds all valid `?` positions (excluding those inside bullet lines, parentheses, or before response options). If multiple questions are "contiguous" (no `.` between them, < 50 chars gap), groups them and splits after the last `?`. For single questions: long questions (> 100 chars) split directly; short questions combine with the next sentence if combined length <= 110 chars. Handles emoji-after-question by keeping emoji with the question chunk. Does NOT split if followed by lowercase (sentence continuation).
69
+ - `processPeriodSplits` (`periodProcessor.ts`): Only runs if remaining text > 100 chars. Builds protected ranges (URLs, domains, emails, numbers, abbreviations, bullet points, location abbreviations like `D.C.`) and finds valid `.` positions. Skips if after-period text has a short question (< 35 chars), or if last chunk was short (< 50 chars) + current text is short (< 150 chars) + after-period is short (< 150 chars).
70
+
71
+ **3. Post-processing**:
72
+ - `mergeSmallChunks` (`mergeProcessor.ts`): Chunks < 20 chars merge with the next chunk. Last small chunk merges backward with previous. Respects boundaries: does not merge if current ends with `:` and next is a list or long paragraph, or if next chunk ends with `:`. Does not merge if next starts with `¿`.
73
+ - `normalizeSpanishPunctuation` (`punctuationNormalization.ts`): For `¿` and `¡` marks: if mid-sentence (not at string start, not after `.`/`!`/`?`, not after line break), lowercases the following letter. This fixes LLM-generated text like `ayudarte ¿Cómo estás?` → `ayudarte ¿cómo estás?`.
74
+
75
+ ### Key Design Patterns
76
+
77
+ - **Processor chain:** Each processor returns `{ splitFound: boolean, newRemainingText: string }`. The main loop tries processors in priority order; the first match wins, and the remaining text is then re-evaluated from the top.
78
+ - **Protected ranges:** `periodProcessor.ts` builds protected ranges (URLs, emails, domains, numbers, abbreviations, bullet points) to prevent splitting inside them. Ranges are `{ start, end }` intervals. Position is protected if it falls within any range.
79
+ - **Position helpers:** `positionHelpers.ts` — `isPositionInsideParentheses` counts open parens before position; `isPositionInBulletLine` checks if position is on a line starting with `- ` or `• `.
80
+ - **Text helpers:** `textHelpers.ts` — `smartTrim` removes Unicode whitespace while preserving emojis; `hasTextContent` checks for alphanumeric (not just emojis/symbols); `startsWithEmoji` / `startsWithLowercase` / `findPositionAfterEmoji` for question splitting logic; `isParentheticalClarification` detects `(something)?` patterns.
81
+ - **Section detection:** `sections.ts` — `findMarkdownSection` detects `*Header*\n` or `_Header_\n` sections; `findListSection` detects numbered and bullet lists by walking lines with state machines (`NumberedListState`, `BulletListState`).
82
+ - **Constants centralized:** Thresholds in `constants.ts` — `MIN_CHUNK_SIZE` (20), `MAX_INTRO_LENGTH` (150), `MAX_QUESTION_WITH_OPTIONS_LENGTH` (250), `SHORT_INTRO_THRESHOLD` (50), `LONG_QUESTION_THRESHOLD` (100), `COMBINED_LENGTH_THRESHOLD` (110), `SHORT_QUESTION_FRAGMENT_THRESHOLD` (35), `MIN_CONTENT_BEFORE_BREAK` (50), `SHORT_CHUNK_THRESHOLD` (50), `CURRENT_TEXT_SHORT_THRESHOLD` (150), `AVG_ITEM_LENGTH_THRESHOLD` (70), `MAX_ITEMS_FOR_LONG_SPLIT` (3), `MAX_LIST_NUMBER` (20), `FIRST_NEWLINE_SEARCH_LIMIT` (100), `DOUBLE_NEWLINE_DISTANCE_THRESHOLD` (5). Also `splitConstants.ts` — `PERIOD_SPLIT_TEXT_THRESHOLD` (100). Several files also define local constants like `LONG_PARAGRAPH_THRESHOLD` (150).
83
+
84
+ ### File Map
85
+
86
+ ```
87
+ packages/core/src/
88
+ ├── index.ts # Public API re-export
89
+ ├── chatSplit/
90
+ │ ├── index.ts # Re-exports splitChatText
91
+ │ ├── splitChatText.ts # Main orchestrator: pre-process → processor loop → post-process
92
+ │ ├── splitProcessors.ts # Intro+list, question+list, intro+long-paragraphs processors
93
+ │ ├── productCardProcessor.ts # Product card detection (🛍️ emoji or *Title* markdown patterns)
94
+ │ ├── listProcessor.ts # Numbered/bullet list chunking (per-item if items are huge)
95
+ │ ├── paragraphProcessor.ts # Long paragraph sequences, markdown section detection
96
+ │ ├── breakProcessor.ts # Double newline (section break) splitting
97
+ │ ├── questionProcessor.ts # Question mark splitting with contiguous question grouping
98
+ │ ├── periodProcessor.ts # Period splitting with protected ranges (URLs, emails, numbers, etc.)
99
+ │ ├── mergeProcessor.ts # Post-processing: merge chunks < 20 chars with neighbors
100
+ │ ├── sections.ts # Markdown section and list section boundary detection (state machines)
101
+ │ ├── textHelpers.ts # smartTrim, hasTextContent, emoji detection, lowercase detection
102
+ │ ├── positionHelpers.ts # Parentheses depth counting, bullet line detection
103
+ │ ├── listNormalization.ts # Pre-processing: inline numbered list and product card normalization
104
+ │ ├── urlNormalization.ts # Pre-processing: remove periods after URLs
105
+ │ ├── punctuationNormalization.ts # Post-processing: Spanish ¿/¡ capitalization normalization
106
+ │ ├── constants.ts # All threshold constants (centralized)
107
+ │ └── splitConstants.ts # PERIOD_SPLIT_TEXT_THRESHOLD constant
108
+ └── __tests__/
109
+ └── splitChatText.*.test.ts # 40+ scenario-based tests with exact chunk matching (one file per topic)
110
+ ```
111
+
112
+ ### Shared Interface
113
+
114
+ All processors share the `SplitResult` interface (defined in `splitProcessors.ts`):
115
+ ```typescript
116
+ interface SplitResult {
117
+ splitFound: boolean;
118
+ newRemainingText: string;
119
+ }
120
+ ```
121
+
122
+ ## Code Style & Rules
123
+
124
+ - **ESLint config** (`eslint.config.mjs`): `eslint-config-love` + `typescript-eslint` recommended + strict custom rules:
125
+ - `max-lines-per-function: 40` (skip blanks/comments)
126
+ - `max-depth: 2`
127
+ - `max-lines: 300` per file
128
+ - `curly: multi-line`
129
+ - When hitting max-lines/max-lines-per-function, extract helper functions or split into separate files. Never compress statements onto single lines.
130
+ - Never use `any` type — always use explicit TypeScript types.
131
+ - Never disable ESLint rules (no eslint-disable comments or config modifications).
132
+ - **Prettier** (`.prettierrc`): single quotes, 110 print width, trailing commas (es5), 2-space indentation (spaces, not tabs), import sorting via `@trivago/prettier-plugin-sort-imports`.
133
+ - **Import order** (enforced by Prettier plugin): third-party modules → `@globalUtils/*` → `@src/*` → `@globalTypes/*` → relative imports (with blank line separation).
134
+ - **ESM throughout:** `"type": "module"` in all package.json files. Imports use `.js` extensions (TypeScript ESM convention).
135
+ - **Regex:** Uses the `v` flag (Unicode sets) consistently across all regex patterns.
136
+ - **Constants style:** Numeric constants are extracted as named `const` variables (`ZERO`, `NOT_FOUND`, `INDEX_OFFSET`, `INCREMENT`, etc.) throughout all files. New code should follow this pattern.
137
+
138
+ ## Testing
139
+
140
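+ Tests are in `packages/core/src/__tests__/`, split by topic across `splitChatText.*.test.ts` files (basic, coverageLists, coverageProcessors, coverageQuestions, dataProtection, dataTests1, dataTests2, edgeCases, punctuation, realWorld, urlProtection), alongside a `splitChatText.helpers.ts` file. The test suite has 40+ scenario-based tests organized in `describe` blocks: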
141
+ - Basic question splitting, contiguous questions, period splitting, smart question-period combination
142
+ - URL/link protection, number/price protection, email protection
143
+ - Edge cases (empty input, null/undefined, emojis, markdown formatting, abbreviations, version numbers)
144
+ - Real-world scenarios (22 numbered tests from actual WhatsApp conversations in Spanish)
145
+ - Spanish punctuation normalization (¿ and ¡ capitalization)
146
+ - Parentheses protection
147
+
148
+ Tests verify chunk boundaries exactly with `toEqual`. Some tests use structural assertions (`toContain`, `toBe(true/false)`) for URL/domain integrity checks. Jest is configured with `ts-jest` ESM preset and `--experimental-vm-modules`.
149
+
150
+ ## TypeScript Config
151
+
152
+ - Target: ES2024, Module: NodeNext, moduleResolution: NodeNext
153
+ - `strict: true`, `noUncheckedIndexedAccess: true`, `isolatedModules: true`
154
+ - Build uses `tsconfig.build.json` (extends `tsconfig.json`, excludes `__tests__/`) + `tsc-alias` for path alias resolution
155
+ - Path aliases configured in root `jest.config.js`: `@globalUtils/*`, `@src/*`, `@globalTypes/*` (though core package doesn't currently use path aliases)
package/README.md ADDED
@@ -0,0 +1,304 @@
1
+ <p align="center">
2
+ <h1 align="center">LLM Markdown WhatsApp</h1>
3
+ <p align="center">
4
+ A TypeScript library that splits LLM-generated markdown into WhatsApp-friendly chat message chunks.
5
+ </p>
6
+ </p>
7
+
8
+ <div align="center">
9
+
10
+ [![TypeScript](https://img.shields.io/badge/TypeScript-5.0+-3178C6?style=flat-square&logo=typescript&logoColor=white)](https://www.typescriptlang.org/)
11
+ [![Node.js](https://img.shields.io/badge/Node.js-18+-339933?style=flat-square&logo=node.js&logoColor=white)](https://nodejs.org/)
12
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow?style=flat-square)](https://opensource.org/licenses/MIT)
13
+
14
+ </div>
15
+
16
+ <p align="center">
17
+ <a href="#the-problem">Problem</a> •
18
+ <a href="#quickstart">Quickstart</a> •
19
+ <a href="#features">Features</a> •
20
+ <a href="#splitting-rules">Splitting Rules</a> •
21
+ <a href="#api-reference">API Reference</a>
22
+ </p>
23
+
24
+ ---
25
+
26
+ ## The Problem
27
+
28
+ LLMs generate long, structured markdown responses—paragraphs, numbered lists, product cards, nested bullet points. Sending these as a single WhatsApp message creates a wall of text that users won't read.
29
+
30
+ Naively splitting at character limits breaks mid-sentence, mid-list, or mid-URL. Splitting at every period creates fragmented messages that feel robotic. Neither approach understands the structure of the content.
31
+
32
+ Additionally:
33
+
34
+ - **URLs, emails, and numbers contain periods.** Splitting at `Nike.com.co` or `$1.000.000` or `juan.perez@gmail.com` produces broken fragments.
35
+ - **Lists should stay together.** A numbered list of products or a bullet list of options is a single logical unit—splitting inside an item destroys readability.
36
+ - **Questions need context.** A short trailing question like "¿Te interesa?" should stay attached to the preceding sentence, not become its own tiny message.
37
+
38
+ This library handles all of this. One function call, zero configuration. Pass in the LLM's markdown output, get back an array of WhatsApp-ready message chunks.
39
+
40
+ ## How It Works
41
+
42
+ The library takes a markdown string and splits it into an array of smaller chunks optimized for chat readability. It applies a priority-ordered chain of processors:
43
+
44
+ 1. Pre-processes text (normalizes inline lists, removes periods after URLs)
45
+ 2. Tries structural splits first (intro + list, product cards, markdown sections, double newlines)
46
+ 3. Falls back to semantic splits (question marks, periods) with intelligent protection
47
+ 4. Merges chunks that are too small (<20 chars) with their neighbors
48
+ 5. Normalizes Spanish punctuation (¿/¡ capitalization rules)
49
+
50
+ ## Features
51
+
52
+ | Feature | Description |
53
+ | ----------------------------- | ------------------------------------------------------------------------------ |
54
+ | **Smart Question Splitting** | Splits at question marks while keeping contiguous questions together |
55
+ | **List Preservation** | Keeps numbered and bullet lists intact, splits only when items are very long |
56
+ | **Product Card Detection** | Recognizes product card patterns (with emojis or markdown) and splits per card |
57
+ | **URL/Email/Number Safety** | Never splits inside URLs, emails, domain names, or formatted numbers |
58
+ | **Parentheses Protection** | Avoids splitting inside parenthetical expressions |
59
+ | **Abbreviation Awareness** | Protects periods in `etc.`, `Dr.`, `D.C.`, `S.A.`, version numbers |
60
+ | **Spanish Punctuation** | Normalizes capitalization after mid-sentence ¿ and ¡ marks |
61
+ | **Small Chunk Merging** | Prevents tiny fragments by merging small chunks with adjacent ones |
62
+ | **Markdown Section Support** | Splits at markdown headers (`*Title*` or `_Title_`) as natural boundaries |
63
+ | **Zero Configuration** | Single function, no setup required—just pass text, get chunks |
64
+
65
+ ## Quickstart
66
+
67
+ ```bash
68
+ npm install @llm-markdown-whatsapp/core
69
+ ```
70
+
71
+ ### Basic Usage
72
+
73
+ ```typescript
74
+ import { splitChatText } from '@llm-markdown-whatsapp/core';
75
+
76
+ const llmResponse = 'Thanks for reaching out. I understand your situation and I want to help you resolve it in the best way possible. You can send your product back at no extra cost. Would you prefer a full refund or an exchange for a different model?';
77
+
78
+ const chunks = splitChatText(llmResponse);
79
+ console.log(chunks);
80
+ // [
81
+ // 'Thanks for reaching out.',
82
+ // 'I understand your situation and I want to help you resolve it in the best way possible.',
83
+ // 'You can send your product back at no extra cost.',
84
+ // 'Would you prefer a full refund or an exchange for a different model?',
85
+ // ]
86
+ ```
87
+
88
+ ### Lists Stay Together
89
+
90
+ ```typescript
91
+ const llmResponse = `I found these options:
92
+
93
+ - Nike Pegasus Plus – High-performance running shoes for marathons and daily runs, featuring ZoomX Foam cushioning and a Flyknit upper that adapts to your foot. Available in black and a multicolor combination.
94
+ - Nike Air Max 90 – Classic model with a waffle sole and the iconic visible Air cushioning, in neutral tones like light bone/olive/university grey.
95
+ Which of these models interests you the most? 😊`;
96
+
97
+ const chunks = splitChatText(llmResponse);
98
+ // [
99
+ // 'I found these options:',
100
+ // '- Nike Pegasus Plus – High-performance running shoes for marathons...',
101
+ // '- Nike Air Max 90 – Classic model with a waffle sole...',
102
+ // 'Which of these models interests you the most? 😊',
103
+ // ]
104
+ ```
105
+
106
+ ### Product Cards Split Per Card
107
+
108
+ ```typescript
109
+ const llmResponse = `I found these options:
110
+
111
+ 1. 🛍️ Pegasus Plus Shoes: 💵 $1.015.000
112
+ 📏 Color: Black, Glacier Blue/Mint Foam/Impact Green/Black.
113
+ 📏 Shoe Size: 43, 41, 38.
114
+ ✅ Ultra-lightweight, with ZoomX cushioning and great breathability.
115
+
116
+ 2. 🛍️ ISPA Sense Shoes: 💵 $804.900
117
+ 📏 Shoe Size: 38, 39, 40, 41, 42, 43.
118
+ ✅ Casual style with great comfort for daily use.
119
+
120
+ Which of these products do you like?`;
121
+
122
+ const chunks = splitChatText(llmResponse);
123
+ // [
124
+ // 'I found these options:',
125
+ // '🛍️ Pegasus Plus Shoes: 💵 $1.015.000\n📏 Color: ...\n✅ Ultra-lightweight...',
126
+ // '🛍️ ISPA Sense Shoes: 💵 $804.900\n📏 Shoe Size: ...\n✅ Casual style...',
127
+ // 'Which of these products do you like?',
128
+ // ]
129
+ ```
130
+
131
+ ---
132
+
133
+ ## Splitting Rules
134
+
135
+ The library applies processors in priority order. The first processor that finds a valid split point wins, and the remaining text is re-evaluated from the top.
136
+
137
+ ### Structural Splits (highest priority)
138
+
139
+ | Pattern | Behavior |
140
+ | -------------------------- | ------------------------------------------------------------------------------------ |
141
+ | **Intro + List** | Text ending with `:` followed by a numbered/bullet list splits after the intro |
142
+ | **Question + Numbered List** | Short question followed by numbered options stays together as one chunk |
143
+ | **Product Cards** | Numbered items with `🛍️` or `*Title*` formatting split into one chunk per card |
144
+ | **List Sections** | Numbered/bullet lists kept as one chunk; split per-item only when items are >150 chars |
145
+ | **Markdown Sections** | `*Header*` or `_Header_` with content splits at section boundaries |
146
+ | **Section Breaks** | Double newlines (`\n\n`) act as natural split points |
147
+
148
+ ### Semantic Splits (fallback)
149
+
150
+ | Pattern | Behavior |
151
+ | -------------------------- | ------------------------------------------------------------------------------------ |
152
+ | **Question Marks** | Splits after `?` unless followed by lowercase (sentence continuation) or emoji |
153
+ | **Contiguous Questions** | Multiple questions without periods between them stay together |
154
+ | **Period Splits** | Splits at `.` for text >100 chars, skipping protected positions |
155
+
156
+ ### Protected Content (never split inside)
157
+
158
+ | Content | Examples |
159
+ | -------------------------- | ------------------------------------------------------------------------------------ |
160
+ | **URLs** | `https://example.com/path`, `www.site.com` |
161
+ | **Plain Domains** | `Nike.com.co`, `shop.example.co.uk` |
162
+ | **Emails** | `juan.perez@gmail.com` |
163
+ | **Formatted Numbers** | `$1.000.000`, `2.5.1`, `15.5` |
164
+ | **Abbreviations** | `etc.`, `Dr.`, `D.C.`, `S.A.`, `E.U.A.` |
165
+ | **Parenthetical Expressions** | `(calle, número, referencia, etc.)` |
166
+ | **Bullet Point Content** | Content within `- item` or `• item` lines |
167
+
168
+ ### Post-processing
169
+
170
+ - **Small Chunk Merging:** Chunks under 20 characters are merged into the next chunk (or into the previous chunk when the small chunk is the last one).
171
+ - **Spanish Punctuation:** After mid-sentence `¿` or `¡` (not at start or after `.`/`!`/`?`), the following letter is lowercased. Example: `ayudarte ¿Cómo estás?` becomes `ayudarte ¿cómo estás?`.
172
+
173
+ ---
174
+
175
+ ## API Reference
176
+
177
+ ### `splitChatText(text)`
178
+
179
+ ```typescript
180
+ function splitChatText(text: string | null | undefined): string[]
181
+ ```
182
+
183
+ Splits a markdown text string into an array of chat-ready chunks.
184
+
185
+ - **Input:** A string of markdown text (typically an LLM response). Accepts `null` or `undefined` safely.
186
+ - **Output:** An array of strings, each suitable for sending as an individual WhatsApp message.
187
+ - Returns `[]` for `null`, `undefined`, or empty string.
188
+
189
+ ```typescript
190
+ import { splitChatText } from '@daviddh/llm-markdown-whatsapp';
191
+
192
+ const chunks = splitChatText(llmMarkdownText);
193
+ ```
194
+
195
+ ## Project Structure
196
+
197
+ ```
198
+ llm-markdown-whatsapp/
199
+ ├── packages/
200
+ │ └── core/ # Core splitting library
201
+ │ └── src/
202
+ │ ├── index.ts # Public API — exports splitChatText
203
+ │ └── chatSplit/
204
+ │ ├── splitChatText.ts # Main orchestrator
205
+ │ ├── splitProcessors.ts # Intro + list processors
206
+ │ ├── productCardProcessor.ts # Product card detection and splitting
207
+ │ ├── listProcessor.ts # Numbered/bullet list processing
208
+ │ ├── paragraphProcessor.ts # Long paragraph and markdown sections
209
+ │ ├── breakProcessor.ts # Double newline section breaks
210
+ │ ├── questionProcessor.ts # Question mark splitting logic
211
+ │ ├── periodProcessor.ts # Period splitting with protected ranges
212
+ │ ├── mergeProcessor.ts # Small chunk merging
213
+ │ ├── sections.ts # Markdown/list section detection
214
+ │ ├── textHelpers.ts # Smart trim, emoji detection, text utilities
215
+ │ ├── positionHelpers.ts # Parentheses/bullet position checks
216
+ │ ├── listNormalization.ts # Inline list normalization
217
+ │ ├── urlNormalization.ts # URL period removal
218
+ │ ├── punctuationNormalization.ts # Spanish ¿/¡ capitalization
219
+ │ ├── constants.ts # Threshold constants
220
+ │ └── splitConstants.ts # Split-specific constants
221
+ └── README.md
222
+ ```
223
+
224
+
225
+
226
+ ## Architecture
227
+
228
+ ```mermaid
229
+ flowchart TB
230
+ subgraph Input["Input"]
231
+ T["LLM markdown text"]
232
+ end
233
+
234
+ Input --> Pre
235
+
236
+ subgraph Pre["Pre-processing"]
237
+ direction LR
238
+ A["Normalize<br/>inline lists"]
239
+ B["Normalize<br/>product cards"]
240
+ C["Remove periods<br/>after URLs"]
241
+ end
242
+
243
+ Pre --> Processors
244
+
245
+ subgraph Processors["Processor Chain (priority order)"]
246
+ direction TB
247
+ P1["Intro + List<br/>Question + List<br/>Intro + Long Paragraphs"]
248
+ P2["Product Cards<br/>List Sections<br/>Long Paragraphs"]
249
+ P3["Markdown Sections<br/>Section Breaks (double newlines)"]
250
+ P4["Question Marks<br/>Period Splits"]
251
+ end
252
+
253
+ Processors --> Post
254
+
255
+ subgraph Post["Post-processing"]
256
+ direction LR
257
+ D["Merge small<br/>chunks"]
258
+ E["Normalize Spanish<br/>punctuation"]
259
+ end
260
+
261
+ Post --> Output
262
+
263
+ subgraph Output["Output"]
264
+ O["string[ ] — array of chat-ready chunks"]
265
+ end
266
+ ```
267
+
268
+ ---
269
+
270
+ ## Contributing
271
+
272
+ Contributions are welcome! Please:
273
+
274
+ 1. Fork the repository
275
+ 2. Create a feature branch (`git checkout -b feature/amazing-feature`)
276
+ 3. Write tests for your changes
277
+ 4. Ensure all tests pass (`npm test`)
278
+ 5. Ensure types check (`npm run typecheck`)
279
+ 6. Commit with a clear message
280
+ 7. Open a Pull Request
281
+
282
+ ## Development
283
+
284
+ ```bash
285
+ git clone https://github.com/daviddominguezh/llm-markdown-whatsapp.git
286
+ cd llm-markdown-whatsapp
287
+ npm install
288
+
289
+ npm run build # Build all packages
290
+ npm test # Run tests
291
+ npm run typecheck # Type check
292
+ npm run lint # Lint
293
+ npm run check # Format + lint + typecheck
294
+ ```
295
+
296
+ ## License
297
+
298
+ MIT License - see [LICENSE](LICENSE) for details.
299
+
300
+ ---
301
+
302
+ <p align="center">
303
+ Built with TypeScript • Zero Dependencies • WhatsApp-Optimized Chat Splitting
304
+ </p>
@@ -0,0 +1,28 @@
1
+ import js from '@eslint/js';
2
+ import love from 'eslint-config-love';
3
+ import { defineConfig } from 'eslint/config';
4
+ import globals from 'globals';
5
+ import tseslint from 'typescript-eslint';
6
+
7
+ export default defineConfig([
8
+ {
9
+ ignores: ['coverage/**', 'dist/**', '**/dist/**', 'node_modules/**', '**/node_modules/**', '*.config.js', '*.config.ts', 'examples/**'],
10
+ },
11
+ { files: ['**/*.{js,mjs,cjs,ts,mts,cts}'], plugins: { js }, extends: ['js/recommended'] },
12
+ { files: ['**/*.{js,mjs,cjs,ts,mts,cts}'], languageOptions: { globals: globals.node } },
13
+ {
14
+ ...love,
15
+ files: ['**/*.{ts,mts,cts}'],
16
+ },
17
+ tseslint.configs.recommended,
18
+ {
19
+ files: ['**/*.{js,mjs,cjs,ts,mts,cts}'],
20
+ rules: {
21
+ // Our custom rules (preserved)
22
+ 'max-lines-per-function': ['error', { max: 40, skipBlankLines: true, skipComments: true }],
23
+ 'max-depth': ['error', { max: 2 }],
24
+ 'max-lines': ['error', { max: 300, skipBlankLines: false, skipComments: true }],
25
+ curly: ['error', 'multi-line'],
26
+ },
27
+ },
28
+ ]);
package/jest.config.js ADDED
@@ -0,0 +1,40 @@
1
+ const config = {
2
+ clearMocks: true,
3
+ collectCoverage: true,
4
+ testTimeout: 120000,
5
+ coverageDirectory: 'coverage',
6
+ coverageProvider: 'v8',
7
+ extensionsToTreatAsEsm: ['.ts'],
8
+ moduleNameMapper: {
9
+ '^@globalTypes/(.*)\\.js$': '<rootDir>/src/types/$1',
10
+ '^@globalUtils/(.*)\\.js$': '<rootDir>/src/utils/$1',
11
+ '^@src/(.*)\\.js$': '<rootDir>/src/$1',
12
+ '^@globalTypes/(.*)$': '<rootDir>/src/types/$1',
13
+ '^@globalUtils/(.*)$': '<rootDir>/src/utils/$1',
14
+ '^@src/(.*)$': '<rootDir>/src/$1',
15
+ '^(\\.{1,2}/.*)\\.js$': '$1',
16
+ },
17
+ preset: 'ts-jest/presets/default-esm',
18
+ testEnvironment: 'node',
19
+ testMatch: ['**/__tests__/**/*.test.ts', '**/__tests__/**/*.spec.ts'],
20
+ transform: {
21
+ '^.+\\.tsx?$': [
22
+ 'ts-jest',
23
+ {
24
+ useESM: true,
25
+ tsconfig: {
26
+ module: 'NodeNext',
27
+ moduleResolution: 'nodenext',
28
+ target: 'ES2024',
29
+ allowSyntheticDefaultImports: true,
30
+ esModuleInterop: true,
31
+ allowImportingTsExtensions: true,
32
+ isolatedModules: true,
33
+ },
34
+ },
35
+ ],
36
+ },
37
+ transformIgnorePatterns: ['/node_modules/', '\\.pnp\\.[^\\/]+$'],
38
+ };
39
+
40
+ export default config;
package/package.json ADDED
@@ -0,0 +1,61 @@
1
+ {
2
+ "name": "@daviddh/llm-markdown-whatsapp",
3
+ "version": "0.0.1",
4
+ "private": false,
5
+ "description": "Transforms Markdown into WhatsApp text format monorepo",
6
+ "keywords": [
7
+ "llm",
8
+ "whatsapp",
9
+ "markdown",
10
+ "formatter",
11
+ "styling",
12
+ "utils",
13
+ "strs",
14
+ "strings",
15
+ "converter",
16
+ "transformer"
17
+ ],
18
+ "homepage": "https://github.com/daviddominguezh/llm-markdown-whatsapp#readme",
19
+ "bugs": {
20
+ "url": "https://github.com/daviddominguezh/llm-markdown-whatsapp/issues"
21
+ },
22
+ "repository": {
23
+ "type": "git",
24
+ "url": "git+https://github.com/daviddominguezh/llm-markdown-whatsapp.git"
25
+ },
26
+ "license": "MIT",
27
+ "author": "David Dominguez",
28
+ "type": "module",
29
+ "main": "packages/core/dist/index.js",
30
+ "workspaces": [
31
+ "packages/*"
32
+ ],
33
+ "scripts": {
34
+ "typecheck": "tsc -b",
35
+ "lint": "eslint .",
36
+ "format": "prettier --write \"**/*.{js,ts,json}\"",
37
+ "check": "npm run format && npm run lint && npm run typecheck",
38
+ "build": "npm run build --workspaces",
39
+ "build:core": "npm run build -w @llm-markdown-whatsapp/core",
40
+ "test": "npm run test --workspaces --if-present",
41
+ "test:core": "npm run test -w @llm-markdown-whatsapp/core"
42
+ },
43
+ "devDependencies": {
44
+ "@eslint/js": "^9.28.0",
45
+ "@trivago/prettier-plugin-sort-imports": "^5.2.2",
46
+ "@types/jest": "^30.0.0",
47
+ "eslint": "^9.28.0",
48
+ "eslint-config-love": "^144.0.0",
49
+ "globals": "^16.2.0",
50
+ "jest": "^30.0.5",
51
+ "prettier": "^3.8.1",
52
+ "ts-jest": "^29.4.4",
53
+ "tsc-alias": "^1.8.10",
54
+ "tsx": "^4.19.4",
55
+ "typescript": "^5.8.3",
56
+ "typescript-eslint": "^8.33.1"
57
+ },
58
+ "engines": {
59
+ "node": ">=18.0.0"
60
+ }
61
+ }
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=splitChatText.basic.test.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"splitChatText.basic.test.d.ts","sourceRoot":"","sources":["../../src/__tests__/splitChatText.basic.test.ts"],"names":[],"mappings":""}