nx-json-parser 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,340 @@
1
+ /**
2
+ * Remark-based Markdown Parser
3
+ * Replaces regex-based parsing with AST-based parsing using unified/remark
4
+ */
5
+ import { unified } from 'unified';
6
+ import remarkParse from 'remark-parse';
7
+ import remarkGfm from 'remark-gfm';
8
+ import { visit } from 'unist-util-visit';
9
+ import { toString } from 'mdast-util-to-string';
10
+ import { toCamelCase } from 'nx-helpers';
11
+
12
/**
 * One section extracted from a markdown document.
 */
export interface MarkdownSection {
  /** Plain text of the heading (or of the bullet that acts as a heading). */
  heading: string;

  /** Section body, serialised as a string. */
  content: string;

  /** Heading depth for real headings; bullet-derived sections use 1. */
  level: number;
}
17
+
18
/**
 * How top-level bullet lists are interpreted by the parser.
 */
export enum BulletMode {
  /** Bullets are plain list items. */
  ARRAY = 'array',

  /** Each bullet starts a section of its own. */
  SECTIONS = 'sections',

  /** The parser inspects the document and picks one of the modes above. */
  AUTO = 'auto',
}
23
+
24
/**
 * Construction options for the remark-based parser. Every field is optional.
 */
export interface RemarkParserOptions {
  /** Forces a bullet interpretation, or lets the parser decide. */
  bulletMode?: BulletMode;

  /** Words that, when found in a bullet, hint that the bullet is a section title. */
  sectionKeywords?: string[];

  /** When true, the parser logs its mode decision to the console. */
  debug?: boolean;
}
29
+
30
/**
 * Markdown parser backed by a unified/remark pipeline (remark-parse + remark-gfm).
 *
 * - `parseSections` splits a document into `MarkdownSection`s, either by
 *   headings or by treating top-level bullets as section titles.
 * - `parseContent` converts one section body into an array of row objects
 *   (table), an array of strings (list) or plain text.
 * - `sectionsToObject` combines both into an object keyed by camel-cased heading.
 */
export class RemarkMarkdownParser {
  // Left as `any`: the processor and AST types come from the remark packages
  // and are not re-declared in this file.
  private readonly processor: any;
  private readonly options: Required<RemarkParserOptions>;

  /**
   * @param options Optional overrides. Missing (or falsy) values fall back to
   *   `BulletMode.AUTO`, the built-in keyword list and `debug: false`.
   */
  constructor(options: RemarkParserOptions = {}) {
    this.processor = unified().use(remarkParse).use(remarkGfm);

    // Bullet text is lower-cased before keyword matching, so the keywords are
    // normalised the same way. Empty keywords are dropped because
    // `includes('')` is true for every string and would flag every bullet.
    const sectionKeywords = (options.sectionKeywords || this.getDefaultKeywords())
      .map((keyword) => keyword.toLowerCase())
      .filter((keyword) => keyword.length > 0);

    this.options = {
      bulletMode: options.bulletMode || BulletMode.AUTO,
      sectionKeywords,
      debug: options.debug || false
    };
  }

  /**
   * Parse markdown into sections.
   *
   * With `BulletMode.AUTO` the mode is chosen per document by
   * `detectBulletMode`; otherwise the configured mode is used. Only
   * `BulletMode.SECTIONS` turns bullets into sections, every other mode
   * splits on headings.
   *
   * @param markdown Markdown source text.
   * @returns Sections in document order.
   */
  parseSections(markdown: string): MarkdownSection[] {
    const tree = this.processor.parse(markdown);

    const bulletMode =
      this.options.bulletMode === BulletMode.AUTO
        ? this.detectBulletMode(tree)
        : this.options.bulletMode;

    if (this.options.debug) {
      console.log(`📋 Bullet mode: ${bulletMode}`);
    }

    return bulletMode === BulletMode.SECTIONS
      ? this.parseSectionsMode(tree, markdown)
      : this.parseHeadingMode(tree);
  }

  /**
   * Parse a section body (tables, lists, text).
   *
   * The first table or list found among the top-level nodes wins and is
   * returned on its own; any text around it is discarded. Without a table or
   * list the text of all nodes is joined with blank lines.
   *
   * @param content Markdown fragment.
   * @returns `''` for blank input, an array of row objects for a table, an
   *   array of strings for a list, otherwise a string.
   */
  parseContent(content: string): any {
    const trimmed = content.trim();
    if (!trimmed) return '';

    const tree = this.processor.parse(trimmed);
    const children: any[] = tree.children;

    const structured = children.find(
      (child: any) => child.type === 'table' || child.type === 'list'
    );
    if (structured) {
      return structured.type === 'table'
        ? this.parseTable(structured)
        : this.parseList(structured);
    }

    // A lone paragraph is returned exactly as its text.
    if (children.length === 1 && children[0].type === 'paragraph') {
      return toString(children[0]);
    }

    // Fall back to the raw input when the nodes carry no text at all.
    const text = children.map((child: any) => toString(child)).join('\n\n').trim();
    return text || trimmed;
  }

  /**
   * Convert sections to an object keyed by the camel-cased heading.
   *
   * Sections whose headings map to the same key overwrite each other; the
   * last one wins.
   *
   * @param sections Sections as returned by `parseSections`.
   * @returns One entry per distinct key, holding `parseContent(section.content)`.
   */
  sectionsToObject(sections: MarkdownSection[]): Record<string, any> {
    const result: Record<string, any> = {};

    for (const section of sections) {
      result[toCamelCase(section.heading)] = this.parseContent(section.content);
    }

    return result;
  }

  // ========================================================================
  // PRIVATE METHODS - Mode Detection
  // ========================================================================

  /**
   * Score the root-level lists of a document and decide whether their
   * bullets look like section titles or like plain array items.
   *
   * @param tree Root node produced by the processor.
   * @returns `SECTIONS` only when the section score strictly beats the array
   *   score; `ARRAY` otherwise, including when there are no bullets.
   */
  private detectBulletMode(tree: any): BulletMode {
    let bulletCount = 0;
    let bulletsWithContent = 0;
    let bulletsWithColons = 0;
    let bulletsWithNestedLists = 0;
    let bulletsWithKeywords = 0;
    let totalLength = 0;

    visit(tree, 'list', (listNode: any, _index?: number, parent?: any) => {
      // Only analyze root-level lists
      if (!parent || parent.type !== 'root') return;

      for (const listItem of listNode.children) {
        bulletCount++;

        const firstChild = listItem.children[0];
        const text = firstChild ? toString(firstChild) : '';
        totalLength += text.length;

        if (text.includes(':')) bulletsWithColons++;
        if (listItem.children.length > 1) bulletsWithContent++;
        if (listItem.children.some((child: any) => child.type === 'list')) {
          bulletsWithNestedLists++;
        }

        const lowerText = text.toLowerCase();
        if (this.options.sectionKeywords.some((keyword) => lowerText.includes(keyword))) {
          bulletsWithKeywords++;
        }
      }
    });

    if (bulletCount === 0) return BulletMode.ARRAY;

    const avgLength = totalLength / bulletCount;

    // Each indicator contributes a fixed weight as soon as one bullet shows it.
    let sectionScore = 0;
    if (bulletsWithColons > 0) sectionScore += 3;
    if (bulletsWithContent > 0) sectionScore += 3;
    if (bulletsWithNestedLists > 0) sectionScore += 2;
    if (bulletsWithKeywords > 0) sectionScore += 2;
    if (avgLength > 30) sectionScore += 1;

    let arrayScore = 0;
    if (bulletsWithContent === 0 && bulletsWithColons === 0) arrayScore += 4;
    if (avgLength < 30) arrayScore += 2;
    if (bulletCount >= 3 && bulletsWithContent === 0) arrayScore += 2;

    if (this.options.debug) {
      console.log(`🔍 Detection scores - Sections: ${sectionScore}, Array: ${arrayScore}`);
    }

    return sectionScore > arrayScore ? BulletMode.SECTIONS : BulletMode.ARRAY;
  }

  // ========================================================================
  // PRIVATE METHODS - Parsing Modes
  // ========================================================================

  /**
   * Split the document on headings. Every heading opens a section and the
   * nodes that follow are appended to it. Nodes before the first heading are
   * dropped.
   *
   * @param tree Root node produced by the processor.
   */
  private parseHeadingMode(tree: any): MarkdownSection[] {
    const sections: MarkdownSection[] = [];
    let pendingParts: string[] = [];

    // Writes the collected parts into the section currently being built.
    const flush = (): void => {
      const current = sections[sections.length - 1];
      if (current) {
        current.content = pendingParts.join('\n\n').trim();
      }
    };

    for (const node of tree.children) {
      if (node.type === 'heading') {
        flush();
        sections.push({ heading: toString(node), content: '', level: node.depth });
        pendingParts = [];
      } else if (sections.length > 0) {
        pendingParts.push(this.nodeToString(node));
      }
    }

    flush();
    return sections;
  }

  /**
   * Split the document treating every top-level bullet as a section.
   * Headings still open sections of their own, and any other node is appended
   * to whichever section came last (heading or bullet). Nodes that appear
   * before any section exists are dropped.
   *
   * @param tree Root node produced by the processor.
   * @param markdown Original source text; currently unused, kept so the call
   *   signature stays stable.
   */
  private parseSectionsMode(tree: any, markdown: string): MarkdownSection[] {
    const sections: MarkdownSection[] = [];

    for (const node of tree.children) {
      if (node.type === 'heading') {
        // Content is filled in by the nodes that follow.
        sections.push({ heading: toString(node), content: '', level: node.depth });
        continue;
      }

      if (node.type === 'list') {
        for (const listItem of node.children) {
          const section = this.listItemToSection(listItem);
          if (section) {
            sections.push(section);
          }
        }
        continue;
      }

      const lastSection = sections[sections.length - 1];
      if (lastSection) {
        const nodeContent = this.nodeToString(node);
        lastSection.content = lastSection.content
          ? `${lastSection.content}\n\n${nodeContent}`
          : nodeContent;
      }
    }

    return sections;
  }

  /**
   * Turn one list item into a level-1 section.
   *
   * The heading is the first line of the item's first child with one
   * trailing colon removed. When that child is a paragraph spanning several
   * lines (a bullet title followed by indented continuation text), the
   * remaining lines become the start of the content instead of being glued
   * onto the heading. All further children of the item are appended to the
   * content.
   *
   * @param listItem A `listItem` node.
   * @returns The section, or `null` for an item without children.
   */
  private listItemToSection(listItem: any): MarkdownSection | null {
    if (!listItem.children || listItem.children.length === 0) {
      return null;
    }

    const [firstChild, ...contentNodes] = listItem.children;
    const firstText: string = toString(firstChild);

    // Soft line breaks inside a paragraph show up as '\n' in its text.
    const breakAt = firstChild.type === 'paragraph' ? firstText.indexOf('\n') : -1;
    const headingLine = breakAt === -1 ? firstText : firstText.slice(0, breakAt).trimEnd();
    const continuation = breakAt === -1 ? '' : firstText.slice(breakAt + 1).trim();

    const heading = headingLine.replace(/:$/, ''); // Remove trailing colon

    const parts: string[] = contentNodes.map((node: any) => this.nodeToString(node));
    if (continuation) {
      parts.unshift(continuation);
    }

    return {
      heading,
      content: parts.join('\n\n').trim(),
      level: 1
    };
  }

  // ========================================================================
  // PRIVATE METHODS - Content Parsing
  // ========================================================================

  /**
   * Convert a table node into one object per data row.
   *
   * Keys are the camel-cased header cells; a cell without a usable header is
   * stored under `column<index>`. Values are the trimmed cell text.
   *
   * @param tableNode A `table` node.
   * @returns Row objects, or `[]` when the table has no rows at all.
   */
  private parseTable(tableNode: any): any[] {
    const rows: any[] = tableNode.children;
    if (rows.length === 0) return [];

    // First row = headers
    const [headerRow, ...dataRows] = rows;
    const headers: string[] = headerRow.children.map((cell: any) =>
      toCamelCase(toString(cell).trim())
    );

    return dataRows.map((row: any) => {
      const record: Record<string, string> = {};
      row.children.forEach((cell: any, columnIndex: number) => {
        const key = headers[columnIndex] || `column${columnIndex}`;
        record[key] = toString(cell).trim();
      });
      return record;
    });
  }

  /**
   * Convert a list node into the trimmed text of each item's first child.
   * Anything after the first child (nested lists, extra paragraphs) is ignored.
   *
   * @param listNode A `list` node.
   */
  private parseList(listNode: any): string[] {
    return listNode.children.map((item: any) => {
      const firstChild = item.children[0];
      return firstChild ? toString(firstChild).trim() : '';
    });
  }

  /**
   * Serialise a block node for storage in `MarkdownSection.content`.
   *
   * Tables become the JSON of their row objects, lists become `- item`
   * lines (ordered lists included), everything else becomes its plain text.
   */
  private nodeToString(node: any): string {
    switch (node.type) {
      case 'table':
        // NOTE(review): this JSON text is later re-read as markdown by
        // parseContent, so it comes back as a string, not as row objects.
        return JSON.stringify(this.parseTable(node));
      case 'list':
        return this.parseList(node)
          .map((item) => `- ${item}`)
          .join('\n');
      default:
        return toString(node);
    }
  }

  /**
   * Built-in, lower-case keywords that hint a bullet is a section title.
   */
  private getDefaultKeywords(): string[] {
    return [
      'answer', 'summary', 'introduction', 'conclusion', 'overview',
      'assumptions', 'unknowns', 'evidence', 'notes', 'details',
      'description', 'background', 'analysis', 'findings', 'recommendations',
      'data', 'identity', 'network', 'security', 'monitoring', 'governance',
      'availability', 'backup', 'patch', 'operational', 'provider'
    ];
  }
}
package/src/types.ts CHANGED
@@ -1,13 +1,29 @@
1
+ export enum BulletMode {
2
+ ARRAY = 'array',
3
+ SECTIONS = 'sections',
4
+ AUTO = 'auto'
5
+ }
6
+
1
7
  export interface MarkdownSection {
2
8
  heading: string;
3
- content: any; // Can be string, array of objects (for tables), etc.
9
+ content: any;
4
10
  level: number;
5
- format: 'heading' | 'list' | 'table' | 'text';
11
+ format?: 'heading' | 'bullet' | 'text';
6
12
  }
7
13
 
8
- export type ParseResult = Record<string, any>;
14
+ export interface ParserOptions {
15
+ bulletMode?: BulletMode;
16
+ debug?: boolean;
17
+ }
9
18
 
10
- export interface OutputFormatSpec {
11
- // Define if the user wants to enforce a specific schema later
12
- [key: string]: any;
19
+ export interface BulletModeResult {
20
+ mode: 'array' | 'sections';
21
+ confidence: number;
22
+ reasons: string[];
13
23
  }
24
+
25
+ // Parser output: an optional `sections` list plus arbitrary additional keys.
26
+ export interface ParseResult {
27
+ sections?: MarkdownSection[];
28
+ [key: string]: any; // Allow any string keys with any values
29
+ }
@@ -0,0 +1,99 @@
1
+ import { markdownToJson } from '../src/index.js';
2
+ import { RemarkParser } from '../src/parser.js';
3
+ import { BulletMode } from '../src/types.js';
4
+
5
+ describe('Bullet Mode Detection', () => {
6
+
7
+ it('should auto-detect simple arrays', () => {
8
+ const md = `
9
+ - Item 1
10
+ - Item 2
11
+ - Item 3
12
+ `;
13
+ const result = markdownToJson(md);
14
+ // The markdown has no heading, so the list is expected to surface under a
15
+ // single top-level key rather than as separate sections.
16
+ // In array mode the list value is a string[] of the bullet texts.
17
+
18
+ // NOTE(review): the key asserted below is 'root'. Presumably heading-less
19
+ // content is filed under a 'Root' section and markdownToJson camel-cases
20
+ // that heading into the key; the transformer is not shown here, so
21
+ // confirm against it before relying on this description.
22
+
23
+ expect(result.root).toEqual(['Item 1', 'Item 2', 'Item 3']);
24
+ });
25
+
26
+ it('should auto-detect sections', () => {
27
+ const md = `
28
+ - Short Answer
29
+ The sky is blue.
30
+
31
+ - Evidence
32
+ 1. Look up.
33
+ 2. See blue.
34
+ `;
35
+ const result = markdownToJson(md);
36
+ // Each top-level bullet should become its own key on the result.
37
+ expect(result.shortAnswer).toBe('The sky is blue.');
38
+ // The Evidence bullet carries a nested ordered list ("1. Look up.").
39
+ // In sections mode that nested list is the content of the bullet.
40
+ // NOTE(review): the assertions below expect a bullet whose only content is
41
+ // a nested list to be converted to an array. Mixed text-plus-list content
42
+ // may be shaped differently (presumably an object) - confirm in the parser.
43
+ // Ordered and unordered lists are both 'list' nodes in the markdown AST.
44
+
45
+ // The nested ordered list is therefore expected to be picked up by the
46
+ // same 'list' type check that handles unordered lists.
47
+
48
+ expect(result.evidence).toBeDefined();
49
+ expect(Array.isArray(result.evidence)).toBe(true);
50
+ expect(result.evidence[0]).toContain('Look up');
51
+ });
52
+
53
+ it('should respect manual override to ARRAY', () => {
54
+ // Bullets that merely look like section titles; not used by the assertions below.
55
+ const md = `
56
+ - Short Answer
57
+ - Evidence
58
+ `;
59
+ // Without content under the bullets, auto-detection might pick array mode
60
+ // anyway, so the input actually parsed is one that looks like sections:
61
+ const mdSec = `
62
+ - Section A:
63
+ Content A
64
+ - Section B:
65
+ Content B
66
+ `;
67
+
68
+ const parser = new RemarkParser({ bulletMode: BulletMode.ARRAY });
69
+ // The override is passed to the RemarkParser constructor and parse() is
70
+ // called directly, bypassing markdownToJson.
71
+
72
+ // NOTE(review): the original note says markdownToJson does not expose options - confirm.
73
+ const sections = parser.parse(mdSec);
74
+
75
+ // Forced ARRAY mode: the whole list should stay one section headed 'Root',
76
+ // whose content is a flat string array of the bullet texts.
77
+
78
+ expect(sections.length).toBe(1);
79
+ expect(sections[0]?.heading).toBe('Root');
80
+ expect(Array.isArray(sections[0]?.content)).toBe(true);
81
+ expect(sections[0]?.content[0]).toContain('Section A'); // Just the text
82
+ });
83
+
84
+ it('should respect manual override to SECTIONS', () => {
85
+ // Input that looks like a plain array.
86
+ const md = `
87
+ - Item 1
88
+ - Item 2
89
+ `;
90
+ const parser = new RemarkParser({ bulletMode: BulletMode.SECTIONS });
91
+ const sections = parser.parse(md);
92
+
93
+ // Forced SECTIONS mode should still turn each bullet into a section:
94
+ // "Item 1" and "Item 2" become headings (their content is not asserted).
95
+ expect(sections.length).toBe(2);
96
+ expect(sections[0]?.heading).toBe('Item 1');
97
+ expect(sections[1]?.heading).toBe('Item 2');
98
+ });
99
+ });