nx-json-parser 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/parser.ts CHANGED
@@ -2,18 +2,42 @@ import { unified } from 'unified';
2
2
  import remarkParse from 'remark-parse';
3
3
  import remarkGfm from 'remark-gfm';
4
4
  import { toString } from 'mdast-util-to-string';
5
- import { MarkdownSection } from './types.js';
6
- import { remarkBulletSections } from './plugins/bullet-sections.js';
5
+ import { MarkdownSection, BulletMode, ParserOptions } from './types.js';
6
+ import { detectBulletMode } from './plugins/detect-bullet-mode.js';
7
7
  import { toCamelCase } from 'nx-helpers';
8
8
 
9
9
  export class RemarkParser {
10
10
  private processor = unified()
11
11
  .use(remarkParse)
12
- .use(remarkGfm)
13
- .use(remarkBulletSections);
12
+ .use(remarkGfm);
13
+
14
+ private options: Required<ParserOptions>;
15
+
16
/**
 * Create a parser, filling in every option the caller left out.
 *
 * @param options - Optional overrides. `bulletMode` falls back to
 *   `BulletMode.AUTO` and `debug` falls back to `false`.
 */
constructor(options: ParserOptions = {}) {
  const { bulletMode, debug } = options;

  this.options = {
    bulletMode: bulletMode ?? BulletMode.AUTO,
    debug: debug ?? false
  };
}
14
22
 
15
23
  parse(markdown: string): MarkdownSection[] {
16
24
  const tree = this.processor.runSync(this.processor.parse(markdown));
25
+
26
+ // Detect bullet mode if AUTO
27
+ let bulletMode = this.options.bulletMode;
28
+ if (bulletMode === BulletMode.AUTO) {
29
+ const detection = detectBulletMode(tree);
30
+ bulletMode = detection.mode === 'sections' ? BulletMode.SECTIONS : BulletMode.ARRAY;
31
+
32
+ if (this.options.debug) {
33
+ console.log('🔍 Bullet Mode Detection:');
34
+ console.log(` Mode: ${detection.mode}`);
35
+ console.log(` Confidence: ${(detection.confidence * 100).toFixed(0)}%`);
36
+ console.log(` Reasons:`);
37
+ detection.reasons.forEach(r => console.log(` - ${r}`));
38
+ }
39
+ }
40
+
17
41
  const sections: MarkdownSection[] = [];
18
42
  let currentSection: MarkdownSection | null = null;
19
43
  let currentNodes: any[] = [];
@@ -22,9 +46,9 @@ export class RemarkParser {
22
46
 
23
47
  for (const node of rootChildren) {
24
48
  if (node.type === 'heading') {
49
+ // Standard heading section (### Section)
25
50
  if (currentSection) {
26
- currentSection.content = this.processContent(currentNodes);
27
- sections.push(currentSection);
51
+ this.mergeAndPushSection(sections, currentSection, currentNodes, bulletMode);
28
52
  }
29
53
 
30
54
  currentSection = {
@@ -34,18 +58,95 @@ export class RemarkParser {
34
58
  format: 'heading'
35
59
  };
36
60
  currentNodes = [];
61
+
62
+ } else if (node.type === 'list' && !node.ordered && bulletMode === BulletMode.SECTIONS) {
63
+ // Bullet-style sections mode: each root-level bullet is a section
64
+ if (currentSection) {
65
+ this.mergeAndPushSection(sections, currentSection, currentNodes, bulletMode);
66
+ currentSection = null;
67
+ currentNodes = [];
68
+ }
69
+
70
+ // Process each list item as a section
71
+ const newSections: MarkdownSection[] = [];
72
+
73
+ // If we have a current section pending content, and this list is just text items, maybe it belongs to currentSection?
74
+ // But detect-bullet-mode says SECTIONS. So we expect sections.
75
+
76
+ for (const listItem of node.children) {
77
+ newSections.push(...this.processBulletAsSection(listItem));
78
+ }
79
+
80
+ if (newSections.length > 0) {
81
+ // Refined Sibling Logic:
82
+ // If a section has empty content, and is followed by sections that look like simple items...
83
+ // But processBulletAsSection transforms every item into a section.
84
+ // We need to merge them if they are actually content.
85
+
86
+ // Post-processing merge:
87
+ const mergedSections: MarkdownSection[] = [];
88
+ let activeSection: MarkdownSection | null = null;
89
+
90
+ for (const sec of newSections) {
91
+ // Heuristic: Is this really a section or just an item?
92
+ // If detect-bullet-mode said SECTIONS, usually we trust it.
93
+ // But for mixed lists: "- Assumptions" (sec) "- Item 1" (sec?) "- Item 2" (sec?)
94
+
95
+ // Check if 'sec' looks like a property or a value.
96
+ // Ideally strictly, everything is a property.
97
+ // But in the test case "- Assumptions \n - Val 1 \n - Val 2",
98
+ // they are siblings.
99
+
100
+ if (activeSection && this.shouldMergeAsContent(activeSection, sec)) {
101
+ // Merge content
102
+ if (!activeSection.content) {
103
+ activeSection.content = [sec.heading]; // Use heading as value
104
+ } else if (Array.isArray(activeSection.content)) {
105
+ activeSection.content.push(sec.heading);
106
+ } else {
107
+ // Convert existing string content to array? Or mix?
108
+ activeSection.content = [activeSection.content, sec.heading];
109
+ }
110
+ // APPEND any content of sec as well?
111
+ if (sec.content) {
112
+ if (Array.isArray(sec.content)) {
113
+ (activeSection.content as any[]).push(...sec.content);
114
+ } else {
115
+ (activeSection.content as any[]).push(sec.content);
116
+ }
117
+ }
118
+ } else {
119
+ activeSection = sec;
120
+ mergedSections.push(sec);
121
+ }
122
+ }
123
+
124
+ // Push all except the last one as complete sections
125
+ if (mergedSections.length > 0) {
126
+ for (let i = 0; i < mergedSections.length - 1; i++) {
127
+ sections.push(mergedSections[i]!);
128
+ }
129
+ // Keep the last one as currentSection to capture subsequent loose content
130
+ currentSection = mergedSections[mergedSections.length - 1]!;
131
+ }
132
+ }
133
+
134
+
135
+ // Original simple loop removed in favor of refined logic above
136
+
37
137
  } else {
38
138
  currentNodes.push(node);
39
139
  }
40
140
  }
41
141
 
142
+ // Save last section
42
143
  if (currentSection) {
43
- currentSection.content = this.processContent(currentNodes);
44
- sections.push(currentSection);
144
+ this.mergeAndPushSection(sections, currentSection, currentNodes, bulletMode);
45
145
  } else if (currentNodes.length > 0) {
146
+ // No sections found, put everything in root
46
147
  sections.push({
47
148
  heading: 'Root',
48
- content: this.processContent(currentNodes),
149
+ content: this.processContent(currentNodes, bulletMode),
49
150
  level: 0,
50
151
  format: 'text'
51
152
  });
@@ -54,31 +155,139 @@ export class RemarkParser {
54
155
  return sections;
55
156
  }
56
157
 
57
/**
 * Convert the nodes collected under one section into that section's content.
 *
 * - no nodes: the empty string
 * - exactly one table: array of row objects (via `tableToArray`)
 * - exactly one list: array of item strings (via `listToArray`)
 * - anything else: every node rendered to text and joined with blank lines
 *
 * @param nodes - Nodes gathered for the section, in document order.
 * @param bulletMode - Resolved bullet mode. A lone list is rendered the same
 *   way in every mode, so the value is not consulted here; the parameter is
 *   kept so existing call sites stay valid.
 * @returns A string, an array of strings, or an array of row objects.
 */
private processContent(nodes: any[], bulletMode: BulletMode): any {
  if (nodes.length === 0) return '';

  if (nodes.length === 1) {
    const only = nodes[0];
    if (only.type === 'table') return this.tableToArray(only);
    if (only.type === 'list') return this.listToArray(only);
  }

  // Mixed content is flattened into a single string. Tables and lists have to
  // be serialised explicitly: joining the raw arrays would print
  // "[object Object]" for table rows and comma-glued text for list items.
  return nodes
    .map(node => this.nodeToString(node))
    .join('\n\n')
    .trim();
}
75
191
 
192
/**
 * Turn one root-level bullet into a section (Mode 2: SECTIONS).
 *
 * The first line of the bullet's first child becomes the heading, with one
 * trailing colon removed. The remaining lines of that child plus every
 * following child become the content:
 *
 * - no further children: the remaining lines as a string
 * - exactly one nested list and nothing else: array of its item strings
 * - nested list(s) plus other nodes, or several nested lists:
 *   `{ text, items }`, where `items` holds the items of every nested list in
 *   document order
 * - no nested list: all children rendered to text
 *
 * @param listItem - The list item node to convert.
 * @returns An array with one section, or an empty array when the item has no
 *   children.
 */
private processBulletAsSection(listItem: any): MarkdownSection[] {
  const firstChild = listItem.children[0];
  if (!firstChild) return [];

  const [firstLine = '', ...restLines] = toString(firstChild).trim().split('\n');
  const cleanHeading = firstLine.trim().replace(/:$/, '');
  const sameNodeContent = restLines.join('\n').trim();

  const contentNodes: any[] = listItem.children.slice(1);
  const nestedLists = contentNodes.filter((node: any) => node.type === 'list');
  const otherNodes = contentNodes.filter((node: any) => node.type !== 'list');

  // Joins the text that followed the heading with the rendered nodes, leaving
  // out whichever part is empty so no dangling blank lines are produced.
  const withLeadingText = (rendered: string): string =>
    [sameNodeContent, rendered].filter(part => part.length > 0).join('\n\n');

  const renderNodes = (nodes: any[]): string =>
    nodes
      .map((node: any) => this.nodeToString(node))
      .join('\n\n')
      .trim();

  let content: any;

  if (contentNodes.length === 0) {
    content = sameNodeContent;
  } else if (nestedLists.length === 1 && otherNodes.length === 0) {
    // NOTE(review): text on the lines after the heading (sameNodeContent) is
    // not part of the result in this branch - confirm that is intended.
    content = this.listToArray(nestedLists[0]);
  } else if (nestedLists.length > 0) {
    // Every nested list contributes its items; previously only the first
    // list survived and later ones were silently dropped.
    content = {
      text: withLeadingText(renderNodes(otherNodes)),
      items: nestedLists.flatMap((list: any) => this.listToArray(list))
    };
  } else {
    content = withLeadingText(renderNodes(contentNodes));
  }

  return [
    {
      heading: cleanHeading,
      content,
      level: 1,
      format: 'bullet'
    }
  ];
}
265
+
266
/**
 * Flatten a list node into one trimmed string per item (Mode 1: ARRAY).
 *
 * Only the first child of each item is read; anything nested below it is
 * left out of the result.
 *
 * @param listNode - The list node whose items are converted.
 * @returns The item texts in document order.
 */
private listToArray(listNode: any): string[] {
  const labels: string[] = [];

  for (const entry of listNode.children) {
    const leading = entry.children[0];
    labels.push(toString(leading).trim());
  }

  return labels;
}
276
+
277
+ /**
278
+ * Convert table to array of objects
279
+ */
76
280
  private tableToArray(tableNode: any): any[] {
77
- const headers = tableNode.children[0].children.map((cell: any) =>
281
+ const rows = tableNode.children;
282
+ if (rows.length === 0) return [];
283
+
284
+ // First row = headers
285
+ const headers = rows[0].children.map((cell: any) =>
78
286
  toCamelCase(toString(cell).trim())
79
287
  );
80
288
 
81
- return tableNode.children.slice(1).map((row: any) => {
289
+ // Remaining rows = data
290
+ return rows.slice(1).map((row: any) => {
82
291
  const obj: any = {};
83
292
  row.children.forEach((cell: any, i: number) => {
84
293
  const key = headers[i] || `column${i}`;
@@ -87,4 +296,82 @@ export class RemarkParser {
87
296
  return obj;
88
297
  });
89
298
  }
90
- }
299
+
300
/**
 * Collapse a list of sections into a plain object (utility method).
 *
 * Each key is the camel-cased section heading and each value is that
 * section's content. When two headings produce the same key, the later
 * section replaces the earlier one.
 *
 * @param sections - Sections to fold into an object.
 * @returns Object mapping camel-cased headings to section content.
 */
sectionsToObject(sections: MarkdownSection[]): Record<string, any> {
  return sections.reduce<Record<string, any>>((acc, entry) => {
    const key = toCamelCase(entry.heading);
    acc[key] = entry.content;
    return acc;
  }, {});
}
313
+
314
/**
 * Fold the pending nodes into `section` and append the section to `sections`.
 *
 * The nodes are converted with `processContent`. A falsy result leaves the
 * section's content untouched. Otherwise:
 * - an empty section takes the new content as-is
 * - an array content has the new content pushed as a single element
 * - string + string are joined with a blank line
 * - any other combination becomes a two-element array `[old, new]`
 *
 * @param sections - Output list; the section is always appended to it.
 * @param section - Section to finish; its `content` may be updated in place.
 * @param nodes - Nodes collected since the section started.
 * @param bulletMode - Bullet mode forwarded to `processContent`.
 */
private mergeAndPushSection(sections: MarkdownSection[], section: MarkdownSection, nodes: any[], bulletMode: BulletMode) {
  const extra = this.processContent(nodes, bulletMode);

  if (extra) {
    const existing = section.content;

    if (!existing) {
      section.content = extra;
    } else if (Array.isArray(existing)) {
      existing.push(extra);
    } else if (typeof existing === 'string' && typeof extra === 'string') {
      section.content = existing + '\n\n' + extra;
    } else {
      section.content = [existing, extra];
    }
  }

  sections.push(section);
}
334
+
335
/**
 * Render a single node as text.
 *
 * Tables become the JSON of their row objects, lists become one `- item`
 * line per entry, and every other node is passed to `toString`.
 *
 * @param node - Node to render.
 * @returns The textual form of the node.
 */
private nodeToString(node: any): string {
  switch (node.type) {
    case 'table': {
      const rows = this.tableToArray(node);
      return JSON.stringify(rows);
    }
    case 'list': {
      const bullets = this.listToArray(node).map(item => `- ${item}`);
      return bullets.join('\n');
    }
    default:
      return toString(node);
  }
}
344
/**
 * Decide whether `child` is really an item of `parent` rather than a section
 * of its own.
 *
 * The answer is `true` only when the parent's camel-cased heading is one of
 * the known list-style section names and the child's camel-cased heading is
 * not. A child that is itself a known list section always starts a new
 * section.
 *
 * @param parent - The section currently collecting items.
 * @param child - The sibling bullet under consideration.
 * @returns `true` when the child should be merged into the parent's content.
 */
private shouldMergeAsContent(parent: MarkdownSection, child: MarkdownSection): boolean {
  // Section names whose entries tend to show up as sibling bullets in
  // badly formatted markdown.
  const LIST_SECTIONS = [
    'assumptions',
    'unknowns',
    'evidence',
    'examples',
    'notes',
    'analysis',
    'findings',
    'recommendations',
    'considerations'
  ];

  const parentKey = toCamelCase(parent.heading);
  const childKey = toCamelCase(child.heading);

  const childIsListSection = LIST_SECTIONS.includes(childKey);
  const parentIsListSection = LIST_SECTIONS.includes(parentKey);

  return !childIsListSection && parentIsListSection;
}
377
+ }
@@ -1,86 +1,61 @@
1
- import { toString } from 'mdast-util-to-string';
2
-
3
- export function remarkBulletSections() {
4
- return (tree: any) => {
5
- const children = tree.children;
6
- if (!children) return;
7
-
8
- for (let i = 0; i < children.length; i++) {
9
- const node = children[i];
10
- if (node.type === 'list' && node.ordered === false) {
11
- const items = node.children;
12
- if (items.length === 0) continue;
13
-
14
- const newRootNodes: any[] = [];
15
- let currentListItems: any[] = [];
16
-
17
- items.forEach((item: any, idx: number) => {
18
- const firstChild = item.children[0];
19
- const text = firstChild ? toString(firstChild) : '';
20
- const lines = text.trim().split('\n');
21
- const firstLine = lines[0]?.trim() || '';
22
1
 
23
- const isShort = firstLine.length > 0 && firstLine.length < 150;
24
- const hasMoreContent = lines.length > 1 || item.children.length > 1;
25
-
26
- // Section detection:
27
- // 1. It's short and has more content.
28
- // 2. OR it's short and is followed by a non-short item? (Hard to check here)
29
- // 3. OR it's one of the "known" section keywords.
30
- const keywords = ['answer', 'assumptions', 'unknowns', 'evidence', 'protection', 'control', 'management', 'design', 'logging', 'monitoring', 'backups', 'compliance', 'governance', 'modeling', 'incident', 'vendor', 'changes'];
31
- const isSectionKeyword = keywords.some(k => firstLine.toLowerCase().includes(k));
32
-
33
- if (isShort && (hasMoreContent || isSectionKeyword)) {
34
- // Flush existing list items if any
35
- if (currentListItems.length > 0) {
36
- newRootNodes.push({
37
- type: 'list',
38
- ordered: false,
39
- children: [...currentListItems]
40
- });
41
- currentListItems = [];
42
- }
43
-
44
- // Add as heading
45
- newRootNodes.push({
46
- type: 'heading',
47
- depth: 2,
48
- children: [{ type: 'text', value: firstLine }]
49
- });
50
-
51
- // Add content
52
- if (lines.length > 1) {
53
- newRootNodes.push({
54
- type: 'paragraph',
55
- children: [{ type: 'text', value: lines.slice(1).join('\n').trim() }]
56
- });
57
- }
58
- if (item.children.length > 1) {
59
- newRootNodes.push(...item.children.slice(1));
60
- }
61
- } else {
62
- currentListItems.push(item);
63
- }
64
- });
2
+ /**
3
+ * Remark plugin to handle bullet-style sections
4
+ * Converts bullets that look like sections into heading nodes
5
+ */
6
+ import { visit } from 'unist-util-visit';
7
+ import { toString } from 'mdast-util-to-string';
65
8
 
66
- // Flush remaining
67
- if (currentListItems.length > 0) {
68
- newRootNodes.push({
69
- type: 'list',
70
- ordered: false,
71
- children: [...currentListItems]
72
- });
73
- }
9
+ const SECTION_KEYWORDS = [
10
+ 'answer', 'summary', 'introduction', 'conclusion', 'overview',
11
+ 'assumptions', 'unknowns', 'evidence', 'notes', 'details',
12
+ 'description', 'background', 'analysis', 'findings', 'recommendations',
13
+ 'data', 'identity', 'network', 'security', 'monitoring', 'governance',
14
+ 'availability', 'backup', 'patch', 'operational', 'provider'
15
+ ];
74
16
 
75
- if (newRootNodes.length > 0) {
76
- // If we performed any transformation (i.e., we found at least one heading)
77
- const hasHeadings = newRootNodes.some(n => n.type === 'heading');
78
- if (hasHeadings) {
79
- children.splice(i, 1, ...newRootNodes);
80
- i += newRootNodes.length - 1;
81
- }
82
- }
83
- }
84
- }
85
- };
86
- }
17
/**
 * Remark plugin kept for backward compatibility; it leaves the tree untouched.
 *
 * The conversion of bullets into sections is performed by the parser, so this
 * transformer has nothing to do. The previous body walked every list node
 * without recording or changing anything, and returned the tree from the
 * visitor callback, which is not a valid visitor action. It is now an
 * explicit no-op.
 *
 * @returns A transformer that accepts the syntax tree and does not modify it.
 */
export function remarkBulletSections() {
  return (_tree: any): void => {
    // Intentionally empty: see the function documentation.
  };
}
40
+
41
/**
 * Heuristically decide whether a list item looks like a section heading.
 *
 * Five indicators are evaluated on the text of the item's first child:
 * contains one of `SECTION_KEYWORDS` (case-insensitive), contains a colon,
 * has more than one child, starts with an ASCII capital letter, and is longer
 * than 30 characters. The item counts as a section when at least two hold.
 *
 * @param listItem - The list item node to inspect.
 * @returns `true` when two or more indicators match; `false` for an item
 *   without children.
 */
function isSectionBullet(listItem: any): boolean {
  if (!listItem.children || listItem.children.length === 0) {
    return false;
  }

  const rawText = toString(listItem.children[0]);
  const lowered = rawText.toLowerCase();

  const indicators = [
    SECTION_KEYWORDS.some(kw => lowered.includes(kw)),
    rawText.includes(':'),
    listItem.children.length > 1,
    // Must look at the original casing: testing the lower-cased text could
    // never match and silently disabled this indicator.
    /^[A-Z]/.test(rawText),
    rawText.length > 30
  ];

  return indicators.filter(Boolean).length >= 2;
}
@@ -0,0 +1,86 @@
1
+ import { toString } from 'mdast-util-to-string';
2
+
3
/**
 * Remark plugin that promotes section-like bullets of root-level unordered
 * lists to headings.
 *
 * A bullet is promoted when its first line is non-empty and shorter than 150
 * characters, and it either carries more content (further lines or further
 * children) or its first line contains one of the known keywords. A promoted
 * bullet is replaced by a depth-2 heading, followed by a paragraph holding
 * the remaining lines (if any) and by the item's remaining children. Bullets
 * that are not promoted are regrouped into unordered lists in their original
 * order. A list without any promoted bullet is left exactly as it was.
 *
 * @returns A transformer that rewrites `tree.children` in place.
 */
export function remarkBulletSections() {
  // Built once per plugin instance instead of once per list item.
  const keywords = ['answer', 'assumptions', 'unknowns', 'evidence', 'protection', 'control', 'management', 'design', 'logging', 'monitoring', 'backups', 'compliance', 'governance', 'modeling', 'incident', 'vendor', 'changes'];

  return (tree: any) => {
    const children = tree.children;
    if (!children) return;

    for (let i = 0; i < children.length; i++) {
      const node = children[i];

      // Any falsy `ordered` counts as unordered, matching the `!node.ordered`
      // check the parser uses; a strict `=== false` skipped lists where the
      // field is absent.
      if (node.type !== 'list' || node.ordered) continue;

      const items = node.children;
      if (items.length === 0) continue;

      const newRootNodes: any[] = [];
      let currentListItems: any[] = [];

      // Emits the plain bullets gathered so far as one unordered list.
      const flushListItems = (): void => {
        if (currentListItems.length === 0) return;
        newRootNodes.push({
          type: 'list',
          ordered: false,
          children: [...currentListItems]
        });
        currentListItems = [];
      };

      for (const item of items) {
        const firstChild = item.children[0];
        const text = firstChild ? toString(firstChild) : '';
        const lines = text.trim().split('\n');
        const firstLine = lines[0]?.trim() || '';

        const isShort = firstLine.length > 0 && firstLine.length < 150;
        const hasMoreContent = lines.length > 1 || item.children.length > 1;
        const loweredFirstLine = firstLine.toLowerCase();
        const isSectionKeyword = keywords.some(k => loweredFirstLine.includes(k));

        if (!(isShort && (hasMoreContent || isSectionKeyword))) {
          currentListItems.push(item);
          continue;
        }

        // Plain bullets seen before this heading stay ahead of it.
        flushListItems();

        newRootNodes.push({
          type: 'heading',
          depth: 2,
          children: [{ type: 'text', value: firstLine }]
        });

        if (lines.length > 1) {
          newRootNodes.push({
            type: 'paragraph',
            children: [{ type: 'text', value: lines.slice(1).join('\n').trim() }]
          });
        }
        if (item.children.length > 1) {
          newRootNodes.push(...item.children.slice(1));
        }
      }

      flushListItems();

      // Only rewrite the tree when at least one bullet became a heading.
      const hasHeadings = newRootNodes.some(n => n.type === 'heading');
      if (hasHeadings) {
        children.splice(i, 1, ...newRootNodes);
        // Skip over the nodes that were just inserted.
        i += newRootNodes.length - 1;
      }
    }
  };
}