nx-json-parser 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/dist/parser.d.ts +24 -1
- package/dist/parser.d.ts.map +1 -1
- package/dist/parser.js +270 -13
- package/dist/parser.js.map +1 -1
- package/dist/plugins/bullet-sections.d.ts.map +1 -1
- package/dist/plugins/bullet-sections.js +42 -72
- package/dist/plugins/bullet-sections.js.map +1 -1
- package/dist/plugins/detect-bullet-mode.d.ts +7 -0
- package/dist/plugins/detect-bullet-mode.d.ts.map +1 -0
- package/dist/plugins/detect-bullet-mode.js +129 -0
- package/dist/plugins/detect-bullet-mode.js.map +1 -0
- package/dist/remark-markdown-parser.d.ts +41 -0
- package/dist/remark-markdown-parser.d.ts.map +1 -0
- package/dist/remark-markdown-parser.js +294 -0
- package/dist/remark-markdown-parser.js.map +1 -0
- package/dist/types.d.ts +17 -3
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +6 -1
- package/dist/types.js.map +1 -1
- package/package.json +1 -1
- package/src/index.ts +1 -0
- package/src/parser.ts +302 -15
- package/src/plugins/bullet-sections.ts +58 -83
- package/src/plugins/bullet-sections.ts.old +86 -0
- package/src/plugins/detect-bullet-mode.ts +164 -0
- package/src/remark-markdown-parser.ts +340 -0
- package/src/types.ts +22 -6
- package/test/bullet-mode.test.ts +99 -0
- package/test/parser.2.test.ts +12 -12
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Detect if bullets should be treated as array items or sections
|
|
3
|
+
*/
|
|
4
|
+
import { visit } from 'unist-util-visit';
|
|
5
|
+
import { toString } from 'mdast-util-to-string';
|
|
6
|
+
|
|
7
|
+
export interface BulletModeResult {
|
|
8
|
+
mode: 'array' | 'sections';
|
|
9
|
+
confidence: number;
|
|
10
|
+
reasons: string[];
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
const SECTION_KEYWORDS = [
|
|
14
|
+
'answer', 'summary', 'introduction', 'conclusion', 'overview',
|
|
15
|
+
'assumptions', 'unknowns', 'evidence', 'notes', 'details',
|
|
16
|
+
'description', 'background', 'analysis', 'findings', 'recommendations',
|
|
17
|
+
'data', 'identity', 'network', 'security', 'monitoring', 'governance',
|
|
18
|
+
'availability', 'backup', 'patch', 'operational', 'provider',
|
|
19
|
+
'short', 'full', 'long' // Added common answer prefixes
|
|
20
|
+
];
|
|
21
|
+
|
|
22
|
+
export function detectBulletMode(tree: any): BulletModeResult {
|
|
23
|
+
const reasons: string[] = [];
|
|
24
|
+
let sectionScore = 0;
|
|
25
|
+
let arrayScore = 0;
|
|
26
|
+
|
|
27
|
+
let bulletCount = 0;
|
|
28
|
+
let bulletsWithContent = 0;
|
|
29
|
+
let bulletsWithColons = 0;
|
|
30
|
+
let bulletsWithNestedLists = 0;
|
|
31
|
+
let bulletsWithKeywords = 0;
|
|
32
|
+
let totalBulletLength = 0;
|
|
33
|
+
let rootLevelBullets = 0;
|
|
34
|
+
|
|
35
|
+
// Analyze the tree structure
|
|
36
|
+
visit(tree, 'list', (listNode: any, index?: number, parent?: any) => {
|
|
37
|
+
// Only analyze root-level lists or lists directly under root
|
|
38
|
+
const isRootLevel = parent?.type === 'root';
|
|
39
|
+
|
|
40
|
+
if (!isRootLevel) return;
|
|
41
|
+
|
|
42
|
+
rootLevelBullets += listNode.children.length;
|
|
43
|
+
|
|
44
|
+
for (const listItem of listNode.children) {
|
|
45
|
+
bulletCount++;
|
|
46
|
+
|
|
47
|
+
// Get the text of this list item (first paragraph/text only)
|
|
48
|
+
const firstChild = listItem.children[0];
|
|
49
|
+
const itemText = firstChild ? toString(firstChild) : '';
|
|
50
|
+
totalBulletLength += itemText.length;
|
|
51
|
+
|
|
52
|
+
// 1. Check for colons (title pattern: "Short answer:" or "Data protection:")
|
|
53
|
+
if (itemText.includes(':')) {
|
|
54
|
+
bulletsWithColons++;
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
// 2. Check for content after the bullet (paragraphs, nested items)
|
|
58
|
+
if (listItem.children.length > 1) {
|
|
59
|
+
bulletsWithContent++;
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
// 3. Check for nested lists
|
|
63
|
+
const hasNestedList = listItem.children.some((child: any) => child.type === 'list');
|
|
64
|
+
if (hasNestedList) {
|
|
65
|
+
bulletsWithNestedLists++;
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
// 4. Check for section keywords
|
|
69
|
+
const lowerText = itemText.toLowerCase();
|
|
70
|
+
if (SECTION_KEYWORDS.some(kw => lowerText.includes(kw))) {
|
|
71
|
+
bulletsWithKeywords++;
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
});
|
|
75
|
+
|
|
76
|
+
if (bulletCount === 0) {
|
|
77
|
+
return {
|
|
78
|
+
mode: 'array',
|
|
79
|
+
confidence: 0,
|
|
80
|
+
reasons: ['No bullets found']
|
|
81
|
+
};
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
const avgLength = totalBulletLength / bulletCount;
|
|
85
|
+
|
|
86
|
+
// === SCORING FOR SECTIONS MODE ===
|
|
87
|
+
|
|
88
|
+
if (bulletsWithColons > 0) {
|
|
89
|
+
const percent = (bulletsWithColons / bulletCount * 100).toFixed(0);
|
|
90
|
+
sectionScore += 3;
|
|
91
|
+
reasons.push(`${bulletsWithColons}/${bulletCount} bullets have colons (${percent}%) - strong section indicator`);
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
if (bulletsWithContent > 0) {
|
|
95
|
+
const percent = (bulletsWithContent / bulletCount * 100).toFixed(0);
|
|
96
|
+
sectionScore += 3;
|
|
97
|
+
reasons.push(`${bulletsWithContent}/${bulletCount} bullets have content below (${percent}%) - strong section indicator`);
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
if (bulletsWithNestedLists > 0) {
|
|
101
|
+
sectionScore += 2;
|
|
102
|
+
reasons.push(`${bulletsWithNestedLists} bullets have nested lists - section indicator`);
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
if (bulletsWithKeywords > 0) {
|
|
106
|
+
// Huge boost for keywords - these are almost certainly section headers
|
|
107
|
+
sectionScore += 10;
|
|
108
|
+
reasons.push(`${bulletsWithKeywords} bullets contain section keywords - STRONG section indicator`);
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
if (avgLength > 30) {
|
|
112
|
+
sectionScore += 1;
|
|
113
|
+
reasons.push(`Long bullet text (avg ${avgLength.toFixed(0)} chars) - suggests titles`);
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
if (bulletCount <= 5 && (bulletsWithContent > 0 || bulletsWithColons > 0)) {
|
|
117
|
+
sectionScore += 1;
|
|
118
|
+
reasons.push('Few bullets with rich content suggests sections');
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
// === SCORING FOR ARRAY MODE ===
|
|
122
|
+
|
|
123
|
+
if (bulletsWithColons === 0 && bulletsWithContent === 0 && bulletsWithNestedLists === 0 && bulletsWithKeywords === 0) {
|
|
124
|
+
// Only if NO keywords are found
|
|
125
|
+
arrayScore += 4;
|
|
126
|
+
reasons.push('No bullets have content, colons, nesting, OR keywords - pure list indicator');
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
if (avgLength < 30) {
|
|
130
|
+
arrayScore += 2;
|
|
131
|
+
reasons.push(`Short bullet text (avg ${avgLength.toFixed(0)} chars) - suggests list items`);
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
if (bulletCount >= 3 && bulletsWithContent === 0) {
|
|
135
|
+
arrayScore += 2;
|
|
136
|
+
reasons.push(`${bulletCount} simple bullets without content - array pattern`);
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
if (bulletsWithKeywords === 0) {
|
|
140
|
+
arrayScore += 1;
|
|
141
|
+
reasons.push('No section keywords found');
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
// Edge case: all bullets are very short and no special features
|
|
145
|
+
if (avgLength < 20 && bulletsWithContent === 0 && bulletsWithColons === 0) {
|
|
146
|
+
arrayScore += 2;
|
|
147
|
+
reasons.push('Very short bullets with no features - definitely array');
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
// === DECISION ===
|
|
151
|
+
|
|
152
|
+
const totalScore = sectionScore + arrayScore;
|
|
153
|
+
const mode = sectionScore > arrayScore ? 'sections' : 'array';
|
|
154
|
+
const confidence = totalScore > 0 ? Math.abs(sectionScore - arrayScore) / totalScore : 0;
|
|
155
|
+
|
|
156
|
+
// Format reasons (top 5)
|
|
157
|
+
const topReasons = reasons.slice(0, 5);
|
|
158
|
+
|
|
159
|
+
return {
|
|
160
|
+
mode,
|
|
161
|
+
confidence,
|
|
162
|
+
reasons: topReasons
|
|
163
|
+
};
|
|
164
|
+
}
|
|
@@ -0,0 +1,340 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Remark-based Markdown Parser
|
|
3
|
+
* Replaces regex-based parsing with AST-based parsing using unified/remark
|
|
4
|
+
*/
|
|
5
|
+
import { unified } from 'unified';
|
|
6
|
+
import remarkParse from 'remark-parse';
|
|
7
|
+
import remarkGfm from 'remark-gfm';
|
|
8
|
+
import { visit } from 'unist-util-visit';
|
|
9
|
+
import { toString } from 'mdast-util-to-string';
|
|
10
|
+
import { toCamelCase } from 'nx-helpers';
|
|
11
|
+
|
|
12
|
+
export interface MarkdownSection {
|
|
13
|
+
heading: string;
|
|
14
|
+
content: string;
|
|
15
|
+
level: number;
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
export enum BulletMode {
|
|
19
|
+
ARRAY = 'array',
|
|
20
|
+
SECTIONS = 'sections',
|
|
21
|
+
AUTO = 'auto'
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
export interface RemarkParserOptions {
|
|
25
|
+
bulletMode?: BulletMode;
|
|
26
|
+
sectionKeywords?: string[];
|
|
27
|
+
debug?: boolean;
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
export class RemarkMarkdownParser {
|
|
31
|
+
private processor: any;
|
|
32
|
+
private options: Required<RemarkParserOptions>;
|
|
33
|
+
|
|
34
|
+
constructor(options: RemarkParserOptions = {}) {
|
|
35
|
+
this.processor = unified()
|
|
36
|
+
.use(remarkParse)
|
|
37
|
+
.use(remarkGfm);
|
|
38
|
+
|
|
39
|
+
this.options = {
|
|
40
|
+
bulletMode: options.bulletMode || BulletMode.AUTO,
|
|
41
|
+
sectionKeywords: options.sectionKeywords || this.getDefaultKeywords(),
|
|
42
|
+
debug: options.debug || false
|
|
43
|
+
};
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
/**
|
|
47
|
+
* Parse markdown into sections
|
|
48
|
+
*/
|
|
49
|
+
parseSections(markdown: string): MarkdownSection[] {
|
|
50
|
+
const tree = this.processor.parse(markdown);
|
|
51
|
+
|
|
52
|
+
// Detect bullet mode if AUTO
|
|
53
|
+
const bulletMode = this.options.bulletMode === BulletMode.AUTO
|
|
54
|
+
? this.detectBulletMode(tree)
|
|
55
|
+
: this.options.bulletMode;
|
|
56
|
+
|
|
57
|
+
if (this.options.debug) {
|
|
58
|
+
console.log(`📋 Bullet mode: ${bulletMode}`);
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
if (bulletMode === BulletMode.SECTIONS) {
|
|
62
|
+
return this.parseSectionsMode(tree, markdown);
|
|
63
|
+
} else {
|
|
64
|
+
return this.parseHeadingMode(tree);
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
/**
|
|
69
|
+
* Parse content (tables, lists, text)
|
|
70
|
+
*/
|
|
71
|
+
parseContent(content: string): any {
|
|
72
|
+
const trimmed = content.trim();
|
|
73
|
+
if (!trimmed) return '';
|
|
74
|
+
|
|
75
|
+
const tree = this.processor.parse(trimmed);
|
|
76
|
+
|
|
77
|
+
// Single node - return specific type
|
|
78
|
+
if (tree.children.length === 1) {
|
|
79
|
+
const child = tree.children[0];
|
|
80
|
+
|
|
81
|
+
if (child.type === 'table') {
|
|
82
|
+
return this.parseTable(child);
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
if (child.type === 'list') {
|
|
86
|
+
return this.parseList(child);
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
if (child.type === 'paragraph') {
|
|
90
|
+
return toString(child);
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
// Multiple children - combine
|
|
95
|
+
const results: any[] = [];
|
|
96
|
+
|
|
97
|
+
for (const child of tree.children) {
|
|
98
|
+
if (child.type === 'table') {
|
|
99
|
+
return this.parseTable(child); // Single table dominates
|
|
100
|
+
} else if (child.type === 'list') {
|
|
101
|
+
return this.parseList(child); // Single list dominates
|
|
102
|
+
} else if (child.type === 'paragraph') {
|
|
103
|
+
results.push(toString(child));
|
|
104
|
+
} else {
|
|
105
|
+
results.push(toString(child));
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
// Join text results
|
|
110
|
+
const text = results.join('\n\n').trim();
|
|
111
|
+
return text || trimmed;
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
/**
|
|
115
|
+
* Convert sections to object
|
|
116
|
+
*/
|
|
117
|
+
sectionsToObject(sections: MarkdownSection[]): Record<string, any> {
|
|
118
|
+
const result: Record<string, any> = {};
|
|
119
|
+
|
|
120
|
+
for (const section of sections) {
|
|
121
|
+
const key = toCamelCase(section.heading);
|
|
122
|
+
result[key] = this.parseContent(section.content);
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
return result;
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
// ========================================================================
|
|
129
|
+
// PRIVATE METHODS - Mode Detection
|
|
130
|
+
// ========================================================================
|
|
131
|
+
|
|
132
|
+
private detectBulletMode(tree: any): BulletMode {
|
|
133
|
+
let bulletCount = 0;
|
|
134
|
+
let bulletsWithContent = 0;
|
|
135
|
+
let bulletsWithColons = 0;
|
|
136
|
+
let bulletsWithNestedLists = 0;
|
|
137
|
+
let bulletsWithKeywords = 0;
|
|
138
|
+
let totalLength = 0;
|
|
139
|
+
|
|
140
|
+
visit(tree, 'list', (listNode: any, index?: number, parent?: any) => {
|
|
141
|
+
// Only analyze root-level lists
|
|
142
|
+
if (!parent || parent.type !== 'root') return;
|
|
143
|
+
|
|
144
|
+
for (const listItem of listNode.children) {
|
|
145
|
+
bulletCount++;
|
|
146
|
+
|
|
147
|
+
const firstChild = listItem.children[0];
|
|
148
|
+
const text = firstChild ? toString(firstChild) : '';
|
|
149
|
+
totalLength += text.length;
|
|
150
|
+
|
|
151
|
+
// Check indicators
|
|
152
|
+
if (text.includes(':')) bulletsWithColons++;
|
|
153
|
+
if (listItem.children.length > 1) bulletsWithContent++;
|
|
154
|
+
|
|
155
|
+
const hasNestedList = listItem.children.some((c: any) => c.type === 'list');
|
|
156
|
+
if (hasNestedList) bulletsWithNestedLists++;
|
|
157
|
+
|
|
158
|
+
const lowerText = text.toLowerCase();
|
|
159
|
+
if (this.options.sectionKeywords.some(kw => lowerText.includes(kw))) {
|
|
160
|
+
bulletsWithKeywords++;
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
});
|
|
164
|
+
|
|
165
|
+
if (bulletCount === 0) return BulletMode.ARRAY;
|
|
166
|
+
|
|
167
|
+
const avgLength = totalLength / bulletCount;
|
|
168
|
+
|
|
169
|
+
// Scoring
|
|
170
|
+
let sectionScore = 0;
|
|
171
|
+
let arrayScore = 0;
|
|
172
|
+
|
|
173
|
+
if (bulletsWithColons > 0) sectionScore += 3;
|
|
174
|
+
if (bulletsWithContent > 0) sectionScore += 3;
|
|
175
|
+
if (bulletsWithNestedLists > 0) sectionScore += 2;
|
|
176
|
+
if (bulletsWithKeywords > 0) sectionScore += 2;
|
|
177
|
+
if (avgLength > 30) sectionScore += 1;
|
|
178
|
+
|
|
179
|
+
if (bulletsWithContent === 0 && bulletsWithColons === 0) arrayScore += 4;
|
|
180
|
+
if (avgLength < 30) arrayScore += 2;
|
|
181
|
+
if (bulletCount >= 3 && bulletsWithContent === 0) arrayScore += 2;
|
|
182
|
+
|
|
183
|
+
if (this.options.debug) {
|
|
184
|
+
console.log(`🔍 Detection scores - Sections: ${sectionScore}, Array: ${arrayScore}`);
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
return sectionScore > arrayScore ? BulletMode.SECTIONS : BulletMode.ARRAY;
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
// ========================================================================
|
|
191
|
+
// PRIVATE METHODS - Parsing Modes
|
|
192
|
+
// ========================================================================
|
|
193
|
+
|
|
194
|
+
private parseHeadingMode(tree: any): MarkdownSection[] {
|
|
195
|
+
const sections: MarkdownSection[] = [];
|
|
196
|
+
let currentSection: MarkdownSection | null = null;
|
|
197
|
+
let currentContent: string[] = [];
|
|
198
|
+
|
|
199
|
+
for (const node of tree.children) {
|
|
200
|
+
if (node.type === 'heading') {
|
|
201
|
+
// Save previous section
|
|
202
|
+
if (currentSection) {
|
|
203
|
+
currentSection.content = currentContent.join('\n\n').trim();
|
|
204
|
+
sections.push(currentSection);
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
// Start new section
|
|
208
|
+
currentSection = {
|
|
209
|
+
heading: toString(node),
|
|
210
|
+
content: '',
|
|
211
|
+
level: node.depth
|
|
212
|
+
};
|
|
213
|
+
currentContent = [];
|
|
214
|
+
} else if (currentSection) {
|
|
215
|
+
// Add to current section content
|
|
216
|
+
currentContent.push(this.nodeToString(node));
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
// Save last section
|
|
221
|
+
if (currentSection) {
|
|
222
|
+
currentSection.content = currentContent.join('\n\n').trim();
|
|
223
|
+
sections.push(currentSection);
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
return sections;
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
private parseSectionsMode(tree: any, markdown: string): MarkdownSection[] {
|
|
230
|
+
const sections: MarkdownSection[] = [];
|
|
231
|
+
let inList = false;
|
|
232
|
+
|
|
233
|
+
for (const node of tree.children) {
|
|
234
|
+
if (node.type === 'heading') {
|
|
235
|
+
// Regular heading
|
|
236
|
+
const heading = toString(node);
|
|
237
|
+
const content = ''; // Will be filled by next nodes
|
|
238
|
+
sections.push({
|
|
239
|
+
heading,
|
|
240
|
+
content,
|
|
241
|
+
level: node.depth
|
|
242
|
+
});
|
|
243
|
+
} else if (node.type === 'list') {
|
|
244
|
+
// Process each list item as a section
|
|
245
|
+
for (const listItem of node.children) {
|
|
246
|
+
const section = this.listItemToSection(listItem);
|
|
247
|
+
if (section) {
|
|
248
|
+
sections.push(section);
|
|
249
|
+
}
|
|
250
|
+
}
|
|
251
|
+
} else if (sections.length > 0) {
|
|
252
|
+
// Add content to last section
|
|
253
|
+
const lastSection = sections[sections.length - 1];
|
|
254
|
+
const nodeContent = this.nodeToString(node);
|
|
255
|
+
lastSection.content = lastSection.content
|
|
256
|
+
? `${lastSection.content}\n\n${nodeContent}`
|
|
257
|
+
: nodeContent;
|
|
258
|
+
}
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
return sections;
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
private listItemToSection(listItem: any): MarkdownSection | null {
|
|
265
|
+
if (!listItem.children || listItem.children.length === 0) {
|
|
266
|
+
return null;
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
const firstChild = listItem.children[0];
|
|
270
|
+
const heading = toString(firstChild).replace(/:$/, ''); // Remove trailing colon
|
|
271
|
+
|
|
272
|
+
const contentNodes = listItem.children.slice(1);
|
|
273
|
+
let content = '';
|
|
274
|
+
|
|
275
|
+
if (contentNodes.length > 0) {
|
|
276
|
+
content = contentNodes.map((node: any) => this.nodeToString(node)).join('\n\n').trim();
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
return {
|
|
280
|
+
heading,
|
|
281
|
+
content,
|
|
282
|
+
level: 1
|
|
283
|
+
};
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
// ========================================================================
|
|
287
|
+
// PRIVATE METHODS - Content Parsing
|
|
288
|
+
// ========================================================================
|
|
289
|
+
|
|
290
|
+
private parseTable(tableNode: any): any[] {
|
|
291
|
+
const rows = tableNode.children;
|
|
292
|
+
if (rows.length === 0) return [];
|
|
293
|
+
|
|
294
|
+
// First row = headers
|
|
295
|
+
const headerRow = rows[0];
|
|
296
|
+
const headers = headerRow.children.map((cell: any) =>
|
|
297
|
+
toCamelCase(toString(cell).trim())
|
|
298
|
+
);
|
|
299
|
+
|
|
300
|
+
// Data rows
|
|
301
|
+
return rows.slice(1).map((row: any) => {
|
|
302
|
+
const obj: any = {};
|
|
303
|
+
row.children.forEach((cell: any, i: number) => {
|
|
304
|
+
const key = headers[i] || `column${i}`;
|
|
305
|
+
obj[key] = toString(cell).trim();
|
|
306
|
+
});
|
|
307
|
+
return obj;
|
|
308
|
+
});
|
|
309
|
+
}
|
|
310
|
+
|
|
311
|
+
private parseList(listNode: any): string[] {
|
|
312
|
+
return listNode.children.map((item: any) => {
|
|
313
|
+
// Get first child only (ignore nested content for simple lists)
|
|
314
|
+
const firstChild = item.children[0];
|
|
315
|
+
return toString(firstChild).trim();
|
|
316
|
+
});
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
private nodeToString(node: any): string {
|
|
320
|
+
if (node.type === 'table') {
|
|
321
|
+
// Return markdown representation or JSON
|
|
322
|
+
return JSON.stringify(this.parseTable(node));
|
|
323
|
+
}
|
|
324
|
+
if (node.type === 'list') {
|
|
325
|
+
// Return as list
|
|
326
|
+
return this.parseList(node).map(item => `- ${item}`).join('\n');
|
|
327
|
+
}
|
|
328
|
+
return toString(node);
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
private getDefaultKeywords(): string[] {
|
|
332
|
+
return [
|
|
333
|
+
'answer', 'summary', 'introduction', 'conclusion', 'overview',
|
|
334
|
+
'assumptions', 'unknowns', 'evidence', 'notes', 'details',
|
|
335
|
+
'description', 'background', 'analysis', 'findings', 'recommendations',
|
|
336
|
+
'data', 'identity', 'network', 'security', 'monitoring', 'governance',
|
|
337
|
+
'availability', 'backup', 'patch', 'operational', 'provider'
|
|
338
|
+
];
|
|
339
|
+
}
|
|
340
|
+
}
|
package/src/types.ts
CHANGED
|
@@ -1,13 +1,29 @@
|
|
|
1
|
+
export enum BulletMode {
|
|
2
|
+
ARRAY = 'array',
|
|
3
|
+
SECTIONS = 'sections',
|
|
4
|
+
AUTO = 'auto'
|
|
5
|
+
}
|
|
6
|
+
|
|
1
7
|
export interface MarkdownSection {
|
|
2
8
|
heading: string;
|
|
3
|
-
content: any;
|
|
9
|
+
content: any;
|
|
4
10
|
level: number;
|
|
5
|
-
format
|
|
11
|
+
format?: 'heading' | 'bullet' | 'text';
|
|
6
12
|
}
|
|
7
13
|
|
|
8
|
-
export
|
|
14
|
+
export interface ParserOptions {
|
|
15
|
+
bulletMode?: BulletMode;
|
|
16
|
+
debug?: boolean;
|
|
17
|
+
}
|
|
9
18
|
|
|
10
|
-
export interface
|
|
11
|
-
|
|
12
|
-
|
|
19
|
+
export interface BulletModeResult {
|
|
20
|
+
mode: 'array' | 'sections';
|
|
21
|
+
confidence: number;
|
|
22
|
+
reasons: string[];
|
|
13
23
|
}
|
|
24
|
+
|
|
25
|
+
// ADD THIS:
|
|
26
|
+
export interface ParseResult {
|
|
27
|
+
sections?: MarkdownSection[];
|
|
28
|
+
[key: string]: any; // Allow any string keys with any values
|
|
29
|
+
}
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import { markdownToJson } from '../src/index.js';
|
|
2
|
+
import { RemarkParser } from '../src/parser.js';
|
|
3
|
+
import { BulletMode } from '../src/types.js';
|
|
4
|
+
|
|
5
|
+
describe('Bullet Mode Detection', () => {
|
|
6
|
+
|
|
7
|
+
it('should auto-detect simple arrays', () => {
|
|
8
|
+
const md = `
|
|
9
|
+
- Item 1
|
|
10
|
+
- Item 2
|
|
11
|
+
- Item 3
|
|
12
|
+
`;
|
|
13
|
+
const result = markdownToJson(md);
|
|
14
|
+
// Should be an array, or if it has no heading, maybe it's just the root content?
|
|
15
|
+
// In current implementation: "No sections found, put everything in root".
|
|
16
|
+
// Root content for a list in ARRAY mode is string[].
|
|
17
|
+
|
|
18
|
+
// However, JSONTransformer converts sections to an object.
|
|
19
|
+
// If there is only 'Root' section, markdownToJson returns { root: [...] } ?
|
|
20
|
+
// Let's check JSONTransformer logic.
|
|
21
|
+
// keys are camelCase headings. 'Root' -> 'root'.
|
|
22
|
+
|
|
23
|
+
expect(result.root).toEqual(['Item 1', 'Item 2', 'Item 3']);
|
|
24
|
+
});
|
|
25
|
+
|
|
26
|
+
it('should auto-detect sections', () => {
|
|
27
|
+
const md = `
|
|
28
|
+
- Short Answer
|
|
29
|
+
The sky is blue.
|
|
30
|
+
|
|
31
|
+
- Evidence
|
|
32
|
+
1. Look up.
|
|
33
|
+
2. See blue.
|
|
34
|
+
`;
|
|
35
|
+
const result = markdownToJson(md);
|
|
36
|
+
// Should explode into keys
|
|
37
|
+
expect(result.shortAnswer).toBe('The sky is blue.');
|
|
38
|
+
// Evidence has a nested list.
|
|
39
|
+
// In SECTIONS mode, nested list becomes content.
|
|
40
|
+
// processBulletAsSection logic: "Check for nested list... ONLY a nested list -> convert to array"
|
|
41
|
+
// OR "Mixed content -> object { text, items }"
|
|
42
|
+
// Here we have "1. Look up..." which is an ordered list.
|
|
43
|
+
// Our logic handles 'list' nodes inside the bullet item.
|
|
44
|
+
|
|
45
|
+
// Wait, "1. Look up" is an ordered list. detect-bullet-mode uses 'list' type check.
|
|
46
|
+
// processBulletAsSection finds `node.type === 'list'`.
|
|
47
|
+
|
|
48
|
+
expect(result.evidence).toBeDefined();
|
|
49
|
+
expect(Array.isArray(result.evidence)).toBe(true);
|
|
50
|
+
expect(result.evidence[0]).toContain('Look up');
|
|
51
|
+
});
|
|
52
|
+
|
|
53
|
+
it('should respect manual override to ARRAY', () => {
|
|
54
|
+
// A string that LOOKS like sections but we force ARRAY
|
|
55
|
+
const md = `
|
|
56
|
+
- Short Answer
|
|
57
|
+
- Evidence
|
|
58
|
+
`;
|
|
59
|
+
// If auto, might think it's array (no content).
|
|
60
|
+
// Let's make it look like sections:
|
|
61
|
+
const mdSec = `
|
|
62
|
+
- Section A:
|
|
63
|
+
Content A
|
|
64
|
+
- Section B:
|
|
65
|
+
Content B
|
|
66
|
+
`;
|
|
67
|
+
|
|
68
|
+
const parser = new RemarkParser({ bulletMode: BulletMode.ARRAY });
|
|
69
|
+
// We need to bypass JSONTransformer for a direct parser test, or pass parser to transformer
|
|
70
|
+
// JSONTransformer can take a parser in options.
|
|
71
|
+
|
|
72
|
+
// But markdownToJson doesn't expose options. We'll use the class directly.
|
|
73
|
+
const sections = parser.parse(mdSec);
|
|
74
|
+
|
|
75
|
+
// In ARRAY mode, it should be treated as a single list under 'Root' (or wherever it is)
|
|
76
|
+
// and the content should be a flattened string array of the bullets text.
|
|
77
|
+
|
|
78
|
+
expect(sections.length).toBe(1);
|
|
79
|
+
expect(sections[0]?.heading).toBe('Root');
|
|
80
|
+
expect(Array.isArray(sections[0]?.content)).toBe(true);
|
|
81
|
+
expect(sections[0]?.content[0]).toContain('Section A'); // Just the text
|
|
82
|
+
});
|
|
83
|
+
|
|
84
|
+
it('should respect manual override to SECTIONS', () => {
|
|
85
|
+
// A string that LOOKS like array
|
|
86
|
+
const md = `
|
|
87
|
+
- Item 1
|
|
88
|
+
- Item 2
|
|
89
|
+
`;
|
|
90
|
+
const parser = new RemarkParser({ bulletMode: BulletMode.SECTIONS });
|
|
91
|
+
const sections = parser.parse(md);
|
|
92
|
+
|
|
93
|
+
// Should try to explode them.
|
|
94
|
+
// "Item 1" becomes a heading, content empty.
|
|
95
|
+
expect(sections.length).toBe(2);
|
|
96
|
+
expect(sections[0]?.heading).toBe('Item 1');
|
|
97
|
+
expect(sections[1]?.heading).toBe('Item 2');
|
|
98
|
+
});
|
|
99
|
+
});
|
package/test/parser.2.test.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { markdownToJson } from '../src/index.js';
|
|
2
2
|
|
|
3
3
|
describe('nx-json-parser - Complex Input', () => {
|
|
4
|
-
|
|
4
|
+
const str = `- Short answer
|
|
5
5
|
Key considerations include data protection (encryption at rest and in transit), strong access control and IAM, secure network design, configuration and patch management, monitoring and logging, backups and disaster recovery, compliance and governance, and a clear shared responsibility model with incident response planning.
|
|
6
6
|
|
|
7
7
|
- Full answer
|
|
@@ -87,17 +87,17 @@ Deploying a database in the cloud shifts part of the security burden to the clou
|
|
|
87
87
|
11. Data residency considerations: Data localization requirements can influence data storage and transfer controls.
|
|
88
88
|
12. Vendor/service model considerations: Clear understanding of the provider’s and customer’s security responsibilities and controls is essential for effective security posture.`;
|
|
89
89
|
|
|
90
|
-
|
|
91
|
-
|
|
90
|
+
it('should parse the large security overview correctly', () => {
|
|
91
|
+
const result = markdownToJson(str);
|
|
92
92
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
93
|
+
expect(result.shortAnswer).toBeDefined();
|
|
94
|
+
expect(result.fullAnswer).toBeDefined();
|
|
95
|
+
expect(result.dataProtection).toBeDefined();
|
|
96
|
+
expect(result.accessControlAndIdentityManagement).toBeDefined();
|
|
97
|
+
expect(result.assumptions).toBeInstanceOf(Array);
|
|
98
|
+
expect(result.evidence).toBeInstanceOf(Array);
|
|
99
99
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
100
|
+
expect((result.evidence as string[]).length).toBe(12);
|
|
101
|
+
expect(result.shortAnswer).toContain('Key considerations include data protection');
|
|
102
|
+
});
|
|
103
103
|
});
|