@incremark/core 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@incremark/core",
3
- "version": "0.2.1",
3
+ "version": "0.2.3",
4
4
  "description": "增量式 Markdown 解析器核心库",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -23,7 +23,7 @@
23
23
  },
24
24
  "files": [
25
25
  "dist",
26
- "src"
26
+ "src/transformer/styles.css"
27
27
  ],
28
28
  "dependencies": {
29
29
  "@types/mdast": "^4.0.0",
@@ -1,214 +0,0 @@
1
- /**
2
- * 脚注解析测试
3
- *
4
- * 测试增量解析场景下的脚注引用和定义
5
- */
6
-
7
- import { describe, it, expect } from 'vitest'
8
- import { createIncremarkParser } from '../parser/IncremarkParser'
9
-
10
- describe('Footnote Parsing', () => {
11
- describe('Basic Footnote', () => {
12
- it('should parse footnote reference before definition', () => {
13
- const markdown = `这是一个简单的脚注[^1]。
14
-
15
- [^1]: 这是第一个脚注的内容。`
16
-
17
- const parser = createIncremarkParser({ gfm: true })
18
- const result = parser.render(markdown)
19
- const ast = result.ast
20
-
21
- // 检查是否有脚注引用
22
- const paragraph = ast.children[0]
23
- expect(paragraph.type).toBe('paragraph')
24
-
25
- const hasFootnoteRef = paragraph.children?.some(
26
- (node: any) => node.type === 'footnoteReference' && node.identifier === '1'
27
- )
28
- expect(hasFootnoteRef).toBe(true)
29
-
30
- // 检查是否有脚注定义
31
- const hasFootnoteDef = ast.children.some(
32
- (node: any) => node.type === 'footnoteDefinition' && node.identifier === '1'
33
- )
34
- expect(hasFootnoteDef).toBe(true)
35
- })
36
-
37
- it('should parse multiple footnotes', () => {
38
- const markdown = `第一个脚注[^1],第二个脚注[^2]。
39
-
40
- [^1]: 第一个内容。
41
- [^2]: 第二个内容。`
42
-
43
- const parser = createIncremarkParser({ gfm: true })
44
- const result = parser.render(markdown)
45
- const ast = result.ast
46
-
47
- // 检查两个脚注引用
48
- const paragraph = ast.children[0]
49
- const footnoteRefs = paragraph.children?.filter(
50
- (node: any) => node.type === 'footnoteReference'
51
- )
52
- expect(footnoteRefs).toHaveLength(2)
53
- expect(footnoteRefs?.[0].identifier).toBe('1')
54
- expect(footnoteRefs?.[1].identifier).toBe('2')
55
-
56
- // 检查两个脚注定义
57
- const footnoteDefs = ast.children.filter(
58
- (node: any) => node.type === 'footnoteDefinition'
59
- )
60
- expect(footnoteDefs).toHaveLength(2)
61
- expect(footnoteDefs[0].identifier).toBe('1')
62
- expect(footnoteDefs[1].identifier).toBe('2')
63
- })
64
- })
65
-
66
- describe('Multiline Footnote', () => {
67
- it('should parse multiline footnote content', () => {
68
- const markdown = `多行脚注[^long]。
69
-
70
- [^long]: 第一段内容。
71
-
72
- 第二段内容(缩进)。`
73
-
74
- const parser = createIncremarkParser({ gfm: true })
75
- const result = parser.render(markdown)
76
- const ast = result.ast
77
-
78
- // 检查脚注引用
79
- const paragraph = ast.children[0]
80
- const hasFootnoteRef = paragraph.children?.some(
81
- (node: any) => node.type === 'footnoteReference' && node.identifier === 'long'
82
- )
83
- expect(hasFootnoteRef).toBe(true)
84
-
85
- // 检查脚注定义
86
- const footnoteDef = ast.children.find(
87
- (node: any) => node.type === 'footnoteDefinition' && node.identifier === 'long'
88
- ) as any
89
- expect(footnoteDef).toBeDefined()
90
-
91
- // 检查脚注内容是否包含多个段落
92
- expect(footnoteDef.children.length).toBeGreaterThan(1)
93
- })
94
- })
95
-
96
- describe('Incremental Parsing', () => {
97
- it('should handle footnote reference in pending block', () => {
98
- const parser = createIncremarkParser({ gfm: true })
99
-
100
- // 第一次追加:只有引用
101
- const update1 = parser.append('这是一个脚注[^1]。\n\n')
102
-
103
- // 检查 pending blocks 中是否有脚注引用
104
- const pendingParagraph = update1.pending[0]?.node
105
- const hasPendingRef = pendingParagraph?.children?.some(
106
- (node: any) => node.type === 'footnoteReference'
107
- )
108
- expect(hasPendingRef).toBe(true)
109
-
110
- // 第二次追加:添加定义
111
- const update2 = parser.append('[^1]: 脚注内容。')
112
- parser.finalize()
113
-
114
- // 检查最终 AST
115
- const ast = parser.getAst()
116
- const hasFootnoteDef = ast.children.some(
117
- (node: any) => node.type === 'footnoteDefinition' && node.identifier === '1'
118
- )
119
- expect(hasFootnoteDef).toBe(true)
120
- })
121
-
122
- it('should handle definition before reference', () => {
123
- const markdown = `[^1]: 脚注内容。
124
-
125
- 这是一个脚注[^1]。`
126
-
127
- const parser = createIncremarkParser({ gfm: true })
128
- const result = parser.render(markdown)
129
- const ast = result.ast
130
-
131
- // 即使定义在前,引用也应该被正确解析
132
- const paragraph = ast.children.find((node: any) => node.type === 'paragraph')
133
- const hasFootnoteRef = paragraph?.children?.some(
134
- (node: any) => node.type === 'footnoteReference' && node.identifier === '1'
135
- )
136
- expect(hasFootnoteRef).toBe(true)
137
-
138
- const hasFootnoteDef = ast.children.some(
139
- (node: any) => node.type === 'footnoteDefinition' && node.identifier === '1'
140
- )
141
- expect(hasFootnoteDef).toBe(true)
142
- })
143
- })
144
-
145
- describe('Edge Cases', () => {
146
- it('should handle footnote with special characters in identifier', () => {
147
- const markdown = `脚注[^note-1]。
148
-
149
- [^note-1]: 内容。`
150
-
151
- const parser = createIncremarkParser({ gfm: true })
152
- const result = parser.render(markdown)
153
- const ast = result.ast
154
-
155
- const paragraph = ast.children[0]
156
- const hasFootnoteRef = paragraph.children?.some(
157
- (node: any) => node.type === 'footnoteReference' && node.identifier === 'note-1'
158
- )
159
- expect(hasFootnoteRef).toBe(true)
160
- })
161
-
162
- it('should handle footnote with markdown in content', () => {
163
- const markdown = `脚注[^complex]。
164
-
165
- [^complex]: 包含 **粗体** 和 *斜体*。`
166
-
167
- const parser = createIncremarkParser({ gfm: true })
168
- const result = parser.render(markdown)
169
- const ast = result.ast
170
-
171
- const footnoteDef = ast.children.find(
172
- (node: any) => node.type === 'footnoteDefinition' && node.identifier === 'complex'
173
- ) as any
174
- expect(footnoteDef).toBeDefined()
175
-
176
- // 检查脚注内容是否包含格式化文本
177
- const paragraph = footnoteDef.children[0]
178
- const hasStrong = paragraph.children?.some((node: any) => node.type === 'strong')
179
- const hasEmphasis = paragraph.children?.some((node: any) => node.type === 'emphasis')
180
- expect(hasStrong || hasEmphasis).toBe(true)
181
- })
182
-
183
- it('should not parse invalid footnote syntax', () => {
184
- const markdown = `这不是脚注[^ 1]。`
185
-
186
- const parser = createIncremarkParser({ gfm: true })
187
- const result = parser.render(markdown)
188
- const ast = result.ast
189
-
190
- // 空格会导致解析失败,应该被当作普通文本
191
- const paragraph = ast.children[0]
192
- const hasFootnoteRef = paragraph.children?.some(
193
- (node: any) => node.type === 'footnoteReference'
194
- )
195
- expect(hasFootnoteRef).toBe(false)
196
- })
197
- })
198
-
199
- describe('Footnote Reference Order', () => {
200
- it('should track footnote reference order', () => {
201
- const markdown = `第二个[^2]出现在第一个[^1]之前。
202
-
203
- [^1]: 第一个定义。
204
- [^2]: 第二个定义。`
205
-
206
- const parser = createIncremarkParser({ gfm: true })
207
- const result = parser.render(markdown)
208
-
209
- // 检查引用顺序
210
- expect(result.footnoteReferenceOrder).toEqual(['2', '1'])
211
- })
212
- })
213
- })
214
-
@@ -1,443 +0,0 @@
1
- /**
2
- * Incremark vs Traditional Parser Benchmark
3
- *
4
- * 对比增量解析和传统解析(每次重新解析全部内容)的性能差异
5
- */
6
-
7
- import { IncremarkParser } from '../parser/IncremarkParser'
8
- import { fromMarkdown } from 'mdast-util-from-markdown'
9
- import { gfm } from 'micromark-extension-gfm'
10
- import { gfmFromMarkdown } from 'mdast-util-gfm'
11
-
12
- // 短文本测试(~800 字符)
13
- const shortMarkdown = `
14
- # Hello World
15
-
16
- This is a paragraph with **bold** and *italic* text.
17
-
18
- ## Code Example
19
-
20
- \`\`\`javascript
21
- function hello() {
22
- console.log('Hello, World!');
23
- return {
24
- name: 'test',
25
- value: 42
26
- };
27
- }
28
- \`\`\`
29
-
30
- ## List Example
31
-
32
- - Item 1
33
- - Item 2
34
- - Nested item 2.1
35
- - Nested item 2.2
36
- - Item 3
37
-
38
- ## Table Example
39
-
40
- | Name | Age | City |
41
- |------|-----|------|
42
- | Alice | 25 | NYC |
43
- | Bob | 30 | LA |
44
-
45
- ## Blockquote
46
-
47
- > This is a quote
48
- > with multiple lines
49
- > and **formatted** text
50
-
51
- ## More Content
52
-
53
- Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
54
-
55
- ### Subsection
56
-
57
- More text here with [links](https://example.com) and \`inline code\`.
58
-
59
- 1. Ordered item 1
60
- 2. Ordered item 2
61
- 3. Ordered item 3
62
-
63
- ---
64
-
65
- The end.
66
- `
67
-
68
- // 生成长文本(模拟真实 AI 输出)
69
- function generateLongMarkdown(targetLength: number): string {
70
- const sections = [
71
- `
72
- # Introduction to Machine Learning
73
-
74
- Machine learning is a subset of artificial intelligence (AI) that provides systems the ability to automatically learn and improve from experience without being explicitly programmed.
75
-
76
- ## Key Concepts
77
-
78
- ### Supervised Learning
79
-
80
- In supervised learning, the algorithm learns from labeled training data, and makes predictions based on that data. Common algorithms include:
81
-
82
- - **Linear Regression** - For predicting continuous values
83
- - **Logistic Regression** - For classification problems
84
- - **Decision Trees** - For both classification and regression
85
- - **Random Forest** - Ensemble method using multiple decision trees
86
- - **Support Vector Machines** - For classification with clear margins
87
-
88
- \`\`\`python
89
- from sklearn.model_selection import train_test_split
90
- from sklearn.ensemble import RandomForestClassifier
91
-
92
- # Split the data
93
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
94
-
95
- # Train the model
96
- model = RandomForestClassifier(n_estimators=100)
97
- model.fit(X_train, y_train)
98
-
99
- # Make predictions
100
- predictions = model.predict(X_test)
101
- \`\`\`
102
-
103
- ### Unsupervised Learning
104
-
105
- Unsupervised learning deals with unlabeled data. The algorithm tries to find patterns and relationships in the data.
106
-
107
- | Algorithm | Use Case | Complexity |
108
- |-----------|----------|------------|
109
- | K-Means | Clustering | O(n*k*i) |
110
- | DBSCAN | Density clustering | O(n log n) |
111
- | PCA | Dimensionality reduction | O(n*d²) |
112
- | t-SNE | Visualization | O(n²) |
113
-
114
- > "The goal of unsupervised learning is to discover hidden patterns or data groupings without the need for human intervention." - Andrew Ng
115
-
116
- `,
117
- `
118
- ## Deep Learning
119
-
120
- Deep learning is a subset of machine learning that uses neural networks with many layers.
121
-
122
- ### Neural Network Architecture
123
-
124
- \`\`\`
125
- Input Layer → Hidden Layer 1 → Hidden Layer 2 → ... → Output Layer
126
- ↓ ↓ ↓ ↓
127
- Features Activations Activations Predictions
128
- \`\`\`
129
-
130
- ### Common Activation Functions
131
-
132
- 1. **ReLU (Rectified Linear Unit)**
133
- - Formula: \`f(x) = max(0, x)\`
134
- - Most commonly used in hidden layers
135
-
136
- 2. **Sigmoid**
137
- - Formula: \`f(x) = 1 / (1 + e^(-x))\`
138
- - Used for binary classification
139
-
140
- 3. **Softmax**
141
- - Used for multi-class classification
142
- - Outputs probability distribution
143
-
144
- \`\`\`python
145
- import torch
146
- import torch.nn as nn
147
-
148
- class NeuralNetwork(nn.Module):
149
- def __init__(self, input_size, hidden_size, num_classes):
150
- super(NeuralNetwork, self).__init__()
151
- self.layer1 = nn.Linear(input_size, hidden_size)
152
- self.relu = nn.ReLU()
153
- self.layer2 = nn.Linear(hidden_size, num_classes)
154
-
155
- def forward(self, x):
156
- out = self.layer1(x)
157
- out = self.relu(out)
158
- out = self.layer2(out)
159
- return out
160
- \`\`\`
161
-
162
- `,
163
- `
164
- ## Natural Language Processing
165
-
166
- NLP is a field of AI that focuses on the interaction between computers and humans through natural language.
167
-
168
- ### Key Tasks
169
-
170
- - **Text Classification** - Categorizing text into predefined categories
171
- - **Named Entity Recognition** - Identifying entities like names, locations, organizations
172
- - **Sentiment Analysis** - Determining the emotional tone of text
173
- - **Machine Translation** - Translating text from one language to another
174
- - **Question Answering** - Answering questions based on context
175
-
176
- ### Transformer Architecture
177
-
178
- The transformer architecture revolutionized NLP with the introduction of self-attention mechanisms.
179
-
180
- \`\`\`
181
- ┌─────────────────────────────────────┐
182
- │ Transformer │
183
- ├─────────────────────────────────────┤
184
- │ ┌─────────────┐ ┌─────────────┐ │
185
- │ │ Encoder │ │ Decoder │ │
186
- │ │ │ │ │ │
187
- │ │ Self-Attn │ │ Self-Attn │ │
188
- │ │ Feed-Forward│ │ Cross-Attn │ │
189
- │ │ │ │ Feed-Forward│ │
190
- │ └─────────────┘ └─────────────┘ │
191
- └─────────────────────────────────────┘
192
- \`\`\`
193
-
194
- > Transformers have become the foundation for large language models like GPT, BERT, and Claude.
195
-
196
- `,
197
- `
198
- ## Best Practices
199
-
200
- ### Data Preprocessing
201
-
202
- 1. Handle missing values appropriately
203
- 2. Normalize or standardize numerical features
204
- 3. Encode categorical variables
205
- 4. Split data into train/validation/test sets
206
- 5. Apply data augmentation when appropriate
207
-
208
- ### Model Evaluation
209
-
210
- | Metric | Formula | Use Case |
211
- |--------|---------|----------|
212
- | Accuracy | (TP+TN)/(TP+TN+FP+FN) | Balanced classes |
213
- | Precision | TP/(TP+FP) | When FP is costly |
214
- | Recall | TP/(TP+FN) | When FN is costly |
215
- | F1 Score | 2*(P*R)/(P+R) | Imbalanced classes |
216
- | AUC-ROC | Area under ROC curve | Binary classification |
217
-
218
- ### Hyperparameter Tuning
219
-
220
- \`\`\`python
221
- from sklearn.model_selection import GridSearchCV
222
-
223
- param_grid = {
224
- 'n_estimators': [100, 200, 300],
225
- 'max_depth': [10, 20, 30, None],
226
- 'min_samples_split': [2, 5, 10],
227
- 'min_samples_leaf': [1, 2, 4]
228
- }
229
-
230
- grid_search = GridSearchCV(
231
- estimator=RandomForestClassifier(),
232
- param_grid=param_grid,
233
- cv=5,
234
- n_jobs=-1,
235
- verbose=2
236
- )
237
-
238
- grid_search.fit(X_train, y_train)
239
- print(f"Best parameters: {grid_search.best_params_}")
240
- \`\`\`
241
-
242
- ---
243
-
244
- This concludes our overview of machine learning fundamentals.
245
-
246
- `
247
- ]
248
-
249
- let result = ''
250
- let sectionIndex = 0
251
-
252
- while (result.length < targetLength) {
253
- result += sections[sectionIndex % sections.length]
254
- sectionIndex++
255
- }
256
-
257
- return result.slice(0, targetLength)
258
- }
259
-
260
- // 默认测试用的 Markdown 内容
261
- const testMarkdown = shortMarkdown
262
-
263
- interface BenchmarkResult {
264
- name: string
265
- totalTime: number
266
- parseCount: number
267
- avgTimePerParse: number
268
- totalCharsParsed: number
269
- }
270
-
271
- /**
272
- * 模拟流式输入,将文本按 chunk 大小分割
273
- */
274
- function simulateStream(text: string, chunkSize: number): string[] {
275
- const chunks: string[] = []
276
- for (let i = 0; i < text.length; i += chunkSize) {
277
- chunks.push(text.slice(i, i + chunkSize))
278
- }
279
- return chunks
280
- }
281
-
282
- /**
283
- * 传统方式:每次收到新内容都重新解析全部文本
284
- */
285
- function benchmarkTraditional(chunks: string[], iterations: number): BenchmarkResult {
286
- let totalTime = 0
287
- let totalCharsParsed = 0
288
- let parseCount = 0
289
-
290
- for (let iter = 0; iter < iterations; iter++) {
291
- let buffer = ''
292
-
293
- for (const chunk of chunks) {
294
- buffer += chunk
295
-
296
- const start = performance.now()
297
- fromMarkdown(buffer, {
298
- extensions: [gfm()],
299
- mdastExtensions: [gfmFromMarkdown()]
300
- })
301
- const end = performance.now()
302
-
303
- totalTime += (end - start)
304
- totalCharsParsed += buffer.length
305
- parseCount++
306
- }
307
- }
308
-
309
- return {
310
- name: 'Traditional (re-parse all)',
311
- totalTime,
312
- parseCount,
313
- avgTimePerParse: totalTime / parseCount,
314
- totalCharsParsed
315
- }
316
- }
317
-
318
- /**
319
- * Incremark 方式:增量解析
320
- */
321
- function benchmarkIncremental(chunks: string[], iterations: number): BenchmarkResult {
322
- let totalTime = 0
323
- let totalCharsParsed = 0
324
- let parseCount = 0
325
-
326
- for (let iter = 0; iter < iterations; iter++) {
327
- const parser = new IncremarkParser({ gfm: true })
328
-
329
- for (const chunk of chunks) {
330
- const start = performance.now()
331
- parser.append(chunk)
332
- const end = performance.now()
333
-
334
- totalTime += (end - start)
335
- totalCharsParsed += chunk.length
336
- parseCount++
337
- }
338
-
339
- const start = performance.now()
340
- parser.finalize()
341
- const end = performance.now()
342
- totalTime += (end - start)
343
- }
344
-
345
- return {
346
- name: 'Incremark (incremental)',
347
- totalTime,
348
- parseCount,
349
- avgTimePerParse: totalTime / parseCount,
350
- totalCharsParsed
351
- }
352
- }
353
-
354
- /**
355
- * 运行 benchmark
356
- */
357
- export function runBenchmark(options: {
358
- chunkSize?: number
359
- iterations?: number
360
- markdown?: string
361
- markdownLength?: number
362
- } = {}) {
363
- const {
364
- chunkSize = 10,
365
- iterations = 100,
366
- markdownLength
367
- } = options
368
-
369
- // 如果指定了长度,生成对应长度的 Markdown
370
- const markdown = markdownLength
371
- ? generateLongMarkdown(markdownLength)
372
- : (options.markdown || testMarkdown)
373
-
374
- const chunks = simulateStream(markdown, chunkSize)
375
-
376
- console.log('='.repeat(60))
377
- console.log('Incremark Benchmark')
378
- console.log('='.repeat(60))
379
- console.log(`Markdown length: ${markdown.length} chars`)
380
- console.log(`Chunk size: ${chunkSize} chars`)
381
- console.log(`Total chunks: ${chunks.length}`)
382
- console.log(`Iterations: ${iterations}`)
383
- console.log('='.repeat(60))
384
- console.log('')
385
-
386
- // 预热
387
- console.log('Warming up...')
388
- benchmarkTraditional(chunks, 5)
389
- benchmarkIncremental(chunks, 5)
390
- console.log('')
391
-
392
- // 正式测试
393
- console.log('Running benchmark...')
394
- console.log('')
395
-
396
- const traditional = benchmarkTraditional(chunks, iterations)
397
- const incremental = benchmarkIncremental(chunks, iterations)
398
-
399
- // 计算节省百分比
400
- const timeSaved = ((traditional.totalTime - incremental.totalTime) / traditional.totalTime * 100).toFixed(1)
401
- const charsSaved = ((traditional.totalCharsParsed - incremental.totalCharsParsed) / traditional.totalCharsParsed * 100).toFixed(1)
402
-
403
- console.log('Results:')
404
- console.log('-'.repeat(60))
405
- console.log('')
406
-
407
- console.log(`📊 ${traditional.name}`)
408
- console.log(` Total time: ${traditional.totalTime.toFixed(2)} ms`)
409
- console.log(` Parse count: ${traditional.parseCount}`)
410
- console.log(` Avg time per parse: ${traditional.avgTimePerParse.toFixed(4)} ms`)
411
- console.log(` Total chars parsed: ${traditional.totalCharsParsed.toLocaleString()}`)
412
- console.log('')
413
-
414
- console.log(`⚡ ${incremental.name}`)
415
- console.log(` Total time: ${incremental.totalTime.toFixed(2)} ms`)
416
- console.log(` Parse count: ${incremental.parseCount}`)
417
- console.log(` Avg time per parse: ${incremental.avgTimePerParse.toFixed(4)} ms`)
418
- console.log(` Total chars parsed: ${incremental.totalCharsParsed.toLocaleString()}`)
419
- console.log('')
420
-
421
- console.log('-'.repeat(60))
422
- console.log('')
423
- console.log(`🎯 Performance Improvement:`)
424
- console.log(` Time saved: ${timeSaved}%`)
425
- console.log(` Chars parsing saved: ${charsSaved}%`)
426
- console.log(` Speedup: ${(traditional.totalTime / incremental.totalTime).toFixed(2)}x faster`)
427
- console.log('')
428
- console.log('='.repeat(60))
429
-
430
- return {
431
- traditional,
432
- incremental,
433
- timeSaved: parseFloat(timeSaved),
434
- charsSaved: parseFloat(charsSaved),
435
- speedup: traditional.totalTime / incremental.totalTime
436
- }
437
- }
438
-
439
- // 如果直接运行此文件
440
- if (typeof process !== 'undefined' && process.argv[1]?.includes('benchmark')) {
441
- runBenchmark()
442
- }
443
-