@incremark/core 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/__tests__/footnote.test.ts +0 -214
- package/src/benchmark/index.ts +0 -443
- package/src/benchmark/run.ts +0 -93
- package/src/detector/index.test.ts +0 -150
- package/src/detector/index.ts +0 -330
- package/src/extensions/html-extension/index.test.ts +0 -409
- package/src/extensions/html-extension/index.ts +0 -792
- package/src/extensions/micromark-gfm-footnote-incremental.ts +0 -275
- package/src/extensions/micromark-reference-extension.ts +0 -724
- package/src/index.ts +0 -128
- package/src/parser/IncremarkParser.comprehensive.test.ts +0 -418
- package/src/parser/IncremarkParser.footnote.test.ts +0 -334
- package/src/parser/IncremarkParser.robustness.test.ts +0 -428
- package/src/parser/IncremarkParser.test.ts +0 -110
- package/src/parser/IncremarkParser.ts +0 -839
- package/src/parser/index.ts +0 -2
- package/src/transformer/BlockTransformer.ts +0 -640
- package/src/transformer/index.ts +0 -36
- package/src/transformer/plugins.ts +0 -113
- package/src/transformer/types.ts +0 -115
- package/src/transformer/utils.ts +0 -364
- package/src/types/index.ts +0 -183
- package/src/utils/index.ts +0 -53
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@incremark/core",
|
|
3
|
-
"version": "0.2.1",
|
|
3
|
+
"version": "0.2.3",
|
|
4
4
|
"description": "增量式 Markdown 解析器核心库",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -23,7 +23,7 @@
|
|
|
23
23
|
},
|
|
24
24
|
"files": [
|
|
25
25
|
"dist",
|
|
26
|
-
"src"
|
|
26
|
+
"src/transformer/styles.css"
|
|
27
27
|
],
|
|
28
28
|
"dependencies": {
|
|
29
29
|
"@types/mdast": "^4.0.0",
|
|
package/src/__tests__/footnote.test.ts
DELETED
|
@@ -1,214 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* 脚注解析测试
|
|
3
|
-
*
|
|
4
|
-
* 测试增量解析场景下的脚注引用和定义
|
|
5
|
-
*/
|
|
6
|
-
|
|
7
|
-
import { describe, it, expect } from 'vitest'
|
|
8
|
-
import { createIncremarkParser } from '../parser/IncremarkParser'
|
|
9
|
-
|
|
10
|
-
describe('Footnote Parsing', () => {
|
|
11
|
-
describe('Basic Footnote', () => {
|
|
12
|
-
it('should parse footnote reference before definition', () => {
|
|
13
|
-
const markdown = `这是一个简单的脚注[^1]。
|
|
14
|
-
|
|
15
|
-
[^1]: 这是第一个脚注的内容。`
|
|
16
|
-
|
|
17
|
-
const parser = createIncremarkParser({ gfm: true })
|
|
18
|
-
const result = parser.render(markdown)
|
|
19
|
-
const ast = result.ast
|
|
20
|
-
|
|
21
|
-
// 检查是否有脚注引用
|
|
22
|
-
const paragraph = ast.children[0]
|
|
23
|
-
expect(paragraph.type).toBe('paragraph')
|
|
24
|
-
|
|
25
|
-
const hasFootnoteRef = paragraph.children?.some(
|
|
26
|
-
(node: any) => node.type === 'footnoteReference' && node.identifier === '1'
|
|
27
|
-
)
|
|
28
|
-
expect(hasFootnoteRef).toBe(true)
|
|
29
|
-
|
|
30
|
-
// 检查是否有脚注定义
|
|
31
|
-
const hasFootnoteDef = ast.children.some(
|
|
32
|
-
(node: any) => node.type === 'footnoteDefinition' && node.identifier === '1'
|
|
33
|
-
)
|
|
34
|
-
expect(hasFootnoteDef).toBe(true)
|
|
35
|
-
})
|
|
36
|
-
|
|
37
|
-
it('should parse multiple footnotes', () => {
|
|
38
|
-
const markdown = `第一个脚注[^1],第二个脚注[^2]。
|
|
39
|
-
|
|
40
|
-
[^1]: 第一个内容。
|
|
41
|
-
[^2]: 第二个内容。`
|
|
42
|
-
|
|
43
|
-
const parser = createIncremarkParser({ gfm: true })
|
|
44
|
-
const result = parser.render(markdown)
|
|
45
|
-
const ast = result.ast
|
|
46
|
-
|
|
47
|
-
// 检查两个脚注引用
|
|
48
|
-
const paragraph = ast.children[0]
|
|
49
|
-
const footnoteRefs = paragraph.children?.filter(
|
|
50
|
-
(node: any) => node.type === 'footnoteReference'
|
|
51
|
-
)
|
|
52
|
-
expect(footnoteRefs).toHaveLength(2)
|
|
53
|
-
expect(footnoteRefs?.[0].identifier).toBe('1')
|
|
54
|
-
expect(footnoteRefs?.[1].identifier).toBe('2')
|
|
55
|
-
|
|
56
|
-
// 检查两个脚注定义
|
|
57
|
-
const footnoteDefs = ast.children.filter(
|
|
58
|
-
(node: any) => node.type === 'footnoteDefinition'
|
|
59
|
-
)
|
|
60
|
-
expect(footnoteDefs).toHaveLength(2)
|
|
61
|
-
expect(footnoteDefs[0].identifier).toBe('1')
|
|
62
|
-
expect(footnoteDefs[1].identifier).toBe('2')
|
|
63
|
-
})
|
|
64
|
-
})
|
|
65
|
-
|
|
66
|
-
describe('Multiline Footnote', () => {
|
|
67
|
-
it('should parse multiline footnote content', () => {
|
|
68
|
-
const markdown = `多行脚注[^long]。
|
|
69
|
-
|
|
70
|
-
[^long]: 第一段内容。
|
|
71
|
-
|
|
72
|
-
第二段内容(缩进)。`
|
|
73
|
-
|
|
74
|
-
const parser = createIncremarkParser({ gfm: true })
|
|
75
|
-
const result = parser.render(markdown)
|
|
76
|
-
const ast = result.ast
|
|
77
|
-
|
|
78
|
-
// 检查脚注引用
|
|
79
|
-
const paragraph = ast.children[0]
|
|
80
|
-
const hasFootnoteRef = paragraph.children?.some(
|
|
81
|
-
(node: any) => node.type === 'footnoteReference' && node.identifier === 'long'
|
|
82
|
-
)
|
|
83
|
-
expect(hasFootnoteRef).toBe(true)
|
|
84
|
-
|
|
85
|
-
// 检查脚注定义
|
|
86
|
-
const footnoteDef = ast.children.find(
|
|
87
|
-
(node: any) => node.type === 'footnoteDefinition' && node.identifier === 'long'
|
|
88
|
-
) as any
|
|
89
|
-
expect(footnoteDef).toBeDefined()
|
|
90
|
-
|
|
91
|
-
// 检查脚注内容是否包含多个段落
|
|
92
|
-
expect(footnoteDef.children.length).toBeGreaterThan(1)
|
|
93
|
-
})
|
|
94
|
-
})
|
|
95
|
-
|
|
96
|
-
describe('Incremental Parsing', () => {
|
|
97
|
-
it('should handle footnote reference in pending block', () => {
|
|
98
|
-
const parser = createIncremarkParser({ gfm: true })
|
|
99
|
-
|
|
100
|
-
// 第一次追加:只有引用
|
|
101
|
-
const update1 = parser.append('这是一个脚注[^1]。\n\n')
|
|
102
|
-
|
|
103
|
-
// 检查 pending blocks 中是否有脚注引用
|
|
104
|
-
const pendingParagraph = update1.pending[0]?.node
|
|
105
|
-
const hasPendingRef = pendingParagraph?.children?.some(
|
|
106
|
-
(node: any) => node.type === 'footnoteReference'
|
|
107
|
-
)
|
|
108
|
-
expect(hasPendingRef).toBe(true)
|
|
109
|
-
|
|
110
|
-
// 第二次追加:添加定义
|
|
111
|
-
const update2 = parser.append('[^1]: 脚注内容。')
|
|
112
|
-
parser.finalize()
|
|
113
|
-
|
|
114
|
-
// 检查最终 AST
|
|
115
|
-
const ast = parser.getAst()
|
|
116
|
-
const hasFootnoteDef = ast.children.some(
|
|
117
|
-
(node: any) => node.type === 'footnoteDefinition' && node.identifier === '1'
|
|
118
|
-
)
|
|
119
|
-
expect(hasFootnoteDef).toBe(true)
|
|
120
|
-
})
|
|
121
|
-
|
|
122
|
-
it('should handle definition before reference', () => {
|
|
123
|
-
const markdown = `[^1]: 脚注内容。
|
|
124
|
-
|
|
125
|
-
这是一个脚注[^1]。`
|
|
126
|
-
|
|
127
|
-
const parser = createIncremarkParser({ gfm: true })
|
|
128
|
-
const result = parser.render(markdown)
|
|
129
|
-
const ast = result.ast
|
|
130
|
-
|
|
131
|
-
// 即使定义在前,引用也应该被正确解析
|
|
132
|
-
const paragraph = ast.children.find((node: any) => node.type === 'paragraph')
|
|
133
|
-
const hasFootnoteRef = paragraph?.children?.some(
|
|
134
|
-
(node: any) => node.type === 'footnoteReference' && node.identifier === '1'
|
|
135
|
-
)
|
|
136
|
-
expect(hasFootnoteRef).toBe(true)
|
|
137
|
-
|
|
138
|
-
const hasFootnoteDef = ast.children.some(
|
|
139
|
-
(node: any) => node.type === 'footnoteDefinition' && node.identifier === '1'
|
|
140
|
-
)
|
|
141
|
-
expect(hasFootnoteDef).toBe(true)
|
|
142
|
-
})
|
|
143
|
-
})
|
|
144
|
-
|
|
145
|
-
describe('Edge Cases', () => {
|
|
146
|
-
it('should handle footnote with special characters in identifier', () => {
|
|
147
|
-
const markdown = `脚注[^note-1]。
|
|
148
|
-
|
|
149
|
-
[^note-1]: 内容。`
|
|
150
|
-
|
|
151
|
-
const parser = createIncremarkParser({ gfm: true })
|
|
152
|
-
const result = parser.render(markdown)
|
|
153
|
-
const ast = result.ast
|
|
154
|
-
|
|
155
|
-
const paragraph = ast.children[0]
|
|
156
|
-
const hasFootnoteRef = paragraph.children?.some(
|
|
157
|
-
(node: any) => node.type === 'footnoteReference' && node.identifier === 'note-1'
|
|
158
|
-
)
|
|
159
|
-
expect(hasFootnoteRef).toBe(true)
|
|
160
|
-
})
|
|
161
|
-
|
|
162
|
-
it('should handle footnote with markdown in content', () => {
|
|
163
|
-
const markdown = `脚注[^complex]。
|
|
164
|
-
|
|
165
|
-
[^complex]: 包含 **粗体** 和 *斜体*。`
|
|
166
|
-
|
|
167
|
-
const parser = createIncremarkParser({ gfm: true })
|
|
168
|
-
const result = parser.render(markdown)
|
|
169
|
-
const ast = result.ast
|
|
170
|
-
|
|
171
|
-
const footnoteDef = ast.children.find(
|
|
172
|
-
(node: any) => node.type === 'footnoteDefinition' && node.identifier === 'complex'
|
|
173
|
-
) as any
|
|
174
|
-
expect(footnoteDef).toBeDefined()
|
|
175
|
-
|
|
176
|
-
// 检查脚注内容是否包含格式化文本
|
|
177
|
-
const paragraph = footnoteDef.children[0]
|
|
178
|
-
const hasStrong = paragraph.children?.some((node: any) => node.type === 'strong')
|
|
179
|
-
const hasEmphasis = paragraph.children?.some((node: any) => node.type === 'emphasis')
|
|
180
|
-
expect(hasStrong || hasEmphasis).toBe(true)
|
|
181
|
-
})
|
|
182
|
-
|
|
183
|
-
it('should not parse invalid footnote syntax', () => {
|
|
184
|
-
const markdown = `这不是脚注[^ 1]。`
|
|
185
|
-
|
|
186
|
-
const parser = createIncremarkParser({ gfm: true })
|
|
187
|
-
const result = parser.render(markdown)
|
|
188
|
-
const ast = result.ast
|
|
189
|
-
|
|
190
|
-
// 空格会导致解析失败,应该被当作普通文本
|
|
191
|
-
const paragraph = ast.children[0]
|
|
192
|
-
const hasFootnoteRef = paragraph.children?.some(
|
|
193
|
-
(node: any) => node.type === 'footnoteReference'
|
|
194
|
-
)
|
|
195
|
-
expect(hasFootnoteRef).toBe(false)
|
|
196
|
-
})
|
|
197
|
-
})
|
|
198
|
-
|
|
199
|
-
describe('Footnote Reference Order', () => {
|
|
200
|
-
it('should track footnote reference order', () => {
|
|
201
|
-
const markdown = `第二个[^2]出现在第一个[^1]之前。
|
|
202
|
-
|
|
203
|
-
[^1]: 第一个定义。
|
|
204
|
-
[^2]: 第二个定义。`
|
|
205
|
-
|
|
206
|
-
const parser = createIncremarkParser({ gfm: true })
|
|
207
|
-
const result = parser.render(markdown)
|
|
208
|
-
|
|
209
|
-
// 检查引用顺序
|
|
210
|
-
expect(result.footnoteReferenceOrder).toEqual(['2', '1'])
|
|
211
|
-
})
|
|
212
|
-
})
|
|
213
|
-
})
|
|
214
|
-
|
package/src/benchmark/index.ts
DELETED
|
@@ -1,443 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Incremark vs Traditional Parser Benchmark
|
|
3
|
-
*
|
|
4
|
-
* 对比增量解析和传统解析(每次重新解析全部内容)的性能差异
|
|
5
|
-
*/
|
|
6
|
-
|
|
7
|
-
import { IncremarkParser } from '../parser/IncremarkParser'
|
|
8
|
-
import { fromMarkdown } from 'mdast-util-from-markdown'
|
|
9
|
-
import { gfm } from 'micromark-extension-gfm'
|
|
10
|
-
import { gfmFromMarkdown } from 'mdast-util-gfm'
|
|
11
|
-
|
|
12
|
-
// 短文本测试(~800 字符)
|
|
13
|
-
const shortMarkdown = `
|
|
14
|
-
# Hello World
|
|
15
|
-
|
|
16
|
-
This is a paragraph with **bold** and *italic* text.
|
|
17
|
-
|
|
18
|
-
## Code Example
|
|
19
|
-
|
|
20
|
-
\`\`\`javascript
|
|
21
|
-
function hello() {
|
|
22
|
-
console.log('Hello, World!');
|
|
23
|
-
return {
|
|
24
|
-
name: 'test',
|
|
25
|
-
value: 42
|
|
26
|
-
};
|
|
27
|
-
}
|
|
28
|
-
\`\`\`
|
|
29
|
-
|
|
30
|
-
## List Example
|
|
31
|
-
|
|
32
|
-
- Item 1
|
|
33
|
-
- Item 2
|
|
34
|
-
- Nested item 2.1
|
|
35
|
-
- Nested item 2.2
|
|
36
|
-
- Item 3
|
|
37
|
-
|
|
38
|
-
## Table Example
|
|
39
|
-
|
|
40
|
-
| Name | Age | City |
|
|
41
|
-
|------|-----|------|
|
|
42
|
-
| Alice | 25 | NYC |
|
|
43
|
-
| Bob | 30 | LA |
|
|
44
|
-
|
|
45
|
-
## Blockquote
|
|
46
|
-
|
|
47
|
-
> This is a quote
|
|
48
|
-
> with multiple lines
|
|
49
|
-
> and **formatted** text
|
|
50
|
-
|
|
51
|
-
## More Content
|
|
52
|
-
|
|
53
|
-
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
|
|
54
|
-
|
|
55
|
-
### Subsection
|
|
56
|
-
|
|
57
|
-
More text here with [links](https://example.com) and \`inline code\`.
|
|
58
|
-
|
|
59
|
-
1. Ordered item 1
|
|
60
|
-
2. Ordered item 2
|
|
61
|
-
3. Ordered item 3
|
|
62
|
-
|
|
63
|
-
---
|
|
64
|
-
|
|
65
|
-
The end.
|
|
66
|
-
`
|
|
67
|
-
|
|
68
|
-
// 生成长文本(模拟真实 AI 输出)
|
|
69
|
-
function generateLongMarkdown(targetLength: number): string {
|
|
70
|
-
const sections = [
|
|
71
|
-
`
|
|
72
|
-
# Introduction to Machine Learning
|
|
73
|
-
|
|
74
|
-
Machine learning is a subset of artificial intelligence (AI) that provides systems the ability to automatically learn and improve from experience without being explicitly programmed.
|
|
75
|
-
|
|
76
|
-
## Key Concepts
|
|
77
|
-
|
|
78
|
-
### Supervised Learning
|
|
79
|
-
|
|
80
|
-
In supervised learning, the algorithm learns from labeled training data, and makes predictions based on that data. Common algorithms include:
|
|
81
|
-
|
|
82
|
-
- **Linear Regression** - For predicting continuous values
|
|
83
|
-
- **Logistic Regression** - For classification problems
|
|
84
|
-
- **Decision Trees** - For both classification and regression
|
|
85
|
-
- **Random Forest** - Ensemble method using multiple decision trees
|
|
86
|
-
- **Support Vector Machines** - For classification with clear margins
|
|
87
|
-
|
|
88
|
-
\`\`\`python
|
|
89
|
-
from sklearn.model_selection import train_test_split
|
|
90
|
-
from sklearn.ensemble import RandomForestClassifier
|
|
91
|
-
|
|
92
|
-
# Split the data
|
|
93
|
-
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
|
|
94
|
-
|
|
95
|
-
# Train the model
|
|
96
|
-
model = RandomForestClassifier(n_estimators=100)
|
|
97
|
-
model.fit(X_train, y_train)
|
|
98
|
-
|
|
99
|
-
# Make predictions
|
|
100
|
-
predictions = model.predict(X_test)
|
|
101
|
-
\`\`\`
|
|
102
|
-
|
|
103
|
-
### Unsupervised Learning
|
|
104
|
-
|
|
105
|
-
Unsupervised learning deals with unlabeled data. The algorithm tries to find patterns and relationships in the data.
|
|
106
|
-
|
|
107
|
-
| Algorithm | Use Case | Complexity |
|
|
108
|
-
|-----------|----------|------------|
|
|
109
|
-
| K-Means | Clustering | O(n*k*i) |
|
|
110
|
-
| DBSCAN | Density clustering | O(n log n) |
|
|
111
|
-
| PCA | Dimensionality reduction | O(n*d²) |
|
|
112
|
-
| t-SNE | Visualization | O(n²) |
|
|
113
|
-
|
|
114
|
-
> "The goal of unsupervised learning is to discover hidden patterns or data groupings without the need for human intervention." - Andrew Ng
|
|
115
|
-
|
|
116
|
-
`,
|
|
117
|
-
`
|
|
118
|
-
## Deep Learning
|
|
119
|
-
|
|
120
|
-
Deep learning is a subset of machine learning that uses neural networks with many layers.
|
|
121
|
-
|
|
122
|
-
### Neural Network Architecture
|
|
123
|
-
|
|
124
|
-
\`\`\`
|
|
125
|
-
Input Layer → Hidden Layer 1 → Hidden Layer 2 → ... → Output Layer
|
|
126
|
-
↓ ↓ ↓ ↓
|
|
127
|
-
Features Activations Activations Predictions
|
|
128
|
-
\`\`\`
|
|
129
|
-
|
|
130
|
-
### Common Activation Functions
|
|
131
|
-
|
|
132
|
-
1. **ReLU (Rectified Linear Unit)**
|
|
133
|
-
- Formula: \`f(x) = max(0, x)\`
|
|
134
|
-
- Most commonly used in hidden layers
|
|
135
|
-
|
|
136
|
-
2. **Sigmoid**
|
|
137
|
-
- Formula: \`f(x) = 1 / (1 + e^(-x))\`
|
|
138
|
-
- Used for binary classification
|
|
139
|
-
|
|
140
|
-
3. **Softmax**
|
|
141
|
-
- Used for multi-class classification
|
|
142
|
-
- Outputs probability distribution
|
|
143
|
-
|
|
144
|
-
\`\`\`python
|
|
145
|
-
import torch
|
|
146
|
-
import torch.nn as nn
|
|
147
|
-
|
|
148
|
-
class NeuralNetwork(nn.Module):
|
|
149
|
-
def __init__(self, input_size, hidden_size, num_classes):
|
|
150
|
-
super(NeuralNetwork, self).__init__()
|
|
151
|
-
self.layer1 = nn.Linear(input_size, hidden_size)
|
|
152
|
-
self.relu = nn.ReLU()
|
|
153
|
-
self.layer2 = nn.Linear(hidden_size, num_classes)
|
|
154
|
-
|
|
155
|
-
def forward(self, x):
|
|
156
|
-
out = self.layer1(x)
|
|
157
|
-
out = self.relu(out)
|
|
158
|
-
out = self.layer2(out)
|
|
159
|
-
return out
|
|
160
|
-
\`\`\`
|
|
161
|
-
|
|
162
|
-
`,
|
|
163
|
-
`
|
|
164
|
-
## Natural Language Processing
|
|
165
|
-
|
|
166
|
-
NLP is a field of AI that focuses on the interaction between computers and humans through natural language.
|
|
167
|
-
|
|
168
|
-
### Key Tasks
|
|
169
|
-
|
|
170
|
-
- **Text Classification** - Categorizing text into predefined categories
|
|
171
|
-
- **Named Entity Recognition** - Identifying entities like names, locations, organizations
|
|
172
|
-
- **Sentiment Analysis** - Determining the emotional tone of text
|
|
173
|
-
- **Machine Translation** - Translating text from one language to another
|
|
174
|
-
- **Question Answering** - Answering questions based on context
|
|
175
|
-
|
|
176
|
-
### Transformer Architecture
|
|
177
|
-
|
|
178
|
-
The transformer architecture revolutionized NLP with the introduction of self-attention mechanisms.
|
|
179
|
-
|
|
180
|
-
\`\`\`
|
|
181
|
-
┌─────────────────────────────────────┐
|
|
182
|
-
│ Transformer │
|
|
183
|
-
├─────────────────────────────────────┤
|
|
184
|
-
│ ┌─────────────┐ ┌─────────────┐ │
|
|
185
|
-
│ │ Encoder │ │ Decoder │ │
|
|
186
|
-
│ │ │ │ │ │
|
|
187
|
-
│ │ Self-Attn │ │ Self-Attn │ │
|
|
188
|
-
│ │ Feed-Forward│ │ Cross-Attn │ │
|
|
189
|
-
│ │ │ │ Feed-Forward│ │
|
|
190
|
-
│ └─────────────┘ └─────────────┘ │
|
|
191
|
-
└─────────────────────────────────────┘
|
|
192
|
-
\`\`\`
|
|
193
|
-
|
|
194
|
-
> Transformers have become the foundation for large language models like GPT, BERT, and Claude.
|
|
195
|
-
|
|
196
|
-
`,
|
|
197
|
-
`
|
|
198
|
-
## Best Practices
|
|
199
|
-
|
|
200
|
-
### Data Preprocessing
|
|
201
|
-
|
|
202
|
-
1. Handle missing values appropriately
|
|
203
|
-
2. Normalize or standardize numerical features
|
|
204
|
-
3. Encode categorical variables
|
|
205
|
-
4. Split data into train/validation/test sets
|
|
206
|
-
5. Apply data augmentation when appropriate
|
|
207
|
-
|
|
208
|
-
### Model Evaluation
|
|
209
|
-
|
|
210
|
-
| Metric | Formula | Use Case |
|
|
211
|
-
|--------|---------|----------|
|
|
212
|
-
| Accuracy | (TP+TN)/(TP+TN+FP+FN) | Balanced classes |
|
|
213
|
-
| Precision | TP/(TP+FP) | When FP is costly |
|
|
214
|
-
| Recall | TP/(TP+FN) | When FN is costly |
|
|
215
|
-
| F1 Score | 2*(P*R)/(P+R) | Imbalanced classes |
|
|
216
|
-
| AUC-ROC | Area under ROC curve | Binary classification |
|
|
217
|
-
|
|
218
|
-
### Hyperparameter Tuning
|
|
219
|
-
|
|
220
|
-
\`\`\`python
|
|
221
|
-
from sklearn.model_selection import GridSearchCV
|
|
222
|
-
|
|
223
|
-
param_grid = {
|
|
224
|
-
'n_estimators': [100, 200, 300],
|
|
225
|
-
'max_depth': [10, 20, 30, None],
|
|
226
|
-
'min_samples_split': [2, 5, 10],
|
|
227
|
-
'min_samples_leaf': [1, 2, 4]
|
|
228
|
-
}
|
|
229
|
-
|
|
230
|
-
grid_search = GridSearchCV(
|
|
231
|
-
estimator=RandomForestClassifier(),
|
|
232
|
-
param_grid=param_grid,
|
|
233
|
-
cv=5,
|
|
234
|
-
n_jobs=-1,
|
|
235
|
-
verbose=2
|
|
236
|
-
)
|
|
237
|
-
|
|
238
|
-
grid_search.fit(X_train, y_train)
|
|
239
|
-
print(f"Best parameters: {grid_search.best_params_}")
|
|
240
|
-
\`\`\`
|
|
241
|
-
|
|
242
|
-
---
|
|
243
|
-
|
|
244
|
-
This concludes our overview of machine learning fundamentals.
|
|
245
|
-
|
|
246
|
-
`
|
|
247
|
-
]
|
|
248
|
-
|
|
249
|
-
let result = ''
|
|
250
|
-
let sectionIndex = 0
|
|
251
|
-
|
|
252
|
-
while (result.length < targetLength) {
|
|
253
|
-
result += sections[sectionIndex % sections.length]
|
|
254
|
-
sectionIndex++
|
|
255
|
-
}
|
|
256
|
-
|
|
257
|
-
return result.slice(0, targetLength)
|
|
258
|
-
}
|
|
259
|
-
|
|
260
|
-
// 默认测试用的 Markdown 内容
|
|
261
|
-
const testMarkdown = shortMarkdown
|
|
262
|
-
|
|
263
|
-
interface BenchmarkResult {
|
|
264
|
-
name: string
|
|
265
|
-
totalTime: number
|
|
266
|
-
parseCount: number
|
|
267
|
-
avgTimePerParse: number
|
|
268
|
-
totalCharsParsed: number
|
|
269
|
-
}
|
|
270
|
-
|
|
271
|
-
/**
|
|
272
|
-
* 模拟流式输入,将文本按 chunk 大小分割
|
|
273
|
-
*/
|
|
274
|
-
function simulateStream(text: string, chunkSize: number): string[] {
|
|
275
|
-
const chunks: string[] = []
|
|
276
|
-
for (let i = 0; i < text.length; i += chunkSize) {
|
|
277
|
-
chunks.push(text.slice(i, i + chunkSize))
|
|
278
|
-
}
|
|
279
|
-
return chunks
|
|
280
|
-
}
|
|
281
|
-
|
|
282
|
-
/**
|
|
283
|
-
* 传统方式:每次收到新内容都重新解析全部文本
|
|
284
|
-
*/
|
|
285
|
-
function benchmarkTraditional(chunks: string[], iterations: number): BenchmarkResult {
|
|
286
|
-
let totalTime = 0
|
|
287
|
-
let totalCharsParsed = 0
|
|
288
|
-
let parseCount = 0
|
|
289
|
-
|
|
290
|
-
for (let iter = 0; iter < iterations; iter++) {
|
|
291
|
-
let buffer = ''
|
|
292
|
-
|
|
293
|
-
for (const chunk of chunks) {
|
|
294
|
-
buffer += chunk
|
|
295
|
-
|
|
296
|
-
const start = performance.now()
|
|
297
|
-
fromMarkdown(buffer, {
|
|
298
|
-
extensions: [gfm()],
|
|
299
|
-
mdastExtensions: [gfmFromMarkdown()]
|
|
300
|
-
})
|
|
301
|
-
const end = performance.now()
|
|
302
|
-
|
|
303
|
-
totalTime += (end - start)
|
|
304
|
-
totalCharsParsed += buffer.length
|
|
305
|
-
parseCount++
|
|
306
|
-
}
|
|
307
|
-
}
|
|
308
|
-
|
|
309
|
-
return {
|
|
310
|
-
name: 'Traditional (re-parse all)',
|
|
311
|
-
totalTime,
|
|
312
|
-
parseCount,
|
|
313
|
-
avgTimePerParse: totalTime / parseCount,
|
|
314
|
-
totalCharsParsed
|
|
315
|
-
}
|
|
316
|
-
}
|
|
317
|
-
|
|
318
|
-
/**
|
|
319
|
-
* Incremark 方式:增量解析
|
|
320
|
-
*/
|
|
321
|
-
function benchmarkIncremental(chunks: string[], iterations: number): BenchmarkResult {
|
|
322
|
-
let totalTime = 0
|
|
323
|
-
let totalCharsParsed = 0
|
|
324
|
-
let parseCount = 0
|
|
325
|
-
|
|
326
|
-
for (let iter = 0; iter < iterations; iter++) {
|
|
327
|
-
const parser = new IncremarkParser({ gfm: true })
|
|
328
|
-
|
|
329
|
-
for (const chunk of chunks) {
|
|
330
|
-
const start = performance.now()
|
|
331
|
-
parser.append(chunk)
|
|
332
|
-
const end = performance.now()
|
|
333
|
-
|
|
334
|
-
totalTime += (end - start)
|
|
335
|
-
totalCharsParsed += chunk.length
|
|
336
|
-
parseCount++
|
|
337
|
-
}
|
|
338
|
-
|
|
339
|
-
const start = performance.now()
|
|
340
|
-
parser.finalize()
|
|
341
|
-
const end = performance.now()
|
|
342
|
-
totalTime += (end - start)
|
|
343
|
-
}
|
|
344
|
-
|
|
345
|
-
return {
|
|
346
|
-
name: 'Incremark (incremental)',
|
|
347
|
-
totalTime,
|
|
348
|
-
parseCount,
|
|
349
|
-
avgTimePerParse: totalTime / parseCount,
|
|
350
|
-
totalCharsParsed
|
|
351
|
-
}
|
|
352
|
-
}
|
|
353
|
-
|
|
354
|
-
/**
|
|
355
|
-
* 运行 benchmark
|
|
356
|
-
*/
|
|
357
|
-
export function runBenchmark(options: {
|
|
358
|
-
chunkSize?: number
|
|
359
|
-
iterations?: number
|
|
360
|
-
markdown?: string
|
|
361
|
-
markdownLength?: number
|
|
362
|
-
} = {}) {
|
|
363
|
-
const {
|
|
364
|
-
chunkSize = 10,
|
|
365
|
-
iterations = 100,
|
|
366
|
-
markdownLength
|
|
367
|
-
} = options
|
|
368
|
-
|
|
369
|
-
// 如果指定了长度,生成对应长度的 Markdown
|
|
370
|
-
const markdown = markdownLength
|
|
371
|
-
? generateLongMarkdown(markdownLength)
|
|
372
|
-
: (options.markdown || testMarkdown)
|
|
373
|
-
|
|
374
|
-
const chunks = simulateStream(markdown, chunkSize)
|
|
375
|
-
|
|
376
|
-
console.log('='.repeat(60))
|
|
377
|
-
console.log('Incremark Benchmark')
|
|
378
|
-
console.log('='.repeat(60))
|
|
379
|
-
console.log(`Markdown length: ${markdown.length} chars`)
|
|
380
|
-
console.log(`Chunk size: ${chunkSize} chars`)
|
|
381
|
-
console.log(`Total chunks: ${chunks.length}`)
|
|
382
|
-
console.log(`Iterations: ${iterations}`)
|
|
383
|
-
console.log('='.repeat(60))
|
|
384
|
-
console.log('')
|
|
385
|
-
|
|
386
|
-
// 预热
|
|
387
|
-
console.log('Warming up...')
|
|
388
|
-
benchmarkTraditional(chunks, 5)
|
|
389
|
-
benchmarkIncremental(chunks, 5)
|
|
390
|
-
console.log('')
|
|
391
|
-
|
|
392
|
-
// 正式测试
|
|
393
|
-
console.log('Running benchmark...')
|
|
394
|
-
console.log('')
|
|
395
|
-
|
|
396
|
-
const traditional = benchmarkTraditional(chunks, iterations)
|
|
397
|
-
const incremental = benchmarkIncremental(chunks, iterations)
|
|
398
|
-
|
|
399
|
-
// 计算节省百分比
|
|
400
|
-
const timeSaved = ((traditional.totalTime - incremental.totalTime) / traditional.totalTime * 100).toFixed(1)
|
|
401
|
-
const charsSaved = ((traditional.totalCharsParsed - incremental.totalCharsParsed) / traditional.totalCharsParsed * 100).toFixed(1)
|
|
402
|
-
|
|
403
|
-
console.log('Results:')
|
|
404
|
-
console.log('-'.repeat(60))
|
|
405
|
-
console.log('')
|
|
406
|
-
|
|
407
|
-
console.log(`📊 ${traditional.name}`)
|
|
408
|
-
console.log(` Total time: ${traditional.totalTime.toFixed(2)} ms`)
|
|
409
|
-
console.log(` Parse count: ${traditional.parseCount}`)
|
|
410
|
-
console.log(` Avg time per parse: ${traditional.avgTimePerParse.toFixed(4)} ms`)
|
|
411
|
-
console.log(` Total chars parsed: ${traditional.totalCharsParsed.toLocaleString()}`)
|
|
412
|
-
console.log('')
|
|
413
|
-
|
|
414
|
-
console.log(`⚡ ${incremental.name}`)
|
|
415
|
-
console.log(` Total time: ${incremental.totalTime.toFixed(2)} ms`)
|
|
416
|
-
console.log(` Parse count: ${incremental.parseCount}`)
|
|
417
|
-
console.log(` Avg time per parse: ${incremental.avgTimePerParse.toFixed(4)} ms`)
|
|
418
|
-
console.log(` Total chars parsed: ${incremental.totalCharsParsed.toLocaleString()}`)
|
|
419
|
-
console.log('')
|
|
420
|
-
|
|
421
|
-
console.log('-'.repeat(60))
|
|
422
|
-
console.log('')
|
|
423
|
-
console.log(`🎯 Performance Improvement:`)
|
|
424
|
-
console.log(` Time saved: ${timeSaved}%`)
|
|
425
|
-
console.log(` Chars parsing saved: ${charsSaved}%`)
|
|
426
|
-
console.log(` Speedup: ${(traditional.totalTime / incremental.totalTime).toFixed(2)}x faster`)
|
|
427
|
-
console.log('')
|
|
428
|
-
console.log('='.repeat(60))
|
|
429
|
-
|
|
430
|
-
return {
|
|
431
|
-
traditional,
|
|
432
|
-
incremental,
|
|
433
|
-
timeSaved: parseFloat(timeSaved),
|
|
434
|
-
charsSaved: parseFloat(charsSaved),
|
|
435
|
-
speedup: traditional.totalTime / incremental.totalTime
|
|
436
|
-
}
|
|
437
|
-
}
|
|
438
|
-
|
|
439
|
-
// 如果直接运行此文件
|
|
440
|
-
if (typeof process !== 'undefined' && process.argv[1]?.includes('benchmark')) {
|
|
441
|
-
runBenchmark()
|
|
442
|
-
}
|
|
443
|
-
|