@yozora/tokenizer-html-block 2.0.4 → 2.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/cjs/{index.js → index.cjs} +5 -6
- package/lib/esm/{index.js → index.mjs} +4 -5
- package/lib/types/index.d.ts +5 -5
- package/package.json +18 -14
- package/src/conditions/c1.ts +79 -0
- package/src/conditions/c2.ts +55 -0
- package/src/conditions/c3.ts +48 -0
- package/src/conditions/c4.ts +48 -0
- package/src/conditions/c5.ts +59 -0
- package/src/conditions/c6.ts +109 -0
- package/src/conditions/c7.ts +54 -0
- package/src/index.ts +11 -0
- package/src/match.ts +231 -0
- package/src/parse.ts +18 -0
- package/src/tokenizer.ts +32 -0
- package/src/types.ts +74 -0
- package/src/util/eat-html-attribute.ts +170 -0
- package/src/util/eat-html-tagname.ts +27 -0
|
@@ -496,19 +496,18 @@ const uniqueName = '@yozora/tokenizer-html-block';
|
|
|
496
496
|
|
|
497
497
|
class HtmlBlockTokenizer extends coreTokenizer.BaseBlockTokenizer {
|
|
498
498
|
constructor(props = {}) {
|
|
499
|
-
var _a, _b;
|
|
500
499
|
super({
|
|
501
|
-
name:
|
|
502
|
-
priority:
|
|
500
|
+
name: props.name ?? uniqueName,
|
|
501
|
+
priority: props.priority ?? coreTokenizer.TokenizerPriority.ATOMIC,
|
|
503
502
|
});
|
|
504
|
-
this.match = match;
|
|
505
|
-
this.parse = parse;
|
|
506
503
|
}
|
|
504
|
+
match = match;
|
|
505
|
+
parse = parse;
|
|
507
506
|
}
|
|
508
507
|
|
|
509
508
|
exports.HtmlBlockTokenizer = HtmlBlockTokenizer;
|
|
510
509
|
exports.HtmlBlockTokenizerName = uniqueName;
|
|
511
|
-
exports
|
|
510
|
+
exports.default = HtmlBlockTokenizer;
|
|
512
511
|
exports.eatHTMLAttribute = eatHTMLAttribute;
|
|
513
512
|
exports.eatHTMLTagName = eatHTMLTagName;
|
|
514
513
|
exports.htmlBlockMatch = match;
|
|
@@ -492,14 +492,13 @@ const uniqueName = '@yozora/tokenizer-html-block';
|
|
|
492
492
|
|
|
493
493
|
class HtmlBlockTokenizer extends BaseBlockTokenizer {
|
|
494
494
|
constructor(props = {}) {
|
|
495
|
-
var _a, _b;
|
|
496
495
|
super({
|
|
497
|
-
name:
|
|
498
|
-
priority:
|
|
496
|
+
name: props.name ?? uniqueName,
|
|
497
|
+
priority: props.priority ?? TokenizerPriority.ATOMIC,
|
|
499
498
|
});
|
|
500
|
-
this.match = match;
|
|
501
|
-
this.parse = parse;
|
|
502
499
|
}
|
|
500
|
+
match = match;
|
|
501
|
+
parse = parse;
|
|
503
502
|
}
|
|
504
503
|
|
|
505
504
|
export { HtmlBlockTokenizer, uniqueName as HtmlBlockTokenizerName, HtmlBlockTokenizer as default, eatHTMLAttribute, eatHTMLTagName, match as htmlBlockMatch, parse as htmlBlockParse };
|
package/lib/types/index.d.ts
CHANGED
|
@@ -37,10 +37,10 @@ declare function eatHTMLAttribute(nodePoints: ReadonlyArray<INodePoint>, startIn
|
|
|
37
37
|
*/
|
|
38
38
|
declare function eatHTMLTagName(nodePoints: ReadonlyArray<INodePoint>, startIndex: number, endIndex: number): number | null;
|
|
39
39
|
|
|
40
|
-
|
|
41
|
-
|
|
40
|
+
type T = HtmlType;
|
|
41
|
+
type INode = Html;
|
|
42
42
|
declare const uniqueName = "@yozora/tokenizer-html-block";
|
|
43
|
-
|
|
43
|
+
type HtmlBlockConditionType = 1 | 2 | 3 | 4 | 5 | 6 | 7;
|
|
44
44
|
/**
|
|
45
45
|
* Middle state during the whole match and parse phase.
|
|
46
46
|
*/
|
|
@@ -97,8 +97,8 @@ interface IToken extends IPartialYastBlockToken<T> {
|
|
|
97
97
|
*/
|
|
98
98
|
lines: Array<Readonly<IPhrasingContentLine>>;
|
|
99
99
|
}
|
|
100
|
-
|
|
101
|
-
|
|
100
|
+
type IThis = ITokenizer;
|
|
101
|
+
type ITokenizerProps = Partial<IBaseBlockTokenizerProps>;
|
|
102
102
|
|
|
103
103
|
/**
|
|
104
104
|
* An HTML block is a group of lines that is treated as raw HTML (and will not
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@yozora/tokenizer-html-block",
|
|
3
|
-
"version": "2.0.4",
|
|
3
|
+
"version": "2.0.5",
|
|
4
4
|
"author": {
|
|
5
5
|
"name": "guanghechen",
|
|
6
6
|
"url": "https://github.com/guanghechen/"
|
|
@@ -11,33 +11,37 @@
|
|
|
11
11
|
"directory": "tokenizers/html-block"
|
|
12
12
|
},
|
|
13
13
|
"homepage": "https://github.com/yozorajs/yozora/tree/release-2.x.x/tokenizers/html-block",
|
|
14
|
-
"
|
|
15
|
-
"
|
|
16
|
-
|
|
17
|
-
|
|
14
|
+
"type": "module",
|
|
15
|
+
"exports": {
|
|
16
|
+
"types": "./lib/types/index.d.ts",
|
|
17
|
+
"import": "./lib/esm/index.mjs",
|
|
18
|
+
"require": "./lib/cjs/index.cjs"
|
|
19
|
+
},
|
|
20
|
+
"source": "./src/index.ts",
|
|
21
|
+
"types": "./lib/types/index.d.ts",
|
|
22
|
+
"main": "./lib/cjs/index.cjs",
|
|
23
|
+
"module": "./lib/esm/index.mjs",
|
|
18
24
|
"license": "MIT",
|
|
19
25
|
"engines": {
|
|
20
26
|
"node": ">= 16.0.0"
|
|
21
27
|
},
|
|
22
28
|
"files": [
|
|
23
29
|
"lib/",
|
|
24
|
-
"
|
|
25
|
-
"!lib/**/*.d.ts.map",
|
|
30
|
+
"src/",
|
|
26
31
|
"package.json",
|
|
27
32
|
"CHANGELOG.md",
|
|
28
33
|
"LICENSE",
|
|
29
34
|
"README.md"
|
|
30
35
|
],
|
|
31
36
|
"scripts": {
|
|
32
|
-
"build": "cross-env NODE_ENV=production rollup -c ../../rollup.config.
|
|
33
|
-
"prebuild": "rimraf lib/",
|
|
37
|
+
"build": "rimraf lib/ && cross-env NODE_ENV=production rollup -c ../../rollup.config.mjs",
|
|
34
38
|
"prepublishOnly": "cross-env ROLLUP_SHOULD_SOURCEMAP=false yarn build",
|
|
35
|
-
"test": "cross-env TS_NODE_FILES=true jest --config ../../jest.config.
|
|
39
|
+
"test": "cross-env TS_NODE_FILES=true NODE_OPTIONS=--experimental-vm-modules jest --config ../../jest.config.mjs --rootDir ."
|
|
36
40
|
},
|
|
37
41
|
"dependencies": {
|
|
38
|
-
"@yozora/ast": "^2.0.
|
|
39
|
-
"@yozora/character": "^2.0.
|
|
40
|
-
"@yozora/core-tokenizer": "^2.0.
|
|
42
|
+
"@yozora/ast": "^2.0.5",
|
|
43
|
+
"@yozora/character": "^2.0.5",
|
|
44
|
+
"@yozora/core-tokenizer": "^2.0.5"
|
|
41
45
|
},
|
|
42
|
-
"gitHead": "
|
|
46
|
+
"gitHead": "7ba3bab49fe65cf2f57082c0503af73da9356cf0"
|
|
43
47
|
}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import type { INodePoint } from '@yozora/character'
|
|
2
|
+
import { AsciiCodePoint, calcStringFromNodePoints, isWhitespaceCharacter } from '@yozora/character'
|
|
3
|
+
import { eatHTMLTagName } from '../util/eat-html-tagname'
|
|
4
|
+
|
|
5
|
+
// Lower-case tag names that can open an HTML block under condition 1.
const includedTags = ['pre', 'script', 'style']

/**
 * HTML block start condition 1.
 *
 * The line begins with `<script`, `<pre` or `<style`, followed by whitespace,
 * the character `>`, or the end of the line.
 *
 * @param nodePoints  code points of the current line
 * @param startIndex  index of the first code point after the tag name
 * @param endIndex    exclusive upper bound of the line
 * @param tagName     tag name, compared verbatim against the lower-case list
 * @returns `endIndex` when the tag name reaches the end of the line, the index
 *          after the terminating whitespace / `>` when there is one, otherwise
 *          `null`
 * @see https://github.github.com/gfm/#start-condition
 */
export function eatStartCondition1(
  nodePoints: ReadonlyArray<INodePoint>,
  startIndex: number,
  endIndex: number,
  tagName: string,
): number | null {
  const isCandidateTag = includedTags.includes(tagName)
  if (!isCandidateTag) return null

  // Nothing follows the tag name: the end of the line terminates it.
  if (startIndex >= endIndex) return endIndex

  const { codePoint } = nodePoints[startIndex]
  const terminated = codePoint === AsciiCodePoint.CLOSE_ANGLE || isWhitespaceCharacter(codePoint)
  return terminated ? startIndex + 1 : null
}
|
|
34
|
+
|
|
35
|
+
/**
 * HTML block end condition 1.
 *
 * The line contains an end tag `</script>`, `</pre>` or `</style>`
 * (case-insensitive; it need not match the start tag).
 *
 * @param nodePoints  code points of the current line
 * @param startIndex  index at which scanning starts
 * @param endIndex    exclusive upper bound of the line
 * @returns the index of the `>` that closes the first matching end tag
 *          (note: the index of `>` itself, not the one after it), or `null`
 *          when the line contains no such end tag
 * @see https://github.github.com/gfm/#start-condition
 */
export function eatEndCondition1(
  nodePoints: ReadonlyArray<INodePoint>,
  startIndex: number,
  endIndex: number,
): number | null {
  let cursor = startIndex
  while (cursor < endIndex) {
    const opensEndTag =
      nodePoints[cursor].codePoint === AsciiCodePoint.OPEN_ANGLE &&
      cursor + 3 < endIndex &&
      nodePoints[cursor + 1].codePoint === AsciiCodePoint.SLASH
    if (!opensEndTag) {
      cursor += 1
      continue
    }

    const nameStart = cursor + 2
    const nameEnd = eatHTMLTagName(nodePoints, nameStart, endIndex)
    if (
      nameEnd == null ||
      nameEnd >= endIndex ||
      nodePoints[nameEnd].codePoint !== AsciiCodePoint.CLOSE_ANGLE
    ) {
      // Not a well-formed `</name>`: step over both `<` and `/`.
      cursor += 2
      continue
    }

    const name = calcStringFromNodePoints(nodePoints, nameStart, nameEnd, true).toLowerCase()
    if (includedTags.includes(name)) return nameEnd

    // A complete end tag, but not one of the tags we are looking for.
    cursor += 1
  }
  return null
}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import type { INodePoint } from '@yozora/character'
|
|
2
|
+
import { AsciiCodePoint } from '@yozora/character'
|
|
3
|
+
|
|
4
|
+
/**
 * HTML block start condition 2.
 *
 * The line begins with the string `<!--`. The first code point examined here
 * is the `!`, so `startIndex` is expected to point just past the `<`.
 *
 * @param nodePoints  code points of the current line
 * @param startIndex  index of the code point expected to be `!`
 * @param endIndex    exclusive upper bound of the line
 * @returns the index just past `!--`, or `null` when the marker is absent
 * @see https://github.github.com/gfm/#start-condition
 */
export function eatStartCondition2(
  nodePoints: ReadonlyArray<INodePoint>,
  startIndex: number,
  endIndex: number,
): number | null {
  // All three code points of `!--` must lie inside the line.
  if (startIndex + 2 >= endIndex) return null

  if (nodePoints[startIndex].codePoint !== AsciiCodePoint.EXCLAMATION_MARK) return null
  if (nodePoints[startIndex + 1].codePoint !== AsciiCodePoint.MINUS_SIGN) return null
  if (nodePoints[startIndex + 2].codePoint !== AsciiCodePoint.MINUS_SIGN) return null
  return startIndex + 3
}
|
|
29
|
+
|
|
30
|
+
/**
 * HTML block end condition 2.
 *
 * The line contains the string `-->`.
 *
 * @param nodePoints  code points of the current line
 * @param startIndex  index at which scanning starts
 * @param endIndex    exclusive upper bound of the line
 * @returns the index just past the first `-->`, or `null` when there is none
 * @see https://github.github.com/gfm/#start-condition
 */
export function eatEndCondition2(
  nodePoints: ReadonlyArray<INodePoint>,
  startIndex: number,
  endIndex: number,
): number | null {
  for (let pos = startIndex; pos < endIndex; pos += 1) {
    if (nodePoints[pos].codePoint !== AsciiCodePoint.MINUS_SIGN) continue
    // Too close to the end of the line for a full `-->`.
    if (pos + 2 >= endIndex) continue
    if (nodePoints[pos + 1].codePoint !== AsciiCodePoint.MINUS_SIGN) continue
    if (nodePoints[pos + 2].codePoint === AsciiCodePoint.CLOSE_ANGLE) return pos + 3
  }
  return null
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import type { INodePoint } from '@yozora/character'
|
|
2
|
+
import { AsciiCodePoint } from '@yozora/character'
|
|
3
|
+
|
|
4
|
+
/**
 * HTML block start condition 3.
 *
 * The line begins with the string `<?`. The code point examined here is the
 * `?`, so `startIndex` is expected to point just past the `<`.
 *
 * @param nodePoints  code points of the current line
 * @param startIndex  index of the code point expected to be `?`
 * @param endIndex    exclusive upper bound of the line
 * @returns the index just past `?`, or `null` when it is absent
 * @see https://github.github.com/gfm/#start-condition
 */
export function eatStartCondition3(
  nodePoints: ReadonlyArray<INodePoint>,
  startIndex: number,
  endIndex: number,
): number | null {
  if (startIndex >= endIndex) return null

  const isQuestionMark = nodePoints[startIndex].codePoint === AsciiCodePoint.QUESTION_MARK
  return isQuestionMark ? startIndex + 1 : null
}
|
|
23
|
+
|
|
24
|
+
/**
 * HTML block end condition 3.
 *
 * The line contains the string `?>`.
 *
 * @param nodePoints  code points of the current line
 * @param startIndex  index at which scanning starts
 * @param endIndex    exclusive upper bound of the line
 * @returns the index just past the first `?>`, or `null` when there is none
 * @see https://github.github.com/gfm/#start-condition
 */
export function eatEndCondition3(
  nodePoints: ReadonlyArray<INodePoint>,
  startIndex: number,
  endIndex: number,
): number | null {
  for (let pos = startIndex; pos < endIndex; pos += 1) {
    if (nodePoints[pos].codePoint !== AsciiCodePoint.QUESTION_MARK) continue
    // A `?` in the last position cannot be followed by `>`.
    if (pos + 1 >= endIndex) continue
    if (nodePoints[pos + 1].codePoint === AsciiCodePoint.CLOSE_ANGLE) return pos + 2
  }
  return null
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import type { INodePoint } from '@yozora/character'
|
|
2
|
+
import { AsciiCodePoint, isAsciiUpperLetter } from '@yozora/character'
|
|
3
|
+
|
|
4
|
+
/**
 * HTML block start condition 4.
 *
 * The line begins with the string `<!` followed by an uppercase ASCII letter.
 * The first code point examined here is the `!`, so `startIndex` is expected
 * to point just past the `<`.
 *
 * @param nodePoints  code points of the current line
 * @param startIndex  index of the code point expected to be `!`
 * @param endIndex    exclusive upper bound of the line
 * @returns the index just past the uppercase letter, or `null` on mismatch
 * @see https://github.github.com/gfm/#start-condition
 */
export function eatStartCondition4(
  nodePoints: ReadonlyArray<INodePoint>,
  startIndex: number,
  endIndex: number,
): number | null {
  // Both `!` and the following letter must lie inside the line.
  if (startIndex + 1 >= endIndex) return null
  if (nodePoints[startIndex].codePoint !== AsciiCodePoint.EXCLAMATION_MARK) return null

  const followedByUpperLetter = isAsciiUpperLetter(nodePoints[startIndex + 1].codePoint)
  return followedByUpperLetter ? startIndex + 2 : null
}
|
|
28
|
+
|
|
29
|
+
/**
 * HTML block end condition 4.
 *
 * The line contains the character `>`.
 *
 * @param nodePoints  code points of the current line
 * @param startIndex  index at which scanning starts
 * @param endIndex    exclusive upper bound of the line
 * @returns the index just past the first `>`, or `null` when there is none
 * @see https://github.github.com/gfm/#start-condition
 */
export function eatEndCondition4(
  nodePoints: ReadonlyArray<INodePoint>,
  startIndex: number,
  endIndex: number,
): number | null {
  let pos = startIndex
  while (pos < endIndex) {
    const isCloseAngle = nodePoints[pos].codePoint === AsciiCodePoint.CLOSE_ANGLE
    pos += 1
    if (isCloseAngle) return pos
  }
  return null
}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import type { INodePoint } from '@yozora/character'
|
|
2
|
+
import { AsciiCodePoint } from '@yozora/character'
|
|
3
|
+
|
|
4
|
+
/**
 * HTML block start condition 5.
 *
 * The line begins with the string `<![CDATA[`. The first code point examined
 * here is the `!`, so `startIndex` is expected to point just past the `<`.
 *
 * All eight code points of `![CDATA[` are verified, including the trailing
 * `[`; an input such as `<![CDATAx` is therefore rejected.
 *
 * @param nodePoints  code points of the current line
 * @param startIndex  index of the code point expected to be `!`
 * @param endIndex    exclusive upper bound of the line
 * @returns the index just past `![CDATA[`, or `null` when the marker is absent
 * @see https://github.github.com/gfm/#start-condition
 */
export function eatStartCondition5(
  nodePoints: ReadonlyArray<INodePoint>,
  startIndex: number,
  endIndex: number,
): number | null {
  const i = startIndex
  if (
    // `![CDATA[` is eight code points long; all of them must be in the line.
    i + 7 < endIndex &&
    nodePoints[i].codePoint === AsciiCodePoint.EXCLAMATION_MARK &&
    nodePoints[i + 1].codePoint === AsciiCodePoint.OPEN_BRACKET &&
    nodePoints[i + 2].codePoint === AsciiCodePoint.UPPERCASE_C &&
    nodePoints[i + 3].codePoint === AsciiCodePoint.UPPERCASE_D &&
    nodePoints[i + 4].codePoint === AsciiCodePoint.UPPERCASE_A &&
    nodePoints[i + 5].codePoint === AsciiCodePoint.UPPERCASE_T &&
    nodePoints[i + 6].codePoint === AsciiCodePoint.UPPERCASE_A &&
    nodePoints[i + 7].codePoint === AsciiCodePoint.OPEN_BRACKET
  )
    return i + 8
  return null
}
|
|
33
|
+
|
|
34
|
+
/**
 * HTML block end condition 5.
 *
 * The line contains the string `]]>`.
 *
 * @param nodePoints  code points of the current line
 * @param startIndex  index at which scanning starts
 * @param endIndex    exclusive upper bound of the line
 * @returns the index just past the first `]]>`, or `null` when there is none
 * @see https://github.github.com/gfm/#start-condition
 */
export function eatEndCondition5(
  nodePoints: ReadonlyArray<INodePoint>,
  startIndex: number,
  endIndex: number,
): number | null {
  for (let pos = startIndex; pos < endIndex; pos += 1) {
    if (nodePoints[pos].codePoint !== AsciiCodePoint.CLOSE_BRACKET) continue
    // Too close to the end of the line for a full `]]>`.
    if (pos + 2 >= endIndex) continue
    if (nodePoints[pos + 1].codePoint !== AsciiCodePoint.CLOSE_BRACKET) continue
    if (nodePoints[pos + 2].codePoint === AsciiCodePoint.CLOSE_ANGLE) return pos + 3
  }
  return null
}
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import type { INodePoint } from '@yozora/character'
|
|
2
|
+
import { AsciiCodePoint, isWhitespaceCharacter } from '@yozora/character'
|
|
3
|
+
|
|
4
|
+
// Lower-case tag names that can open an HTML block under condition 6.
const includedTags: ReadonlySet<string> = new Set([
  'address',
  'article',
  'aside',
  'base',
  'basefont',
  'blockquote',
  'body',
  'caption',
  'center',
  'col',
  'colgroup',
  'dd',
  'details',
  'dialog',
  'dir',
  'div',
  'dl',
  'dt',
  'fieldset',
  'figcaption',
  'figure',
  'footer',
  'form',
  'frame',
  'frameset',
  'h1',
  'h2',
  'h3',
  'h4',
  'h5',
  'h6',
  'head',
  'header',
  'hr',
  'html',
  'iframe',
  'legend',
  'li',
  'link',
  'main',
  'menu',
  'menuitem',
  'nav',
  'noframes',
  'ol',
  'optgroup',
  'option',
  'p',
  'param',
  'section',
  'source',
  'summary',
  'table',
  'tbody',
  'td',
  'tfoot',
  'th',
  'thead',
  'title',
  'tr',
  'track',
  'ul',
])

/**
 * HTML block start condition 6.
 *
 * The line begins with `<` or `</` followed by one of the block-level tag
 * names listed in `includedTags`, followed by whitespace, the end of the
 * line, the string `>`, or the string `/>`.
 *
 * @param nodePoints  code points of the current line
 * @param startIndex  index of the first code point after the tag name
 * @param endIndex    exclusive upper bound of the line
 * @param tagName     tag name, compared verbatim against the lower-case list
 * @returns `endIndex` when the tag name reaches the end of the line, the index
 *          just past the terminator (whitespace, `>` or `/>`) when there is
 *          one, otherwise `null`
 * @see https://github.github.com/gfm/#start-condition
 */
export function eatStartCondition6(
  nodePoints: ReadonlyArray<INodePoint>,
  startIndex: number,
  endIndex: number,
  tagName: string,
): number | null {
  if (!includedTags.has(tagName)) return null

  // Nothing follows the tag name: the end of the line terminates it.
  if (startIndex >= endIndex) return endIndex

  const { codePoint } = nodePoints[startIndex]
  if (codePoint === AsciiCodePoint.CLOSE_ANGLE || isWhitespaceCharacter(codePoint)) {
    return startIndex + 1
  }

  const isSelfClosing =
    codePoint === AsciiCodePoint.SLASH &&
    startIndex + 1 < endIndex &&
    nodePoints[startIndex + 1].codePoint === AsciiCodePoint.CLOSE_ANGLE
  return isSelfClosing ? startIndex + 2 : null
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import type { INodePoint } from '@yozora/character'
|
|
2
|
+
import { AsciiCodePoint, isWhitespaceCharacter } from '@yozora/character'
|
|
3
|
+
import { eatOptionalWhitespaces } from '@yozora/core-tokenizer'
|
|
4
|
+
import { eatHTMLAttribute } from '../util/eat-html-attribute'
|
|
5
|
+
|
|
6
|
+
// Tag names that may never start an HTML block under condition 7.
const excludedTags = ['pre', 'script', 'style']

/**
 * HTML block start condition 7.
 *
 * The line begins with a complete open tag (with any tag name other than
 * `script`, `style`, or `pre`) or a complete closing tag, followed only by
 * whitespace or the end of the line.
 *
 * @param nodePoints        code points of the current line
 * @param startIndex        index of the first code point after the tag name
 * @param endIndex          exclusive upper bound of the line
 * @param tagName           tag name, compared verbatim against the exclusions
 * @param potentialOpenTag  `true` to parse the rest as an open tag (attributes
 *                          and an optional `/` before `>`), `false` to parse
 *                          it as a closing tag (optional whitespace, then `>`)
 * @returns `endIndex` when the whole line satisfies the condition, otherwise
 *          `null`
 * @see https://github.github.com/gfm/#start-condition
 */
export function eatStartCondition7(
  nodePoints: ReadonlyArray<INodePoint>,
  startIndex: number,
  endIndex: number,
  tagName: string,
  potentialOpenTag: boolean,
): number | null {
  if (startIndex >= endIndex || excludedTags.includes(tagName)) return null

  let cursor = startIndex

  if (potentialOpenTag) {
    // Consume as many attributes as can be recognised.
    while (cursor < endIndex) {
      const attribute = eatHTMLAttribute(nodePoints, cursor, endIndex)
      if (attribute == null) break
      cursor = attribute.nextIndex
    }
  }

  // Both tag kinds allow whitespace before the closing `>`.
  cursor = eatOptionalWhitespaces(nodePoints, cursor, endIndex)

  if (potentialOpenTag) {
    if (cursor >= endIndex) return null
    // An open tag may be self-closing.
    if (nodePoints[cursor].codePoint === AsciiCodePoint.SLASH) cursor += 1
  }

  const closesTag =
    cursor < endIndex && nodePoints[cursor].codePoint === AsciiCodePoint.CLOSE_ANGLE
  if (!closesTag) return null

  // Only whitespace may follow the tag on this line.
  let rest = cursor + 1
  while (rest < endIndex) {
    if (!isWhitespaceCharacter(nodePoints[rest].codePoint)) return null
    rest += 1
  }
  return endIndex
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
export * from './util/eat-html-attribute'
|
|
2
|
+
export * from './util/eat-html-tagname'
|
|
3
|
+
export { match as htmlBlockMatch } from './match'
|
|
4
|
+
export { parse as htmlBlockParse } from './parse'
|
|
5
|
+
export { HtmlBlockTokenizer, HtmlBlockTokenizer as default } from './tokenizer'
|
|
6
|
+
export { uniqueName as HtmlBlockTokenizerName } from './types'
|
|
7
|
+
export type {
|
|
8
|
+
IThis as IHtmlBlockHookContext,
|
|
9
|
+
IToken as IHtmlBlockToken,
|
|
10
|
+
ITokenizerProps as IHtmlBlockTokenizerProps,
|
|
11
|
+
} from './types'
|
package/src/match.ts
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
import { HtmlType } from '@yozora/ast'
|
|
2
|
+
import type { INodeInterval, INodePoint } from '@yozora/character'
|
|
3
|
+
import { AsciiCodePoint, calcStringFromNodePoints } from '@yozora/character'
|
|
4
|
+
import type {
|
|
5
|
+
IMatchBlockHookCreator,
|
|
6
|
+
IPhrasingContentLine,
|
|
7
|
+
IResultOfEatAndInterruptPreviousSibling,
|
|
8
|
+
IResultOfEatContinuationText,
|
|
9
|
+
IResultOfEatOpener,
|
|
10
|
+
IYastBlockToken,
|
|
11
|
+
} from '@yozora/core-tokenizer'
|
|
12
|
+
import { calcEndPoint, calcStartPoint, eatOptionalWhitespaces } from '@yozora/core-tokenizer'
|
|
13
|
+
import { eatEndCondition1, eatStartCondition1 } from './conditions/c1'
|
|
14
|
+
import { eatEndCondition2, eatStartCondition2 } from './conditions/c2'
|
|
15
|
+
import { eatEndCondition3, eatStartCondition3 } from './conditions/c3'
|
|
16
|
+
import { eatEndCondition4, eatStartCondition4 } from './conditions/c4'
|
|
17
|
+
import { eatEndCondition5, eatStartCondition5 } from './conditions/c5'
|
|
18
|
+
import { eatStartCondition6 } from './conditions/c6'
|
|
19
|
+
import { eatStartCondition7 } from './conditions/c7'
|
|
20
|
+
import type { HtmlBlockConditionType, IThis, IToken, T } from './types'
|
|
21
|
+
import { eatHTMLTagName } from './util/eat-html-tagname'
|
|
22
|
+
|
|
23
|
+
/**
 * Match-phase hooks for HTML blocks.
 *
 * An HTML block is a group of lines that is treated as raw HTML (and will not
 * be escaped in HTML output).
 *
 * @see https://github.com/syntax-tree/mdast#html
 * @see https://github.github.com/gfm/#html-blocks
 */
export const match: IMatchBlockHookCreator<T, IToken, IThis> = function () {
  return {
    isContainingBlock: false,
    eatOpener,
    eatAndInterruptPreviousSibling,
    eatContinuationText,
  }

  /**
   * Try to open an HTML block on `line`. On success the whole line is
   * consumed (`nextIndex` is the line's `endIndex`); `saturated` is `true`
   * when the end condition for conditions 1-5 is already met on this line.
   */
  function eatOpener(line: Readonly<IPhrasingContentLine>): IResultOfEatOpener<T, IToken> {
    // The opening tag can be indented 1-3 spaces, but not 4.
    // @see https://github.github.com/gfm/#example-152
    if (line.countOfPrecedeSpaces >= 4) return null

    const { nodePoints, startIndex, endIndex, firstNonWhitespaceIndex } = line
    const beginsWithOpenAngle =
      firstNonWhitespaceIndex < endIndex &&
      nodePoints[firstNonWhitespaceIndex].codePoint === AsciiCodePoint.OPEN_ANGLE
    if (!beginsWithOpenAngle) return null

    const opened = eatStartCondition(nodePoints, firstNonWhitespaceIndex + 1, endIndex)
    if (opened == null) return null
    const { condition } = opened

    // The end tag can occur on the same line as the start tag.
    // @see https://github.github.com/gfm/#example-145
    // @see https://github.github.com/gfm/#example-146
    const saturated =
      condition !== 6 &&
      condition !== 7 &&
      eatEndCondition(nodePoints, opened.nextIndex, endIndex, condition) != null

    const token: IToken = {
      nodeType: HtmlType,
      position: {
        start: calcStartPoint(nodePoints, startIndex),
        end: calcEndPoint(nodePoints, endIndex - 1),
      },
      condition,
      lines: [line],
    }
    return { token, nextIndex: endIndex, saturated }
  }

  /**
   * Like `eatOpener`, but used when the line might interrupt the previous
   * sibling. A block opened by condition 7 never interrupts.
   */
  function eatAndInterruptPreviousSibling(
    line: Readonly<IPhrasingContentLine>,
    prevSiblingToken: Readonly<IYastBlockToken>,
  ): IResultOfEatAndInterruptPreviousSibling<T, IToken> {
    const opened = eatOpener(line)
    if (opened == null) return null
    if (opened.token.condition === 7) return null

    // NOTE(review): the `saturated` flag computed by eatOpener is not
    // forwarded here — confirm whether the result type accepts it and whether
    // dropping it is intentional.
    return {
      token: opened.token,
      nextIndex: opened.nextIndex,
      remainingSibling: prevSiblingToken,
    }
  }

  /**
   * Decide whether `line` continues, closes, or does not belong to the HTML
   * block represented by `token`. Matched lines are appended to `token.lines`.
   */
  function eatContinuationText(
    line: Readonly<IPhrasingContentLine>,
    token: IToken,
  ): IResultOfEatContinuationText {
    const endResult = eatEndCondition(
      line.nodePoints,
      line.firstNonWhitespaceIndex,
      line.endIndex,
      token.condition,
    )

    // -1: a blank line terminated a condition 6/7 block before this line.
    if (endResult === -1) return { status: 'notMatched' }

    token.lines.push(line)
    return endResult == null
      ? { status: 'opening', nextIndex: line.endIndex }
      : { status: 'closing', nextIndex: line.endIndex }
  }

  /**
   * Pair a condition number with the index returned by its start matcher,
   * or yield `null` when the matcher failed.
   */
  function toStartResult(
    condition: HtmlBlockConditionType,
    nextIndex: number | null,
  ): { condition: HtmlBlockConditionType; nextIndex: number } | null {
    return nextIndex == null ? null : { nextIndex, condition }
  }

  /**
   * Determine which start condition (if any) the line satisfies.
   * `startIndex` points just past the leading `<`.
   */
  function eatStartCondition(
    nodePoints: ReadonlyArray<INodePoint>,
    startIndex: number,
    endIndex: number,
  ): { condition: HtmlBlockConditionType; nextIndex: number } | null {
    if (startIndex >= endIndex) return null

    // Conditions 2-5 do not involve a tag name; try them first, in order.
    // `??` keeps the evaluation lazy: later matchers only run when the
    // earlier ones failed.
    const markerResult =
      toStartResult(2, eatStartCondition2(nodePoints, startIndex, endIndex)) ??
      toStartResult(3, eatStartCondition3(nodePoints, startIndex, endIndex)) ??
      toStartResult(4, eatStartCondition4(nodePoints, startIndex, endIndex)) ??
      toStartResult(5, eatStartCondition5(nodePoints, startIndex, endIndex))
    if (markerResult != null) return markerResult

    // A leading `/` marks a closing tag; its name starts one position later.
    const isOpenTag = nodePoints[startIndex].codePoint !== AsciiCodePoint.SLASH
    const tagNameStartIndex = isOpenTag ? startIndex : startIndex + 1
    const tagNameEndIndex = eatHTMLTagName(nodePoints, tagNameStartIndex, endIndex)
    if (tagNameEndIndex == null) return null

    const tagNameInterval: INodeInterval = {
      startIndex: tagNameStartIndex,
      endIndex: tagNameEndIndex,
    }
    const tagName = calcStringFromNodePoints(
      nodePoints,
      tagNameInterval.startIndex,
      tagNameInterval.endIndex,
    ).toLowerCase()

    // Condition 1 only applies to open tags.
    if (isOpenTag) {
      const condition1 = toStartResult(
        1,
        eatStartCondition1(nodePoints, tagNameInterval.endIndex, endIndex, tagName),
      )
      if (condition1 != null) return condition1
    }

    // Conditions 6 and 7 apply to both open and closing tags; falls back to
    // `null` when neither matches.
    return (
      toStartResult(
        6,
        eatStartCondition6(nodePoints, tagNameInterval.endIndex, endIndex, tagName),
      ) ??
      toStartResult(
        7,
        eatStartCondition7(nodePoints, tagNameInterval.endIndex, endIndex, tagName, isOpenTag),
      )
    )
  }

  /**
   * Check the end condition that corresponds to `condition`.
   *
   * @returns for conditions 1-5: `endIndex` when the end marker occurs in
   *          `[startIndex, endIndex)`, otherwise `null`; for conditions 6-7:
   *          `-1` when only whitespace remains in that range, otherwise `null`
   */
  function eatEndCondition(
    nodePoints: ReadonlyArray<INodePoint>,
    startIndex: number,
    endIndex: number,
    condition: HtmlBlockConditionType,
  ): -1 | number | null {
    if (condition === 6 || condition === 7) {
      const firstNonWhitespaceIndex = eatOptionalWhitespaces(nodePoints, startIndex, endIndex)
      return firstNonWhitespaceIndex >= endIndex ? -1 : null
    }

    let matchedAt: number | null = null
    switch (condition) {
      case 1:
        matchedAt = eatEndCondition1(nodePoints, startIndex, endIndex)
        break
      case 2:
        matchedAt = eatEndCondition2(nodePoints, startIndex, endIndex)
        break
      case 3:
        matchedAt = eatEndCondition3(nodePoints, startIndex, endIndex)
        break
      case 4:
        matchedAt = eatEndCondition4(nodePoints, startIndex, endIndex)
        break
      case 5:
        matchedAt = eatEndCondition5(nodePoints, startIndex, endIndex)
        break
    }

    // Wherever the marker sits, the block always swallows the whole line.
    return matchedAt == null ? null : endIndex
  }
}
|
package/src/parse.ts
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import { calcStringFromNodePoints } from '@yozora/character'
|
|
2
|
+
import type { IParseBlockHookCreator } from '@yozora/core-tokenizer'
|
|
3
|
+
import { mergeContentLinesFaithfully } from '@yozora/core-tokenizer'
|
|
4
|
+
import type { INode, IThis, IToken, T } from './types'
|
|
5
|
+
|
|
6
|
+
/**
 * Parse-hook creator of the html-block tokenizer.
 *
 * Each token becomes one `html` node whose `value` is the string built from
 * the token's merged content lines. The token position is copied onto the
 * node only when `api.shouldReservePosition` is truthy.
 */
export const parse: IParseBlockHookCreator<T, IToken, INode, IThis> = function (api) {
  const toNode = (token: IToken): INode => {
    const contents = mergeContentLinesFaithfully(token.lines)
    const keepPosition = api.shouldReservePosition
    const value = calcStringFromNodePoints(contents)
    if (keepPosition) {
      return { type: 'html', position: token.position, value }
    }
    return { type: 'html', value }
  }

  return {
    parse: tokens => tokens.map(token => toNode(token)),
  }
}
|
package/src/tokenizer.ts
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import type {
|
|
2
|
+
IBlockTokenizer,
|
|
3
|
+
IMatchBlockHookCreator,
|
|
4
|
+
IParseBlockHookCreator,
|
|
5
|
+
} from '@yozora/core-tokenizer'
|
|
6
|
+
import { BaseBlockTokenizer, TokenizerPriority } from '@yozora/core-tokenizer'
|
|
7
|
+
import { match } from './match'
|
|
8
|
+
import { parse } from './parse'
|
|
9
|
+
import type { INode, IThis, IToken, ITokenizerProps, T } from './types'
|
|
10
|
+
import { uniqueName } from './types'
|
|
11
|
+
|
|
12
|
+
/**
 * Block tokenizer for HTML blocks.
 *
 * The class itself only supplies the tokenizer name / priority defaults and
 * exposes the `match` and `parse` hook creators imported from `./match` and
 * `./parse`.
 *
 * @see https://github.com/syntax-tree/mdast#html
 * @see https://github.github.com/gfm/#html-blocks
 */
export class HtmlBlockTokenizer
  extends BaseBlockTokenizer<T, IToken, INode, IThis>
  implements IBlockTokenizer<T, IToken, INode, IThis>
{
  /** Hook creator used for the match phase. */
  public override readonly match: IMatchBlockHookCreator<T, IToken, IThis> = match

  /** Hook creator used for the parse phase. */
  public override readonly parse: IParseBlockHookCreator<T, IToken, INode, IThis> = parse

  /**
   * @param props optional overrides; `name` falls back to `uniqueName` and
   *              `priority` to `TokenizerPriority.ATOMIC`.
   */
  /* istanbul ignore next */
  constructor(props: ITokenizerProps = {}) {
    super({
      priority: props.priority ?? TokenizerPriority.ATOMIC,
      name: props.name ?? uniqueName,
    })
  }
}
|
package/src/types.ts
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import type { Html, HtmlType } from '@yozora/ast'
|
|
2
|
+
import type {
|
|
3
|
+
IBaseBlockTokenizerProps,
|
|
4
|
+
IPartialYastBlockToken,
|
|
5
|
+
IPhrasingContentLine,
|
|
6
|
+
ITokenizer,
|
|
7
|
+
} from '@yozora/core-tokenizer'
|
|
8
|
+
|
|
9
|
+
/** Node type handled by this tokenizer (alias of `HtmlType`). */
export type T = HtmlType

/** AST node type produced by the parse hook (alias of `Html`). */
export type INode = Html

/** Tokenizer name used when the constructor props do not provide one. */
export const uniqueName = '@yozora/tokenizer-html-block'

/** Number identifying one of the seven html-block start/end condition pairs. */
export type HtmlBlockConditionType = 1 | 2 | 3 | 4 | 5 | 6 | 7
|
|
14
|
+
|
|
15
|
+
/**
 * Intermediate state of an html block, shared by the match and parse phases.
 */
export interface IToken extends IPartialYastBlockToken<T> {
  /**
   * Number of the GFM html-block condition pair associated with this token.
   *
   * 1. Start: the line begins with `<script`, `<pre`, or `<style`
   *    (case-insensitive), followed by whitespace, the string `>`, or the
   *    end of the line.
   *    End: the line contains an end tag `</script>`, `</pre>`, or
   *    `</style>` (case-insensitive; it need not match the start tag).
   *
   * 2. Start: the line begins with `<!--`.
   *    End: the line contains `-->`.
   *
   * 3. Start: the line begins with `<?`.
   *    End: the line contains `?>`.
   *
   * 4. Start: the line begins with `<!` followed by an uppercase ASCII
   *    letter.
   *    End: the line contains the character `>`.
   *
   * 5. Start: the line begins with `<![CDATA[`.
   *    End: the line contains `]]>`.
   *
   * 6. Start: the line begins with `<` or `</` followed by one of the
   *    strings (case-insensitive) `address`, `article`, `aside`, `base`,
   *    `basefont`, `blockquote`, `body`, `caption`, `center`, `col`,
   *    `colgroup`, `dd`, `details`, `dialog`, `dir`, `div`, `dl`, `dt`,
   *    `fieldset`, `figcaption`, `figure`, `footer`, `form`, `frame`,
   *    `frameset`, `h1`, `h2`, `h3`, `h4`, `h5`, `h6`, `head`, `header`,
   *    `hr`, `html`, `iframe`, `legend`, `li`, `link`, `main`, `menu`,
   *    `menuitem`, `nav`, `noframes`, `ol`, `optgroup`, `option`, `p`,
   *    `param`, `section`, `source`, `summary`, `table`, `tbody`, `td`,
   *    `tfoot`, `th`, `thead`, `title`, `tr`, `track`, `ul`, followed by
   *    whitespace, the end of the line, the string `>`, or the string `/>`.
   *    End: the line is followed by a blank line.
   *
   * 7. Start: the line begins with a complete open tag (with any tag name
   *    other than `script`, `style`, or `pre`) or a complete closing tag,
   *    followed only by whitespace or the end of the line.
   *    End: the line is followed by a blank line.
   *
   * @see https://github.github.com/gfm/#start-condition
   */
  condition: HtmlBlockConditionType
  /**
   * Content lines of the block; the parse hook merges them to build the
   * node's `value`.
   */
  lines: Readonly<IPhrasingContentLine>[]
}
|
|
71
|
+
|
|
72
|
+
/** Type passed as the `IThis` argument of the match / parse hook creators. */
export type IThis = ITokenizer

/** Constructor options of the tokenizer; every base option is optional. */
export type ITokenizerProps = Partial<IBaseBlockTokenizerProps>
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
import type { INodeInterval, INodePoint } from '@yozora/character'
|
|
2
|
+
import {
|
|
3
|
+
AsciiCodePoint,
|
|
4
|
+
isAsciiDigitCharacter,
|
|
5
|
+
isAsciiLetter,
|
|
6
|
+
isWhitespaceCharacter,
|
|
7
|
+
} from '@yozora/character'
|
|
8
|
+
import { eatOptionalWhitespaces } from '@yozora/core-tokenizer'
|
|
9
|
+
|
|
10
|
+
/**
 * Location of one HTML attribute within a node-point sequence, as reported
 * by `eatHTMLAttribute`.
 */
export interface RawHTMLAttribute {
  /**
   * Interval of the attribute name (start inclusive, end exclusive).
   */
  name: INodeInterval
  /**
   * Interval of the attribute value (start inclusive, end exclusive).
   * For quoted values the surrounding quotes are not part of the interval.
   * Absent when the attribute has no value specification.
   */
  value?: INodeInterval
}
|
|
20
|
+
|
|
21
|
+
/**
 * Whether the code point may appear after the first character of an
 * attribute name: ASCII letters, digits, `_`, `.`, `:` or `-`.
 * @see https://github.github.com/gfm/#attribute-name
 */
function isAttributeNameTail(codePoint: number): boolean {
  return (
    isAsciiLetter(codePoint) ||
    isAsciiDigitCharacter(codePoint) ||
    codePoint === AsciiCodePoint.UNDERSCORE ||
    codePoint === AsciiCodePoint.DOT ||
    codePoint === AsciiCodePoint.COLON ||
    codePoint === AsciiCodePoint.MINUS_SIGN
  )
}

/**
 * Whether the code point ends an unquoted attribute value: whitespace,
 * `"`, `'`, `=`, `<`, `>` or a backtick.
 * @see https://github.github.com/gfm/#unquoted-attribute-value
 */
function isUnquotedValueTerminator(codePoint: number): boolean {
  return (
    isWhitespaceCharacter(codePoint) ||
    codePoint === AsciiCodePoint.DOUBLE_QUOTE ||
    codePoint === AsciiCodePoint.SINGLE_QUOTE ||
    codePoint === AsciiCodePoint.EQUALS_SIGN ||
    codePoint === AsciiCodePoint.OPEN_ANGLE ||
    codePoint === AsciiCodePoint.CLOSE_ANGLE ||
    codePoint === AsciiCodePoint.BACKTICK
  )
}

/**
 * Try to consume a single HTML attribute: leading whitespace (required), an
 * attribute name, and an optional attribute value specification.
 *
 * When an `=` is present but no valid value follows it (unterminated quote,
 * empty unquoted value, or end of input), only the name is consumed and the
 * returned `nextIndex` points right after the name.
 *
 * @param nodePoints  code points being scanned
 * @param startIndex  index at which scanning starts (inclusive)
 * @param endIndex    index at which scanning stops (exclusive)
 * @returns the located attribute together with the index following the
 *          consumed text, or `null` if no attribute starts at `startIndex`.
 * @see https://github.github.com/gfm/#attribute
 * @see https://github.github.com/gfm/#attribute-value-specification
 * @see https://github.github.com/gfm/#attribute-value
 */
export function eatHTMLAttribute(
  nodePoints: ReadonlyArray<INodePoint>,
  startIndex: number,
  endIndex: number,
): { attribute: RawHTMLAttribute; nextIndex: number } | null {
  // At least one whitespace must precede the name, and something must follow.
  const nameStart = eatOptionalWhitespaces(nodePoints, startIndex, endIndex)
  if (nameStart <= startIndex || nameStart >= endIndex) return null

  // The name starts with an ASCII letter, `_` or `:`.
  const head = nodePoints[nameStart].codePoint
  const startsName =
    isAsciiLetter(head) || head === AsciiCodePoint.UNDERSCORE || head === AsciiCodePoint.COLON
  if (!startsName) return null

  let nameEnd = nameStart + 1
  while (nameEnd < endIndex && isAttributeNameTail(nodePoints[nameEnd].codePoint)) {
    nameEnd += 1
  }

  const attribute: RawHTMLAttribute = {
    name: { startIndex: nameStart, endIndex: nameEnd },
  }

  // Optional value specification: [whitespace] `=` [whitespace] value.
  const equalsIndex = eatOptionalWhitespaces(nodePoints, nameEnd, endIndex)
  const hasEqualsSign =
    equalsIndex < endIndex && nodePoints[equalsIndex].codePoint === AsciiCodePoint.EQUALS_SIGN

  if (hasEqualsSign) {
    const valueHead = eatOptionalWhitespaces(nodePoints, equalsIndex + 1, endIndex)
    if (valueHead < endIndex) {
      const mark = nodePoints[valueHead].codePoint
      const isQuoted =
        mark === AsciiCodePoint.DOUBLE_QUOTE || mark === AsciiCodePoint.SINGLE_QUOTE

      if (isQuoted) {
        // Quoted value: everything up to the next occurrence of the same quote.
        const valueStart = valueHead + 1
        let valueEnd = valueStart
        while (valueEnd < endIndex && nodePoints[valueEnd].codePoint !== mark) {
          valueEnd += 1
        }
        if (valueEnd < endIndex) {
          attribute.value = { startIndex: valueStart, endIndex: valueEnd }
          // Step over the closing quote.
          return { attribute, nextIndex: valueEnd + 1 }
        }
      } else {
        // Unquoted value: a non-empty run of non-terminator characters.
        let valueEnd = valueHead
        while (valueEnd < endIndex && !isUnquotedValueTerminator(nodePoints[valueEnd].codePoint)) {
          valueEnd += 1
        }
        if (valueEnd > valueHead) {
          attribute.value = { startIndex: valueHead, endIndex: valueEnd }
          return { attribute, nextIndex: valueEnd }
        }
      }
    }
  }

  // No (valid) value specification: the attribute is just its name.
  return { attribute, nextIndex: nameEnd }
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import type { INodePoint } from '@yozora/character'
|
|
2
|
+
import { AsciiCodePoint, isAsciiDigitCharacter, isAsciiLetter } from '@yozora/character'
|
|
3
|
+
|
|
4
|
+
/**
 * Consume an HTML tag name: one ASCII letter followed by any number of ASCII
 * letters, digits, or hyphens (`-`).
 *
 * @param nodePoints  code points being scanned
 * @param startIndex  index of the expected first character of the name
 * @param endIndex    index at which scanning stops (exclusive)
 * @returns the index just past the tag name, or `null` when the range is
 *          empty or does not start with an ASCII letter.
 * @see https://github.github.com/gfm/#tag-name
 */
export function eatHTMLTagName(
  nodePoints: ReadonlyArray<INodePoint>,
  startIndex: number,
  endIndex: number,
): number | null {
  if (startIndex >= endIndex) return null
  if (!isAsciiLetter(nodePoints[startIndex].codePoint)) return null

  // The first character is already validated; scan the remainder.
  let cursor = startIndex + 1
  while (cursor < endIndex) {
    const codePoint = nodePoints[cursor].codePoint
    const isNameCharacter =
      isAsciiLetter(codePoint) ||
      isAsciiDigitCharacter(codePoint) ||
      codePoint === AsciiCodePoint.MINUS_SIGN
    if (!isNameCharacter) break
    cursor += 1
  }
  return cursor
}
|