@nitpicker/analyze-main-contents 0.4.4 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/main-contents-plugin.d.ts +48 -0
- package/lib/main-contents-plugin.js +165 -0
- package/package.json +6 -6
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Plugin options for the main-contents detection analysis.
|
|
3
|
+
*/
|
|
4
|
+
type Options = {
|
|
5
|
+
/**
|
|
6
|
+
* A custom CSS selector to identify the main content element.
|
|
7
|
+
* When set to a non-empty string, it is prepended to the default selector list.
|
|
8
|
+
* All selectors run as one comma-separated query, so the first match in document order is used, which is not necessarily an element matching this selector.
|
|
9
|
+
*/
|
|
10
|
+
mainContentSelector?: string | null;
|
|
11
|
+
};
|
|
12
|
+
/**
|
|
13
|
+
* Analyze plugin that detects the main content area of each page and
|
|
14
|
+
* extracts structural metrics: a whitespace-stripped character count (`wordCount`), headings, images, and tables.
|
|
15
|
+
*
|
|
16
|
+
* ## Main content detection strategy
|
|
17
|
+
*
|
|
18
|
+
* The plugin builds a list of CSS selectors to locate
|
|
19
|
+
* the main content element. The selectors are joined with commas into
|
|
20
|
+
* a single `document.querySelector()` call, so the browser returns the
|
|
21
|
+
* first matching element in DOM order among all selectors:
|
|
22
|
+
*
|
|
23
|
+
* 1. User-supplied `mainContentSelector` (prepended via `unshift` when set)
|
|
24
|
+
* 2. `<main>` element (semantic HTML5)
|
|
25
|
+
* 3. `[role="main"]` (WAI-ARIA landmark)
|
|
26
|
+
* 4. Common id/class patterns: `#main`, `.main`, `#content`, `.content`,
|
|
27
|
+
* `#contents`, `.contents`, `#main-content`, `.main-content`,
|
|
28
|
+
* `#main_content`, `.main_content`, `#mainContent`, `.mainContent`
|
|
29
|
+
*
|
|
30
|
+
* Position in this list is not a priority: because every selector is part
|
|
31
|
+
* of one query, whichever matching element comes first in the document
|
|
32
|
+
* is used, even when `mainContentSelector` is set.
|
|
33
|
+
* @example
|
|
34
|
+
* ```jsonc
|
|
35
|
+
* // nitpicker.config.json
|
|
36
|
+
* {
|
|
37
|
+
*   "plugins": {
|
|
38
|
+
*     "analyze": {
|
|
39
|
+
*       "@nitpicker/analyze-main-contents": {
|
|
40
|
+
*         "mainContentSelector": "#page-body"
|
|
41
|
+
*       }
|
|
42
|
+
*     }
|
|
43
|
+
*   }
|
|
44
|
+
* }
|
|
45
|
+
* ```
|
|
46
|
+
*/
|
|
47
|
+
declare const _default: import("@nitpicker/core").PluginFactory<Options, "wordCount" | "headings" | "images" | "table">;
|
|
48
|
+
export default _default;
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
import { finder } from '@medv/finder'; // cspell:disable-line
|
|
2
|
+
import { definePlugin } from '@nitpicker/core';
|
|
3
|
+
/**
 * Analyze plugin that detects the main content area of each page and
 * reports structural metrics for it: text length, headings, images, and tables.
 *
 * ## Main content detection
 *
 * A list of CSS selectors is joined with commas and passed to a single
 * `document.querySelector()` call, so the element used is the first one in
 * document order that matches any selector in the list:
 *
 * 1. User-supplied `mainContentSelector` (prepended via `unshift` when set)
 * 2. `<main>` element (semantic HTML5)
 * 3. `[role="main"]` (WAI-ARIA landmark)
 * 4. Common id/class patterns: `#main`, `.main`, `#content`, `.content`,
 *    `#contents`, `.contents`, `#main-content`, `.main-content`,
 *    `#main_content`, `.main_content`, `#mainContent`, `.mainContent`
 *
 * Position in this list is not a priority: an element matching a later
 * selector is chosen when it appears earlier in the document, even if
 * `mainContentSelector` is set.
 *
 * ## Reported values
 *
 * - `wordCount`: number of characters in the main content's text after all
 *   whitespace has been removed (see `removeSpaces`).
 * - `headings`: number of `h1`-`h6` elements inside the main content.
 * - `images`: number of `img` and `input[type="image"]` elements.
 * - `table`: number of `table` elements, plus a note with one line per table
 *   (rows, cells in the first row, `thead`/`tfoot` presence, merged cells).
 *
 * When no main content element is found, all four values are reported as 0
 * and no table note is attached.
 * @example
 * ```jsonc
 * // nitpicker.config.json
 * {
 *   "plugins": {
 *     "analyze": {
 *       "@nitpicker/analyze-main-contents": {
 *         "mainContentSelector": "#page-body"
 *       }
 *     }
 *   }
 * }
 * ```
 */
export default definePlugin((options) => {
	return {
		label: 'メインコンテンツ検出',
		headers: {
			wordCount: 'Word count',
			headings: 'Heading count',
			images: 'Image count',
			table: 'Table count',
		},
		eachPage({ window }) {
			const { document } = window;
			const result = {
				main: null,
				wordCount: 0,
				headings: [],
				images: [],
				tables: [],
			};
			// Candidate selectors for the main content element.
			const selectors = [
				'main',
				'[role="main"]',
				'#main',
				'.main',
				'#content',
				'.content',
				'#contents',
				'.contents',
				'#main-content',
				'.main-content',
				'#main_content',
				'.main_content',
				'#mainContent',
				'.mainContent',
			];
			// Falsy values (undefined, null, '') leave the default list untouched.
			if (options.mainContentSelector) {
				selectors.unshift(options.mainContentSelector);
			}
			// One combined query: the first match in document order is returned,
			// regardless of where its selector sits in the list.
			const $main = document.querySelector(selectors.join(','));
			if (!$main) {
				// Report zeros under `page`, the same shape as the normal return
				// below, so pages without a main element are not dropped.
				return {
					page: {
						wordCount: {
							value: result.wordCount,
						},
						headings: {
							value: result.headings.length,
						},
						images: {
							value: result.images.length,
						},
						table: {
							value: result.tables.length,
						},
					},
				};
			}
			// Descriptor of the detected element.
			// NOTE(review): collected here but not part of the returned report.
			result.main = {
				nodeName: $main.nodeName,
				id: $main.id || null,
				classList: [...$main.classList],
				role: $main.getAttribute('role'),
				selector: finder($main, {
					root: document.body,
				}),
			};
			// The metric is a character count of the whitespace-stripped text.
			const textContent = removeSpaces($main.textContent) || '';
			result.wordCount = textContent.length;
			for (const $heading of $main.querySelectorAll('h1, h2, h3, h4, h5, h6')) {
				result.headings.push({
					text: removeSpaces($heading.textContent),
					// "H2" -> 2
					level: Number.parseInt($heading.nodeName.replace(/h/i, ''), 10),
				});
			}
			for (const $img of $main.querySelectorAll('img, input[type="image"]')) {
				result.images.push({
					src: $img.src,
					alt: $img.alt,
				});
			}
			for (const $table of $main.querySelectorAll('table')) {
				result.tables.push({
					rows: $table.querySelectorAll('tr').length,
					// Column count is taken from the first row only.
					cols: $table.querySelector('tr')?.querySelectorAll('th, td').length || 0,
					hasHeader: !!$table.querySelector('thead'),
					hasFooter: !!$table.querySelector('tfoot'),
					hasMergedCell: !!$table.querySelector('[colspan], [rowspan]'),
				});
			}
			return {
				page: {
					wordCount: {
						value: result.wordCount,
					},
					headings: {
						value: result.headings.length,
					},
					images: {
						value: result.images.length,
					},
					table: {
						value: result.tables.length,
						// Header row followed by one row per table:
						// r = rows, c = columns, h = thead, f = tfoot, m = merged cells.
						note: [
							'| r | c | h | f | m |',
							...result.tables.map((table) => {
								return `|${table.rows.toString(10).padStart(3, ' ')}| ${table.cols
									.toString(10)
									.padStart(3, ' ')} | ${table.hasHeader ? 'o' : 'x'} | ${table.hasFooter ? 'o' : 'x'} | ${table.hasMergedCell ? 'o' : 'x'} |`;
							}),
						].join('\n'),
					},
				},
			};
		},
	};
});
|
|
154
|
+
/**
 * Removes every whitespace character from a piece of text.
 *
 * The plugin measures content size as a character count rather than a
 * word count, because Japanese and other CJK text is not space-delimited.
 * Stripping whitespace first keeps formatting (indentation, line breaks)
 * from inflating that count.
 * @param text - Raw text content; may be `null` or `undefined`.
 * @returns The text without any whitespace, or `null` when the input is
 *   missing, empty, or consists only of whitespace.
 */
function removeSpaces(text) {
	if (text == null) {
		return null;
	}
	// Removing all `\s` characters also covers leading and trailing whitespace.
	const stripped = text.replace(/\s/g, '');
	if (stripped === '') {
		return null;
	}
	return stripped;
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@nitpicker/analyze-main-contents",
|
|
3
|
-
"version": "0.4.4",
|
|
3
|
+
"version": "0.6.0",
|
|
4
4
|
"description": "Nitpicker plugin for main content detection and extraction",
|
|
5
5
|
"author": "D-ZERO",
|
|
6
6
|
"license": "Apache-2.0",
|
|
@@ -18,8 +18,8 @@
|
|
|
18
18
|
"type": "module",
|
|
19
19
|
"exports": {
|
|
20
20
|
".": {
|
|
21
|
-
"import": "./lib/
|
|
22
|
-
"types": "./lib/
|
|
21
|
+
"import": "./lib/main-contents-plugin.js",
|
|
22
|
+
"types": "./lib/main-contents-plugin.d.ts"
|
|
23
23
|
}
|
|
24
24
|
},
|
|
25
25
|
"scripts": {
|
|
@@ -28,9 +28,9 @@
|
|
|
28
28
|
},
|
|
29
29
|
"dependencies": {
|
|
30
30
|
"@medv/finder": "4.0.2",
|
|
31
|
-
"@nitpicker/core": "0.
|
|
32
|
-
"@nitpicker/types": "0.
|
|
31
|
+
"@nitpicker/core": "0.6.0",
|
|
32
|
+
"@nitpicker/types": "0.6.0",
|
|
33
33
|
"jsdom": "28.1.0"
|
|
34
34
|
},
|
|
35
|
-
"gitHead": "
|
|
35
|
+
"gitHead": "eab407f5e4b58fa3c122001d3c034488e7f6da11"
|
|
36
36
|
}
|