@nitpicker/analyze-main-contents 0.6.3 → 0.6.5-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@nitpicker/analyze-main-contents",
3
- "version": "0.6.3",
3
+ "version": "0.6.5-alpha.0",
4
4
  "description": "Nitpicker plugin for main content detection and extraction",
5
5
  "author": "D-ZERO",
6
6
  "license": "Apache-2.0",
@@ -28,9 +28,9 @@
28
28
  },
29
29
  "dependencies": {
30
30
  "@medv/finder": "4.0.2",
31
- "@nitpicker/core": "0.6.3",
32
- "@nitpicker/types": "0.6.3",
31
+ "@nitpicker/core": "0.6.5-alpha.0",
32
+ "@nitpicker/types": "0.6.5-alpha.0",
33
33
  "jsdom": "28.1.0"
34
34
  },
35
- "gitHead": "4b58ca83e5f48833b22889ed46d96984938fa1c9"
35
+ "gitHead": "e084aba5a0887a80059ff8aa0608f61a58cc9288"
36
36
  }
package/lib/index.d.ts DELETED
@@ -1,48 +0,0 @@
/**
 * Plugin options for the main-contents detection analysis.
 */
type Options = {
    /**
     * A custom CSS selector identifying the main content element.
     *
     * When set to a non-empty string, it is placed ahead of the built-in
     * default selectors in the candidate list. When omitted, `null`, or an
     * empty string, only the built-in defaults are used.
     */
    mainContentSelector?: string | null;
};
/**
 * Analyze plugin that detects the main content area of each page and
 * reports four metrics for it:
 *
 * - `wordCount`: number of characters in the main content's text after
 *   all whitespace has been removed (a character count, despite the name)
 * - `headings`: number of `h1`–`h6` elements
 * - `images`: number of `img` and `input[type="image"]` elements
 * - `table`: number of `table` elements
 *
 * ## Main content candidates
 *
 * The candidate selector list is, in order:
 *
 * 1. User-supplied `mainContentSelector` (only when provided)
 * 2. `main` (semantic HTML5 element)
 * 3. `[role="main"]` (WAI-ARIA landmark)
 * 4. Common id/class patterns: `#main`, `.main`, `#content`, `.content`,
 *    `#contents`, `.contents`, `#main-content`, `.main-content`,
 *    `#main_content`, `.main_content`, `#mainContent`, `.mainContent`
 *
 * NOTE(review): earlier wording of this comment claimed both that the
 * first match in DOM order is used and that the user's selector always
 * wins; those two statements cannot both hold. Confirm the actual
 * resolution order against the implementation before relying on it.
 * @example
 * ```jsonc
 * // nitpicker.config.json
 * {
 *   "plugins": {
 *     "analyze": {
 *       "@nitpicker/analyze-main-contents": {
 *         "mainContentSelector": "#page-body"
 *       }
 *     }
 *   }
 * }
 * ```
 */
declare const _default: import("@nitpicker/core").PluginFactory<Options, "wordCount" | "headings" | "images" | "table">;
export default _default;
package/lib/index.js DELETED
@@ -1,165 +0,0 @@
1
- import { finder } from '@medv/finder'; // cspell:disable-line
2
- import { definePlugin } from '@nitpicker/core';
/**
 * Built-in main-content selectors, highest priority first: the semantic
 * `<main>` element, the WAI-ARIA `main` landmark, then common id/class
 * naming conventions.
 */
const DEFAULT_MAIN_CONTENT_SELECTORS = [
	'main',
	'[role="main"]',
	'#main',
	'.main',
	'#content',
	'.content',
	'#contents',
	'.contents',
	'#main-content',
	'.main-content',
	'#main_content',
	'.main_content',
	'#mainContent',
	'.mainContent',
];
/**
 * Analyze plugin that detects the main content area of each page and
 * extracts structural metrics: character count, headings, images, and tables.
 *
 * ## Main content detection strategy
 *
 * Candidate CSS selectors are tried one at a time, in priority order;
 * the first selector that matches any element decides the main content:
 *
 * 1. User-supplied `mainContentSelector` (only when provided)
 * 2. `<main>` element (semantic HTML5)
 * 3. `[role="main"]` (WAI-ARIA landmark)
 * 4. Common id/class patterns: `#main`, `.main`, `#content`, `.content`,
 *    `#contents`, `.contents`, `#main-content`, `.main-content`,
 *    `#main_content`, `.main_content`, `#mainContent`, `.mainContent`
 *
 * The selectors are deliberately NOT joined into a single comma-separated
 * `querySelector()` call: a selector list yields the first match in
 * document order, which would let e.g. an outer `.content` wrapper win
 * over the user's selector or over `<main>`.
 *
 * When no candidate matches, every metric is reported as `0`.
 * @example
 * ```jsonc
 * // nitpicker.config.json
 * {
 *   "plugins": {
 *     "analyze": {
 *       "@nitpicker/analyze-main-contents": {
 *         "mainContentSelector": "#page-body"
 *       }
 *     }
 *   }
 * }
 * ```
 */
export default definePlugin((options) => {
	return {
		label: 'メインコンテンツ検出',
		headers: {
			wordCount: 'Word count',
			headings: 'Heading count',
			images: 'Image count',
			table: 'Table count',
		},
		eachPage({ window }) {
			const { document } = window;
			// The user's selector, when given, goes in front of the defaults.
			const selectors = options?.mainContentSelector
				? [options.mainContentSelector, ...DEFAULT_MAIN_CONTENT_SELECTORS]
				: DEFAULT_MAIN_CONTENT_SELECTORS;
			// Query each selector separately so list order (not document
			// order) decides which element is treated as the main content.
			let $main = null;
			for (const selector of selectors) {
				$main = document.querySelector(selector);
				if ($main) {
					break;
				}
			}
			if (!$main) {
				// Report zeros in the same `page` envelope as the success path.
				return {
					page: {
						wordCount: { value: 0 },
						headings: { value: 0 },
						images: { value: 0 },
						table: { value: 0 },
					},
				};
			}
			// Character count of the text with all whitespace stripped;
			// `removeSpaces` yields `null` when nothing is left.
			const wordCount = (removeSpaces($main.textContent) ?? '').length;
			const headingCount = $main.querySelectorAll('h1, h2, h3, h4, h5, h6').length;
			const imageCount = $main.querySelectorAll('img, input[type="image"]').length;
			const tables = [...$main.querySelectorAll('table')].map(($table) => ({
				rows: $table.querySelectorAll('tr').length,
				// Column count is taken from the first row only.
				cols: $table.querySelector('tr')?.querySelectorAll('th, td').length || 0,
				hasHeader: !!$table.querySelector('thead'),
				hasFooter: !!$table.querySelector('tfoot'),
				hasMergedCell: !!$table.querySelector('[colspan], [rowspan]'),
			}));
			return {
				page: {
					wordCount: {
						value: wordCount,
					},
					headings: {
						value: headingCount,
					},
					images: {
						value: imageCount,
					},
					table: {
						value: tables.length,
						// One line per table: rows, cols, then o/x flags for
						// header (thead), footer (tfoot), and merged cells.
						note: [
							'| r | c | h | f | m |',
							...tables.map((table) => {
								return `|${table.rows.toString(10).padStart(3, ' ')}| ${table.cols
									.toString(10)
									.padStart(3, ' ')} | ${table.hasHeader ? 'o' : 'x'} | ${table.hasFooter ? 'o' : 'x'} | ${table.hasMergedCell ? 'o' : 'x'} |`;
							}),
						].join('\n'),
					},
				},
			};
		},
	};
});
/**
 * Strips all whitespace from the given text.
 *
 * Used for the "word count" metric: Japanese and other CJK text does not
 * separate words with spaces, so the character count after removing
 * formatting whitespace is a more meaningful measure than a word count.
 *
 * A single global replace removes leading, trailing, and interior
 * whitespace alike, so no separate trim is needed.
 * @param text - Raw text content (may be `null`/`undefined` for empty elements).
 * @returns The text with all whitespace removed, or `null` if the input is
 *   `null`/`undefined` or nothing remains after stripping.
 */
function removeSpaces(text) {
	if (text == null) {
		return null;
	}
	const stripped = text.replace(/\s+/g, '');
	return stripped === '' ? null : stripped;
}