@markuplint/spec-generator 4.8.0 → 4.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ARCHITECTURE.ja.md +178 -0
- package/ARCHITECTURE.md +178 -0
- package/CHANGELOG.md +9 -5
- package/README.md +29 -41
- package/SKILL.md +134 -0
- package/docs/maintenance.ja.md +212 -0
- package/docs/maintenance.md +212 -0
- package/docs/modules.ja.md +252 -0
- package/docs/modules.md +252 -0
- package/docs/scraping.ja.md +320 -0
- package/docs/scraping.md +320 -0
- package/lib/aria.d.ts +6 -0
- package/lib/aria.js +45 -0
- package/lib/fetch.d.ts +21 -0
- package/lib/fetch.js +28 -1
- package/lib/global-attrs.d.ts +6 -0
- package/lib/global-attrs.js +6 -0
- package/lib/html-elements.d.ts +8 -0
- package/lib/html-elements.js +31 -9
- package/lib/index.d.ts +22 -0
- package/lib/index.js +15 -0
- package/lib/read-json.d.ts +18 -0
- package/lib/read-json.js +18 -0
- package/lib/scraping.d.ts +15 -0
- package/lib/scraping.js +52 -0
- package/lib/svg.d.ts +7 -0
- package/lib/svg.js +7 -0
- package/lib/utils.d.ts +59 -0
- package/lib/utils.js +56 -0
- package/package.json +7 -7
package/docs/scraping.md
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
# Scraping Details
|
|
2
|
+
|
|
3
|
+
This document describes the web scraping targets, CSS selectors, caching strategy, and error handling used by `@markuplint/spec-generator`. The build is network-dependent: it issues 200+ HTTP requests to MDN and to the W3C specification pages.
|
|
4
|
+
|
|
5
|
+
## MDN Element Scraping
|
|
6
|
+
|
|
7
|
+
**Module:** `scraping.ts`
|
|
8
|
+
|
|
9
|
+
### URL Patterns
|
|
10
|
+
|
|
11
|
+
HTML elements:
|
|
12
|
+
|
|
13
|
+
```
|
|
14
|
+
https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/<name>
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
SVG elements:
|
|
18
|
+
|
|
19
|
+
```
|
|
20
|
+
https://developer.mozilla.org/en-US/docs/Web/SVG/Reference/Element/<name>
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
**Special case:** Heading elements (`h1`-`h6`) are mapped to a single page:
|
|
24
|
+
|
|
25
|
+
```
|
|
26
|
+
https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/Heading_Elements
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
### Extracted Data
|
|
30
|
+
|
|
31
|
+
For each element, `fetchHTMLElement()` extracts:
|
|
32
|
+
|
|
33
|
+
| Data | Selector / Method | Notes |
|
|
34
|
+
| ------------- | ---------------------------------------------------------------------------- | ------------------------------------------------------------------ |
|
|
35
|
+
| Description | `main#content .reference-layout__header .content-section` | Text content, whitespace-normalized |
|
|
36
|
+
| Compatibility | `.bc-table tbody tr:first-child th` (icons) | Falls back to notecard-based indicators if BC table is unavailable |
|
|
37
|
+
| Categories | `#technical_summary ~ figure.table-container > table` ("Content categories") | Matched against known category keywords |
|
|
38
|
+
| Attributes | `.content-section[aria-labelledby="<id>"] > dl > dt` | Parsed from definition lists in multiple sections |
|
|
39
|
+
|
|
40
|
+
### Compatibility Flag Detection
|
|
41
|
+
|
|
42
|
+
Two strategies are used, depending on whether the browser compatibility table is available:
|
|
43
|
+
|
|
44
|
+
**Strategy 1: Browser Compatibility Table** (when `<code>` in the first row matches the element name)
|
|
45
|
+
|
|
46
|
+
| Flag | Selector within `tbody tr:first-child th` |
|
|
47
|
+
| -------------- | ----------------------------------------- |
|
|
48
|
+
| `experimental` | `.ic-experimental` |
|
|
49
|
+
| `obsolete` | `.ic-obsolete` |
|
|
50
|
+
| `deprecated` | `.ic-deprecated` |
|
|
51
|
+
| `nonStandard` | `.ic-non-standard` |
|
|
52
|
+
|
|
53
|
+
**Strategy 2: Fallback indicators** (when BC table is missing or doesn't match)
|
|
54
|
+
|
|
55
|
+
| Flag | Selector |
|
|
56
|
+
| -------------- | ------------------------------------------------------------------------------------------------ |
|
|
57
|
+
| `experimental` | `.blockIndicator.experimental` or `> div .notecard.experimental` |
|
|
58
|
+
| `obsolete` | `.obsoleteHeader` or `h1` text contains "obsolete" or `> div:first-child .notecard.obsolete` |
|
|
59
|
+
| `deprecated` | `.deprecatedHeader` or `> div:first-child .notecard.deprecated` or `h1 + * .notecard.deprecated` |
|
|
60
|
+
| `nonStandard` | `.nonStandardHeader` or `h4#Non-standard` |
|
|
61
|
+
|
|
62
|
+
### Content Category Parsing
|
|
63
|
+
|
|
64
|
+
The "Content categories" property is extracted from the technical summary table. The text is matched against these keywords (case-insensitive):
|
|
65
|
+
|
|
66
|
+
| Keyword | Category |
|
|
67
|
+
| --------------------- | -------------------- |
|
|
68
|
+
| `metadata content` | `#metadata` |
|
|
69
|
+
| `flow content` | `#flow` |
|
|
70
|
+
| `sectioning content` | `#sectioning` |
|
|
71
|
+
| `heading content` | `#heading` |
|
|
72
|
+
| `phrasing content` | `#phrasing` |
|
|
73
|
+
| `embedded content` | `#embedded` |
|
|
74
|
+
| `interactive content` | `#interactive` |
|
|
75
|
+
| `palpable content` | `#palpable` |
|
|
76
|
+
| `script-supporting` | `#script-supporting` |
|
|
77
|
+
|
|
78
|
+
### Attribute Extraction
|
|
79
|
+
|
|
80
|
+
Attributes are extracted from up to 5 sections identified by `aria-labelledby` IDs:
|
|
81
|
+
|
|
82
|
+
| Section ID | Status Flags Applied |
|
|
83
|
+
| ------------------------- | ------------------------------- |
|
|
84
|
+
| `attributes` | Per-attribute flags from icons |
|
|
85
|
+
| `deprecated_attributes` | `deprecated: true` from heading |
|
|
86
|
+
| `individual_attributes` | Per-attribute flags from icons |
|
|
87
|
+
| `non-standard_attributes` | Per-attribute flags from icons |
|
|
88
|
+
| `obsolete_attributes` | `obsolete: true` from heading |
|
|
89
|
+
|
|
90
|
+
For each `<dt>` entry:
|
|
91
|
+
|
|
92
|
+
1. Extract attribute name from `<code>` text
|
|
93
|
+
2. Extract description from the next `<dd>` sibling(s)
|
|
94
|
+
3. Detect status flags from icon classes:
|
|
95
|
+
- `.icon-beaker`, `.icon.experimental`, `.icon.icon-experimental` -- experimental
|
|
96
|
+
- `.icon-trash`, `.icon.obsolete`, `.icon.icon-obsolete`, `.obsolete` -- obsolete
|
|
97
|
+
- `.icon-thumbs-down-alt`, `.icon.deprecated`, `.icon.icon-deprecated` -- deprecated
|
|
98
|
+
- `.icon-warning-sign`, `.icon.non-standard`, `.icon.icon-nonstandard` -- non-standard
|
|
99
|
+
4. Check heading context (`getItsHeading()`) for section-level flags
|
|
100
|
+
|
|
101
|
+
All extracted attributes are merged and sorted by key.
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## MDN SVG Index Scraping
|
|
106
|
+
|
|
107
|
+
**Module:** `svg.ts`
|
|
108
|
+
|
|
109
|
+
### Target
|
|
110
|
+
|
|
111
|
+
```
|
|
112
|
+
https://developer.mozilla.org/en-US/docs/Web/SVG/Element
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### Extraction Process
|
|
116
|
+
|
|
117
|
+
1. Unwrap all `<section>` elements (replace with children) to flatten the document structure
|
|
118
|
+
2. Find the heading with `id="obsolete_and_deprecated_elements"`
|
|
119
|
+
3. Use `getThisOutline()` to collect all siblings until the next `<h2>`
|
|
120
|
+
4. Extract element names from `div > a` elements, stripping angle brackets
|
|
121
|
+
5. Prefix each name with `svg_` (e.g., `altGlyph` becomes `svg_altGlyph`)
|
|
122
|
+
|
|
123
|
+
---
|
|
124
|
+
|
|
125
|
+
## WAI-ARIA Scraping
|
|
126
|
+
|
|
127
|
+
**Module:** `aria.ts`
|
|
128
|
+
|
|
129
|
+
### Specification URLs
|
|
130
|
+
|
|
131
|
+
| Version | URL | Status |
|
|
132
|
+
| ------- | ------------------------------------- | -------------- |
|
|
133
|
+
| 1.1 | `https://www.w3.org/TR/wai-aria-1.1/` | Recommendation |
|
|
134
|
+
| 1.2 | `https://www.w3.org/TR/wai-aria-1.2/` | Recommendation |
|
|
135
|
+
| 1.3 | `https://w3c.github.io/aria/` | Working Draft |
|
|
136
|
+
|
|
137
|
+
### Role Extraction
|
|
138
|
+
|
|
139
|
+
**Selector:** `#role_definitions section.role`
|
|
140
|
+
|
|
141
|
+
For each role section:
|
|
142
|
+
|
|
143
|
+
| Data | Selector |
|
|
144
|
+
| ---------------------------- | --------------------------------------------------- |
|
|
145
|
+
| Name | `.role-name[title]` |
|
|
146
|
+
| Description | `.role-description p` (joined with `\n\n`) |
|
|
147
|
+
| Is Abstract | `.role-abstract` text equals "true" |
|
|
148
|
+
| Generalization | `.role-parent a` |
|
|
149
|
+
| Required Properties | `.role-required-properties li` (fallback to parent) |
|
|
150
|
+
| Inherited Properties | `.role-inherited li` |
|
|
151
|
+
| Owned Properties | `.role-properties li` or `.role-properties > a` |
|
|
152
|
+
| Required Context Role | `.role-scope li` or `.role-scope a` |
|
|
153
|
+
| Required Owned Elements | `.role-mustcontain li` or `.role-mustcontain a` |
|
|
154
|
+
| Accessible Name Required | `.role-namerequired` contains "true" |
|
|
155
|
+
| Accessible Name From Author | `.role-namefrom` contains "author" |
|
|
156
|
+
| Accessible Name From Content | `.role-namefrom` contains "content" |
|
|
157
|
+
| Accessible Name Prohibited | `.role-namefrom` contains "prohibited" |
|
|
158
|
+
| Children Presentational | `.role-childpresentational` "true"/"false" |
|
|
159
|
+
| Prohibited Properties | `.role-disallowed li code` |
|
|
160
|
+
|
|
161
|
+
**Role synonym handling:**
|
|
162
|
+
|
|
163
|
+
- ARIA 1.1/1.2: `none` inherits properties from `presentation`
|
|
164
|
+
- ARIA 1.3: `presentation` inherits from `none`; `img` inherits from `image`
|
|
165
|
+
|
|
166
|
+
### Property/State Extraction
|
|
167
|
+
|
|
168
|
+
Properties are discovered from the `ownedProperties` of all scraped roles. For each property:
|
|
169
|
+
|
|
170
|
+
**Selector base:** `#<property-name>` (e.g., `#aria-label`)
|
|
171
|
+
|
|
172
|
+
| Data | Selector |
|
|
173
|
+
| ------------------ | -------------------------------------------------------------------------------------- |
|
|
174
|
+
| Type | Section class: `/property/i` matches → `"property"`, else `"state"` |
|
|
175
|
+
| Deprecated | Section class contains "deprecated" |
|
|
176
|
+
| Value type | `table .${type}-value` or `table .property-value` or `.state-features .property-value` |
|
|
177
|
+
| Value descriptions | `table:is(.value-descriptions, .def:has(.value-description)) tbody tr` |
|
|
178
|
+
| Enum values | From `.value-name` elements (only for `token` or `token list` value types) |
|
|
179
|
+
| Default value | `.value-name .default` text |
|
|
180
|
+
| Is Global | Listed in `#global_states li a` |
|
|
181
|
+
|
|
182
|
+
**Conditional value overrides:**
|
|
183
|
+
|
|
184
|
+
- `aria-checked`: Value set to `"true/false"` with conditional `"tristate"` for `checkbox` and `menuitemcheckbox` roles
|
|
185
|
+
- `aria-hidden`: The `hidden` HTML attribute equivalent is marked as `isNotStrictEquivalent`
|
|
186
|
+
|
|
187
|
+
### Global States/Properties
|
|
188
|
+
|
|
189
|
+
Global ARIA attributes are identified by collecting all `<a>` links under `#global_states li`. The hash fragment of each link is used as the property name.
|
|
190
|
+
|
|
191
|
+
---
|
|
192
|
+
|
|
193
|
+
## Graphics ARIA Scraping
|
|
194
|
+
|
|
195
|
+
**Module:** `aria.ts`
|
|
196
|
+
|
|
197
|
+
Graphics ARIA roles are fetched using the same `getRoles()` function with `graphicsAria = true`.
|
|
198
|
+
|
|
199
|
+
| Version | URL |
|
|
200
|
+
| ------- | ------------------------------------------ |
|
|
201
|
+
| 1.1 | `https://www.w3.org/TR/graphics-aria-1.0/` |
|
|
202
|
+
| 1.2 | `https://w3c.github.io/graphics-aria/` |
|
|
203
|
+
| 1.3 | `https://w3c.github.io/graphics-aria/` |
|
|
204
|
+
|
|
205
|
+
The same CSS selectors used for standard ARIA roles apply to Graphics ARIA roles.
|
|
206
|
+
|
|
207
|
+
---
|
|
208
|
+
|
|
209
|
+
## HTML-ARIA Mapping
|
|
210
|
+
|
|
211
|
+
**Module:** `aria.ts` (`getAriaInHtml()`)
|
|
212
|
+
|
|
213
|
+
### Target
|
|
214
|
+
|
|
215
|
+
```
|
|
216
|
+
https://www.w3.org/TR/html-aria/
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
### Selector
|
|
220
|
+
|
|
221
|
+
```
|
|
222
|
+
#requirements-for-use-of-aria-attributes-in-place-of-equivalent-html-attributes table tbody tr
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
For each row:
|
|
226
|
+
|
|
227
|
+
- HTML attribute name: `th:nth-of-type(1) a` (first link text)
|
|
228
|
+
- Implicit ARIA property: `td:nth-of-type(1) code` (first code element text)
|
|
229
|
+
- The property string is split on `=` to get the ARIA property name and value
|
|
230
|
+
|
|
231
|
+
**Skipped:** The `contenteditable` attribute is excluded because it requires ancestor evaluation.
|
|
232
|
+
|
|
233
|
+
---
|
|
234
|
+
|
|
235
|
+
## Caching
|
|
236
|
+
|
|
237
|
+
### In-Process Cache
|
|
238
|
+
|
|
239
|
+
Two `Map` caches exist in `fetch.ts`:
|
|
240
|
+
|
|
241
|
+
| Cache | Key | Value | Scope |
|
|
242
|
+
| ---------- | --- | --------------- | ----------------------------------- |
|
|
243
|
+
| `cache` | URL | Raw HTML string | Single build run (process lifetime) |
|
|
244
|
+
| `domCache` | URL | `CheerioAPI` | Single build run (process lifetime) |
|
|
245
|
+
|
|
246
|
+
- The same URL is never fetched twice within a single build
|
|
247
|
+
- Failed fetches are cached as empty strings, so a URL that failed is not retried within the same build
|
|
248
|
+
- There is **no persistence between builds** -- every `yarn up:gen` fetches all URLs fresh
|
|
249
|
+
|
|
250
|
+
### Cache Behavior on Failure
|
|
251
|
+
|
|
252
|
+
When `globalThis.fetch()` throws:
|
|
253
|
+
|
|
254
|
+
1. An empty string is cached for that URL
|
|
255
|
+
2. The build continues (does not abort)
|
|
256
|
+
3. Elements whose pages failed to fetch will have empty/missing metadata
|
|
257
|
+
|
|
258
|
+
---
|
|
259
|
+
|
|
260
|
+
## Error Handling
|
|
261
|
+
|
|
262
|
+
| Scenario | Behavior |
|
|
263
|
+
| --------------------- | ------------------------------------------------------------------- |
|
|
264
|
+
| HTTP fetch failure | Empty string cached, build continues, metadata is empty |
|
|
265
|
+
| Missing DOM element | Cheerio returns empty selection, fields default to empty |
|
|
266
|
+
| MDN page restructured | CSS selectors fail silently, data is lost for affected elements |
|
|
267
|
+
| W3C spec URL changed | Fetch returns error page HTML, scraping extracts garbage or nothing |
|
|
268
|
+
|
|
269
|
+
The generator does not validate scraped data against expected shapes. Incorrect or missing data will propagate silently into `index.json`.
|
|
270
|
+
|
|
271
|
+
---
|
|
272
|
+
|
|
273
|
+
## Known Fragile Points
|
|
274
|
+
|
|
275
|
+
These CSS selectors are sensitive to upstream page structure changes:
|
|
276
|
+
|
|
277
|
+
### MDN Pages
|
|
278
|
+
|
|
279
|
+
| Selector | Used For | Risk Level |
|
|
280
|
+
| --------------------------------------------------------------- | ------------------- | ---------- |
|
|
281
|
+
| `main#content` | Main article | Low |
|
|
282
|
+
| `.reference-layout__header .content-section` | Description | Medium |
|
|
283
|
+
| `.bc-table tbody tr:first-child th` | Compatibility flags | Medium |
|
|
284
|
+
| `#technical_summary ~ figure.table-container > table` | Technical summary | High |
|
|
285
|
+
| `.content-section[aria-labelledby="attributes"]` | Attribute section | Medium |
|
|
286
|
+
| `.icon-beaker`, `.icon.experimental`, `.icon.icon-experimental` | Experimental flag | High |
|
|
287
|
+
| `.icon-trash`, `.icon.obsolete`, `.icon.icon-obsolete` | Obsolete flag | High |
|
|
288
|
+
|
|
289
|
+
### W3C ARIA Spec Pages
|
|
290
|
+
|
|
291
|
+
| Selector | Used For | Risk Level |
|
|
292
|
+
| -------------------------------- | ------------------- | ---------- |
|
|
293
|
+
| `#role_definitions section.role` | Role sections | Low |
|
|
294
|
+
| `.role-name[title]` | Role name | Low |
|
|
295
|
+
| `.role-required-properties li` | Required properties | Low |
|
|
296
|
+
| `.role-properties li` | Owned properties | Low |
|
|
297
|
+
| `#global_states li a` | Global properties | Low |
|
|
298
|
+
|
|
299
|
+
W3C specs use more structured class names, so these selectors are less likely to break than the MDN selectors.
|
|
300
|
+
|
|
301
|
+
---
|
|
302
|
+
|
|
303
|
+
## Diagnosing Scraping Failures
|
|
304
|
+
|
|
305
|
+
After running `yarn up:gen`, check the diff of `index.json`:
|
|
306
|
+
|
|
307
|
+
```bash
|
|
308
|
+
git diff packages/@markuplint/html-spec/index.json
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
**Symptoms of scraping failure:**
|
|
312
|
+
|
|
313
|
+
- **Massive data loss** -- Large chunks of specification data disappear from `index.json`. This almost certainly indicates a scraping failure, not an actual spec change.
|
|
314
|
+
- **Empty descriptions** -- Multiple elements suddenly have empty `description` fields
|
|
315
|
+
- **Missing attributes** -- Attributes that were previously present are gone
|
|
316
|
+
- **Empty ARIA data** -- Role or property definitions are empty or significantly reduced
|
|
317
|
+
|
|
318
|
+
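**Root cause:** In most cases, the referenced site (MDN, W3C) has changed its HTML structure, information layout, or element IDs/classes. A fetch failure produces the same symptoms, because a failed request is cached as an empty string and the build continues without aborting; re-run the build once to rule out a transient network problem before changing any selectors.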
|
|
319
|
+
|
|
320
|
+
**Resolution:** Identify which module's CSS selectors are broken by inspecting the actual page structure, then update the selectors in `scraping.ts`, `svg.ts`, or `aria.ts` to match the new structure. Re-run `yarn up:gen` to verify.
|
package/lib/aria.d.ts
CHANGED
|
@@ -1,4 +1,10 @@
|
|
|
1
1
|
import type { ARIAProperty, ARIARoleInSchema } from '@markuplint/ml-spec';
|
|
2
|
+
/**
|
|
3
|
+
* Fetches and assembles the complete ARIA specification data for all supported versions (1.1, 1.2, 1.3).
|
|
4
|
+
* For each version, gathers roles, properties/states, and graphics ARIA roles by scraping the W3C specs.
|
|
5
|
+
*
|
|
6
|
+
* @returns An object keyed by ARIA version, each containing `roles`, `props`, and `graphicsRoles`
|
|
7
|
+
*/
|
|
2
8
|
export declare function getAria(): Promise<{
|
|
3
9
|
'1.3': {
|
|
4
10
|
roles: ARIARoleInSchema[];
|
package/lib/aria.js
CHANGED
|
@@ -1,5 +1,11 @@
|
|
|
1
1
|
import { fetch } from './fetch.js';
|
|
2
2
|
import { arrayUnique, nameCompare } from './utils.js';
|
|
3
|
+
/**
|
|
4
|
+
* Fetches and assembles the complete ARIA specification data for all supported versions (1.1, 1.2, 1.3).
|
|
5
|
+
* For each version, gathers roles, properties/states, and graphics ARIA roles by scraping the W3C specs.
|
|
6
|
+
*
|
|
7
|
+
* @returns An object keyed by ARIA version, each containing `roles`, `props`, and `graphicsRoles`
|
|
8
|
+
*/
|
|
3
9
|
export async function getAria() {
|
|
4
10
|
const roles13 = await getRoles('1.3');
|
|
5
11
|
const roles12 = await getRoles('1.2');
|
|
@@ -22,6 +28,13 @@ export async function getAria() {
|
|
|
22
28
|
},
|
|
23
29
|
};
|
|
24
30
|
}
|
|
31
|
+
/**
|
|
32
|
+
* Returns the URL of the WAI-ARIA specification for a given version.
|
|
33
|
+
*
|
|
34
|
+
* @param version - The ARIA specification version
|
|
35
|
+
* @param graphicsAria - Whether to return the Graphics ARIA module URL instead
|
|
36
|
+
* @returns The specification URL string
|
|
37
|
+
*/
|
|
25
38
|
function getARIASpecURLByVersion(version, graphicsAria = false) {
|
|
26
39
|
switch (version) {
|
|
27
40
|
case '1.3': {
|
|
@@ -44,6 +57,15 @@ function getARIASpecURLByVersion(version, graphicsAria = false) {
|
|
|
44
57
|
}
|
|
45
58
|
}
|
|
46
59
|
}
|
|
60
|
+
/**
|
|
61
|
+
* Scrapes ARIA role definitions from the W3C specification page for a given version.
|
|
62
|
+
* Extracts role metadata including generalization, owned properties, required context,
|
|
63
|
+
* accessible name requirements, and more. Handles role synonyms (e.g., `none`/`presentation`).
|
|
64
|
+
*
|
|
65
|
+
* @param version - The ARIA specification version to scrape
|
|
66
|
+
* @param graphicsAria - Whether to scrape the Graphics ARIA module instead
|
|
67
|
+
* @returns A sorted array of ARIA role schema objects
|
|
68
|
+
*/
|
|
47
69
|
async function getRoles(version, graphicsAria = false) {
|
|
48
70
|
const $ = await fetch(getARIASpecURLByVersion(version, graphicsAria));
|
|
49
71
|
const $roleList = $('#role_definitions section.role');
|
|
@@ -178,6 +200,15 @@ async function getRoles(version, graphicsAria = false) {
|
|
|
178
200
|
roles.sort(nameCompare);
|
|
179
201
|
return roles;
|
|
180
202
|
}
|
|
203
|
+
/**
|
|
204
|
+
* Scrapes ARIA properties and states from the specification for a given version.
|
|
205
|
+
* Builds a list of all ARIA properties referenced by the provided roles, enriches each
|
|
206
|
+
* with value types, enum values, default values, global status, and equivalent HTML attributes.
|
|
207
|
+
*
|
|
208
|
+
* @param version - The ARIA specification version to scrape
|
|
209
|
+
* @param roles - The array of role definitions used to discover which properties exist
|
|
210
|
+
* @returns A sorted array of ARIA property definitions
|
|
211
|
+
*/
|
|
181
212
|
async function getProps(version, roles) {
|
|
182
213
|
const $ = await fetch(getARIASpecURLByVersion(version));
|
|
183
214
|
const ariaNameList = new Set();
|
|
@@ -278,6 +309,12 @@ async function getProps(version, roles) {
|
|
|
278
309
|
arias.sort(nameCompare);
|
|
279
310
|
return arias;
|
|
280
311
|
}
|
|
312
|
+
/**
|
|
313
|
+
* Scrapes the W3C HTML-ARIA specification to extract the mapping between
|
|
314
|
+
* HTML attributes and their equivalent implicit ARIA properties.
|
|
315
|
+
*
|
|
316
|
+
* @returns An object containing `implicitProps`, an array of mappings from HTML attribute names to ARIA property names and values
|
|
317
|
+
*/
|
|
281
318
|
async function getAriaInHtml() {
|
|
282
319
|
const $ = await fetch('https://www.w3.org/TR/html-aria/');
|
|
283
320
|
const implicitProps = [];
|
|
@@ -306,6 +343,14 @@ async function getAriaInHtml() {
|
|
|
306
343
|
implicitProps,
|
|
307
344
|
};
|
|
308
345
|
}
|
|
346
|
+
/**
|
|
347
|
+
* Tries multiple CSS selectors on a Cheerio element and returns the first non-empty match.
|
|
348
|
+
* Falls back to the last selector's result if none match.
|
|
349
|
+
*
|
|
350
|
+
* @param $el - The Cheerio element to search within
|
|
351
|
+
* @param selectors - An ordered list of CSS selectors to try
|
|
352
|
+
* @returns The Cheerio selection from the first matching selector, or the last selector's (empty) result
|
|
353
|
+
*/
|
|
309
354
|
function $$(
|
|
310
355
|
// eslint-disable-next-line @typescript-eslint/prefer-readonly-parameter-types
|
|
311
356
|
$el, selectors) {
|
package/lib/fetch.d.ts
CHANGED
|
@@ -1,4 +1,25 @@
|
|
|
1
1
|
import * as cheerio from 'cheerio';
|
|
2
|
+
/**
|
|
3
|
+
* Fetches a URL and returns a parsed Cheerio DOM instance.
|
|
4
|
+
* Results are cached so subsequent calls with the same URL avoid re-fetching and re-parsing.
|
|
5
|
+
*
|
|
6
|
+
* @param url - The URL to fetch and parse as HTML
|
|
7
|
+
* @returns A Cheerio API instance for querying the fetched document
|
|
8
|
+
*/
|
|
2
9
|
export declare function fetch(url: string): Promise<cheerio.CheerioAPI>;
|
|
10
|
+
/**
|
|
11
|
+
* Fetches the raw text content of a URL.
|
|
12
|
+
* Results are cached so repeated requests for the same URL return the cached response.
|
|
13
|
+
* Updates the CLI progress bar on each call.
|
|
14
|
+
*
|
|
15
|
+
* @param url - The URL to fetch
|
|
16
|
+
* @returns The raw text content of the HTTP response, or an empty string on failure
|
|
17
|
+
*/
|
|
3
18
|
export declare function fetchText(url: string): Promise<string>;
|
|
19
|
+
/**
|
|
20
|
+
* Finalizes the fetch progress bar and returns a sorted list of all URLs that were fetched.
|
|
21
|
+
* Should be called after all fetch operations are complete.
|
|
22
|
+
*
|
|
23
|
+
* @returns A sorted array of all fetched URL strings (used as reference citations)
|
|
24
|
+
*/
|
|
4
25
|
export declare function getReferences(): string[];
|
package/lib/fetch.js
CHANGED
|
@@ -1,6 +1,12 @@
|
|
|
1
1
|
import * as cheerio from 'cheerio';
|
|
2
2
|
import { Bar, Presets } from 'cli-progress';
|
|
3
|
+
/**
|
|
4
|
+
* In-memory cache mapping URLs to their raw HTML text responses.
|
|
5
|
+
*/
|
|
3
6
|
const cache = new Map();
|
|
7
|
+
/**
|
|
8
|
+
* In-memory cache mapping URLs to their parsed Cheerio DOM instances.
|
|
9
|
+
*/
|
|
4
10
|
const domCache = new Map();
|
|
5
11
|
let total = 1;
|
|
6
12
|
let current = 0;
|
|
@@ -8,6 +14,13 @@ const bar = new Bar({
|
|
|
8
14
|
format: '🔎 Fetch references... {bar} {percentage}% | ETA: {eta}s | {value}/{total} {process}',
|
|
9
15
|
}, Presets.shades_grey);
|
|
10
16
|
bar.start(total, current, { process: '🚀 Started.' });
|
|
17
|
+
/**
|
|
18
|
+
* Fetches a URL and returns a parsed Cheerio DOM instance.
|
|
19
|
+
* Results are cached so subsequent calls with the same URL avoid re-fetching and re-parsing.
|
|
20
|
+
*
|
|
21
|
+
* @param url - The URL to fetch and parse as HTML
|
|
22
|
+
* @returns A Cheerio API instance for querying the fetched document
|
|
23
|
+
*/
|
|
11
24
|
export async function fetch(url) {
|
|
12
25
|
if (domCache.has(url)) {
|
|
13
26
|
return domCache.get(url);
|
|
@@ -17,10 +30,18 @@ export async function fetch(url) {
|
|
|
17
30
|
domCache.set(url, $);
|
|
18
31
|
return $;
|
|
19
32
|
}
|
|
33
|
+
/**
|
|
34
|
+
* Fetches the raw text content of a URL.
|
|
35
|
+
* Results are cached so repeated requests for the same URL return the cached response.
|
|
36
|
+
* Updates the CLI progress bar on each call.
|
|
37
|
+
*
|
|
38
|
+
* @param url - The URL to fetch
|
|
39
|
+
* @returns The raw text content of the HTTP response, or an empty string on failure
|
|
40
|
+
*/
|
|
20
41
|
export async function fetchText(url) {
|
|
21
42
|
total += 1;
|
|
22
43
|
bar.setTotal(total);
|
|
23
|
-
let text
|
|
44
|
+
let text;
|
|
24
45
|
if (cache.has(url)) {
|
|
25
46
|
text = cache.get(url);
|
|
26
47
|
}
|
|
@@ -39,6 +60,12 @@ export async function fetchText(url) {
|
|
|
39
60
|
bar.update(current, { process: `🔗 ${url.length > 30 ? `${url.slice(0, 15)}...${url.slice(-15)}` : url}` });
|
|
40
61
|
return text;
|
|
41
62
|
}
|
|
63
|
+
/**
|
|
64
|
+
* Finalizes the fetch progress bar and returns a sorted list of all URLs that were fetched.
|
|
65
|
+
* Should be called after all fetch operations are complete.
|
|
66
|
+
*
|
|
67
|
+
* @returns A sorted array of all fetched URL strings (used as reference citations)
|
|
68
|
+
*/
|
|
42
69
|
export function getReferences() {
|
|
43
70
|
current += 1;
|
|
44
71
|
bar.update(current, { process: '🎉 Finished.' });
|
package/lib/global-attrs.d.ts
CHANGED
|
@@ -1,3 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Reads the global HTML attributes definition from a JSON file.
|
|
3
|
+
*
|
|
4
|
+
* @param filePath - The absolute path to the common attributes JSON file
|
|
5
|
+
* @returns The parsed global attributes specification object
|
|
6
|
+
*/
|
|
1
7
|
export declare function getGlobalAttrs(filePath: string): {
|
|
2
8
|
readonly [category: string]: Readonly<Record<string, Partial<import("@markuplint/ml-spec").Attribute>>>;
|
|
3
9
|
};
|
package/lib/global-attrs.js
CHANGED
|
@@ -1,4 +1,10 @@
|
|
|
1
1
|
import { readJson } from './read-json.js';
|
|
2
|
+
/**
|
|
3
|
+
* Reads the global HTML attributes definition from a JSON file.
|
|
4
|
+
*
|
|
5
|
+
* @param filePath - The absolute path to the common attributes JSON file
|
|
6
|
+
* @returns The parsed global attributes specification object
|
|
7
|
+
*/
|
|
2
8
|
export function getGlobalAttrs(filePath) {
|
|
3
9
|
const gAttrs = readJson(filePath);
|
|
4
10
|
return gAttrs;
|
package/lib/html-elements.d.ts
CHANGED
|
@@ -1,2 +1,10 @@
|
|
|
1
1
|
import type { ExtendedElementSpec } from '@markuplint/ml-spec';
|
|
2
|
+
/**
|
|
3
|
+
* Builds the complete list of HTML and SVG element specifications by reading local JSON spec files,
|
|
4
|
+
* enriching them with data scraped from MDN, and appending obsolete/deprecated elements.
|
|
5
|
+
* Elements are sorted alphabetically with SVG elements placed after HTML elements.
|
|
6
|
+
*
|
|
7
|
+
* @param filePattern - An absolute glob pattern matching the per-element JSON spec files
|
|
8
|
+
* @returns A sorted array of extended element specification objects
|
|
9
|
+
*/
|
|
2
10
|
export declare function getElements(filePattern: string): Promise<ExtendedElementSpec[]>;
|
package/lib/html-elements.js
CHANGED
|
@@ -3,6 +3,8 @@ import { fetchHTMLElement, fetchObsoleteElements } from './scraping.js';
|
|
|
3
3
|
import { getSVGElementList } from './svg.js';
|
|
4
4
|
import { getName, nameCompare, sortObjectByKey } from './utils.js';
|
|
5
5
|
/**
|
|
6
|
+
* List of non-conforming (obsolete) HTML elements.
|
|
7
|
+
*
|
|
6
8
|
* @see https://html.spec.whatwg.org/multipage/obsolete.html#non-conforming-features
|
|
7
9
|
*/
|
|
8
10
|
const obsoleteList = [
|
|
@@ -36,6 +38,14 @@ const obsoleteList = [
|
|
|
36
38
|
'spacer',
|
|
37
39
|
'tt',
|
|
38
40
|
];
|
|
41
|
+
/**
|
|
42
|
+
* Builds the complete list of HTML and SVG element specifications by reading local JSON spec files,
|
|
43
|
+
* enriching them with data scraped from MDN, and appending obsolete/deprecated elements.
|
|
44
|
+
* Elements are sorted alphabetically with SVG elements placed after HTML elements.
|
|
45
|
+
*
|
|
46
|
+
* @param filePattern - An absolute glob pattern matching the per-element JSON spec files
|
|
47
|
+
* @returns A sorted array of extended element specification objects
|
|
48
|
+
*/
|
|
39
49
|
export async function getElements(filePattern) {
|
|
40
50
|
let specs = await readJsons(filePattern, (file, body) => {
|
|
41
51
|
const name = file.replace(/^.+spec\.([\w-]+)\.json$/i, '$1');
|
|
@@ -88,15 +98,27 @@ export async function getElements(filePattern) {
|
|
|
88
98
|
};
|
|
89
99
|
continue;
|
|
90
100
|
}
|
|
91
|
-
if (typeof current === 'object'
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
101
|
+
if (typeof current === 'object') {
|
|
102
|
+
if ('name' in current) {
|
|
103
|
+
attrs[mdnAttr.name] = {
|
|
104
|
+
// @ts-ignore for key order that "name" is first
|
|
105
|
+
name: mdnAttr.name,
|
|
106
|
+
// @ts-ignore for key order that "description" is second
|
|
107
|
+
...mdnData.attributes,
|
|
108
|
+
// @ts-ignore
|
|
109
|
+
...current,
|
|
110
|
+
};
|
|
111
|
+
}
|
|
112
|
+
else {
|
|
113
|
+
attrs[mdnAttr.name] = {
|
|
114
|
+
description: mdnAttr.description,
|
|
115
|
+
experimental: mdnAttr.experimental,
|
|
116
|
+
obsolete: mdnAttr.obsolete,
|
|
117
|
+
deprecated: mdnAttr.deprecated,
|
|
118
|
+
nonStandard: mdnAttr.nonStandard,
|
|
119
|
+
...current,
|
|
120
|
+
};
|
|
121
|
+
}
|
|
100
122
|
}
|
|
101
123
|
}
|
|
102
124
|
return attrs;
|
package/lib/index.d.ts
CHANGED
|
@@ -1,7 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module @markuplint/spec-generator
|
|
3
|
+
*
|
|
4
|
+
* Generates the markuplint extended specification JSON file by scraping W3C and MDN web standards
|
|
5
|
+
* documentation. Aggregates HTML/SVG element specs, global attributes, ARIA roles and properties,
|
|
6
|
+
* and content model definitions into a single output file consumed by the markuplint linter.
|
|
7
|
+
*/
|
|
8
|
+
/**
|
|
9
|
+
* Configuration options for the spec generator.
|
|
10
|
+
*/
|
|
1
11
|
export type Options = {
|
|
12
|
+
/** The absolute file path where the generated JSON spec will be written. */
|
|
2
13
|
readonly outputFilePath: string;
|
|
14
|
+
/** An absolute glob pattern matching per-element HTML spec JSON files. */
|
|
3
15
|
readonly htmlFilePattern: string;
|
|
16
|
+
/** The absolute file path to the common (global) attributes JSON definition. */
|
|
4
17
|
readonly commonAttrsFilePath: string;
|
|
18
|
+
/** The absolute file path to the common content models JSON definition. */
|
|
5
19
|
readonly commonContentsFilePath: string;
|
|
6
20
|
};
|
|
21
|
+
/**
|
|
22
|
+
* Main entry point for the spec generator. Fetches and aggregates all specification data
|
|
23
|
+
* (HTML/SVG elements, global attributes, ARIA definitions, content models, and reference URLs)
|
|
24
|
+
* then writes the combined result as a JSON file to the specified output path.
|
|
25
|
+
*
|
|
26
|
+
* @param options - The configuration options controlling input sources and output destination
|
|
27
|
+
* @returns A promise that resolves when the output file has been written
|
|
28
|
+
*/
|
|
7
29
|
export declare function main({ outputFilePath, htmlFilePattern, commonAttrsFilePath, commonContentsFilePath }: Options): Promise<void>;
|
package/lib/index.js
CHANGED
|
@@ -1,9 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module @markuplint/spec-generator
|
|
3
|
+
*
|
|
4
|
+
* Generates the markuplint extended specification JSON file by scraping W3C and MDN web standards
|
|
5
|
+
* documentation. Aggregates HTML/SVG element specs, global attributes, ARIA roles and properties,
|
|
6
|
+
* and content model definitions into a single output file consumed by the markuplint linter.
|
|
7
|
+
*/
|
|
1
8
|
import { writeFile } from 'node:fs/promises';
|
|
2
9
|
import { getAria } from './aria.js';
|
|
3
10
|
import { getReferences } from './fetch.js';
|
|
4
11
|
import { getGlobalAttrs } from './global-attrs.js';
|
|
5
12
|
import { getElements } from './html-elements.js';
|
|
6
13
|
import { readJson } from './read-json.js';
|
|
14
|
+
/**
|
|
15
|
+
* Main entry point for the spec generator. Fetches and aggregates all specification data
|
|
16
|
+
* (HTML/SVG elements, global attributes, ARIA definitions, content models, and reference URLs)
|
|
17
|
+
* then writes the combined result as a JSON file to the specified output path.
|
|
18
|
+
*
|
|
19
|
+
* @param options - The configuration options controlling input sources and output destination
|
|
20
|
+
* @returns A promise that resolves when the output file has been written
|
|
21
|
+
*/
|
|
7
22
|
export async function main({ outputFilePath, htmlFilePattern, commonAttrsFilePath, commonContentsFilePath }) {
|
|
8
23
|
const [specs, globalAttrs, aria] = await Promise.all([
|
|
9
24
|
getElements(htmlFilePattern),
|
package/lib/read-json.d.ts
CHANGED
|
@@ -1,2 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Reads and parses a single JSON file (with support for JSON comments) from an absolute file path.
|
|
3
|
+
*
|
|
4
|
+
* @template T - The expected shape of the parsed JSON data
|
|
5
|
+
* @param filePath - The absolute file path to the JSON file
|
|
6
|
+
* @returns The parsed JSON content
|
|
7
|
+
* @throws If the provided path is not absolute
|
|
8
|
+
*/
|
|
1
9
|
export declare function readJson<T = Record<string, any>>(filePath: string): T;
|
|
10
|
+
/**
|
|
11
|
+
* Reads multiple JSON files matching a glob pattern and optionally transforms each result.
|
|
12
|
+
* All matched files are read and parsed in parallel.
|
|
13
|
+
*
|
|
14
|
+
* @template T - The expected shape of each parsed JSON file
|
|
15
|
+
* @param pattern - An absolute glob pattern to match JSON files
|
|
16
|
+
* @param hook - An optional transformation function called with each file path and its parsed body
|
|
17
|
+
* @returns An array of parsed (and optionally transformed) JSON objects
|
|
18
|
+
* @throws If the provided pattern is not an absolute path
|
|
19
|
+
*/
|
|
2
20
|
export declare function readJsons<T = Record<string, any>>(pattern: string, hook?: (fileName: string, body: T) => T | Promise<T>): Promise<T[]>;
|