jscpd-rs 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. package/CHANGELOG.md +69 -0
  2. package/Cargo.lock +1323 -0
  3. package/Cargo.toml +54 -0
  4. package/LICENSE +21 -0
  5. package/README.md +372 -0
  6. package/docs/api-parity.md +49 -0
  7. package/docs/cloning-plan.md +281 -0
  8. package/docs/compat-baseline.md +535 -0
  9. package/docs/format-porting.md +86 -0
  10. package/docs/junior-task-template.md +62 -0
  11. package/docs/junior-workflow.md +87 -0
  12. package/docs/migrating-from-jscpd.md +193 -0
  13. package/docs/npm-release.md +116 -0
  14. package/docs/public-benchmark-suite.md +81 -0
  15. package/docs/release-checklist.md +200 -0
  16. package/docs/release-decisions.md +103 -0
  17. package/docs/release-readiness.md +51 -0
  18. package/docs/upstream-bugs.md +501 -0
  19. package/docs/upstream-issue-drafts.md +393 -0
  20. package/docs/user-guide.md +309 -0
  21. package/examples/dump_oxc_tokens.rs +112 -0
  22. package/examples/library_api.rs +42 -0
  23. package/npm/bin/jscpd-rs.js +6 -0
  24. package/npm/bin/jscpd-server.js +6 -0
  25. package/npm/lib/run-binary.js +68 -0
  26. package/npm/scripts/postinstall.js +50 -0
  27. package/package.json +53 -0
  28. package/skills/dry-refactoring/SKILL.md +63 -0
  29. package/skills/jscpd/SKILL.md +85 -0
  30. package/src/app.rs +512 -0
  31. package/src/bin/jscpd-server.rs +429 -0
  32. package/src/blame.rs +130 -0
  33. package/src/cli/config.rs +543 -0
  34. package/src/cli/parsing.rs +301 -0
  35. package/src/cli/tests.rs +543 -0
  36. package/src/cli.rs +671 -0
  37. package/src/detector/matching/secondary.rs +387 -0
  38. package/src/detector/matching.rs +274 -0
  39. package/src/detector/model.rs +190 -0
  40. package/src/detector/prepare.rs +71 -0
  41. package/src/detector/skip_local.rs +40 -0
  42. package/src/detector/statistics.rs +138 -0
  43. package/src/detector/store.rs +96 -0
  44. package/src/detector/tests.rs +238 -0
  45. package/src/detector.rs +265 -0
  46. package/src/files/discovery.rs +508 -0
  47. package/src/files/gitignore.rs +203 -0
  48. package/src/files/paths.rs +68 -0
  49. package/src/files/shebang.rs +106 -0
  50. package/src/files/tests.rs +523 -0
  51. package/src/files.rs +25 -0
  52. package/src/formats.rs +570 -0
  53. package/src/lib.rs +433 -0
  54. package/src/main.rs +26 -0
  55. package/src/report/ai.rs +125 -0
  56. package/src/report/badge.rs +238 -0
  57. package/src/report/console.rs +180 -0
  58. package/src/report/console_common.rs +37 -0
  59. package/src/report/console_full.rs +139 -0
  60. package/src/report/csv.rs +65 -0
  61. package/src/report/escape.rs +8 -0
  62. package/src/report/file_output.rs +28 -0
  63. package/src/report/html/assets.rs +47 -0
  64. package/src/report/html.rs +336 -0
  65. package/src/report/json.rs +119 -0
  66. package/src/report/markdown.rs +125 -0
  67. package/src/report/sarif.rs +302 -0
  68. package/src/report/silent.rs +22 -0
  69. package/src/report/source.rs +38 -0
  70. package/src/report/summary.rs +50 -0
  71. package/src/report/test_support.rs +133 -0
  72. package/src/report/threshold.rs +76 -0
  73. package/src/report/xcode.rs +90 -0
  74. package/src/report/xml.rs +119 -0
  75. package/src/report.rs +250 -0
  76. package/src/server/mcp.rs +942 -0
  77. package/src/server.rs +1081 -0
  78. package/src/tokenizer/apex.rs +97 -0
  79. package/src/tokenizer/blocks.rs +532 -0
  80. package/src/tokenizer/embedded.rs +106 -0
  81. package/src/tokenizer/generic.rs +511 -0
  82. package/src/tokenizer/hash.rs +27 -0
  83. package/src/tokenizer/ignore.rs +33 -0
  84. package/src/tokenizer/line_index.rs +33 -0
  85. package/src/tokenizer/markdown.rs +289 -0
  86. package/src/tokenizer/markup_attrs.rs +289 -0
  87. package/src/tokenizer/oxc/fallback.rs +275 -0
  88. package/src/tokenizer/oxc/jsx.rs +168 -0
  89. package/src/tokenizer/oxc/kind.rs +177 -0
  90. package/src/tokenizer/oxc/lexical.rs +67 -0
  91. package/src/tokenizer/oxc.rs +659 -0
  92. package/src/tokenizer/scan.rs +88 -0
  93. package/src/tokenizer/tap.rs +150 -0
  94. package/src/tokenizer/tests.rs +915 -0
  95. package/src/tokenizer.rs +328 -0
  96. package/src/verbose.rs +195 -0
@@ -0,0 +1,97 @@
1
+ use crate::cli::Options;
2
+
3
+ use super::embedded::{
4
+ assign_sequential_positions, offset_tokens, tokenize_generic_with_whitespace,
5
+ };
6
+ use super::{LineIndex, TokenMap, find_ignore_regions, tokenize_generic};
7
+
8
+ pub(super) fn tokenize_maps(
9
+ content: &str,
10
+ options: &Options,
11
+ ignore_regions: &[[usize; 2]],
12
+ ) -> Vec<TokenMap> {
13
+ let mut maps = Vec::new();
14
+ let apex_tokens = tokenize_generic(content, "apex", options, ignore_regions);
15
+ if !apex_tokens.is_empty() {
16
+ maps.push(TokenMap {
17
+ format: "apex".to_string(),
18
+ tokens: apex_tokens,
19
+ positions_assigned: false,
20
+ });
21
+ }
22
+
23
+ let sql_blocks = soql_blocks(content);
24
+ if sql_blocks.is_empty() {
25
+ return maps;
26
+ }
27
+
28
+ let line_index = LineIndex::new(content);
29
+ let mut sql_tokens = Vec::new();
30
+ for block in sql_blocks {
31
+ let inner = &content[block.start..block.end];
32
+ let inner_ignore_regions = find_ignore_regions(inner, options);
33
+ let mut tokens =
34
+ tokenize_generic_with_whitespace(inner, "sql", options, &inner_ignore_regions);
35
+ let block_start = line_index.location(block.start);
36
+ offset_tokens(&mut tokens, block.start, &block_start);
37
+ sql_tokens.extend(tokens);
38
+ }
39
+
40
+ if !sql_tokens.is_empty() {
41
+ sql_tokens.sort_by_key(|token| (token.range[0], token.range[1]));
42
+ assign_sequential_positions(&mut sql_tokens);
43
+ maps.push(TokenMap {
44
+ format: "sql".to_string(),
45
+ tokens: sql_tokens,
46
+ positions_assigned: true,
47
+ });
48
+ }
49
+
50
+ maps
51
+ }
52
+
53
/// Byte span of one bracketed query candidate inside an Apex source.
#[derive(Copy, Clone)]
struct SoqlBlock {
    /// Byte offset where the span begins (the opening `[` when produced by `soql_blocks`).
    start: usize,
    /// Exclusive end offset (one past the closing `]` when produced by `soql_blocks`).
    end: usize,
}
58
+
59
+ fn soql_blocks(content: &str) -> Vec<SoqlBlock> {
60
+ let bytes = content.as_bytes();
61
+ let mut blocks = Vec::new();
62
+ let mut idx = 0usize;
63
+ while idx < bytes.len() {
64
+ if bytes[idx] != b'[' {
65
+ idx += 1;
66
+ continue;
67
+ }
68
+ let Some(end) = find_closing_bracket(bytes, idx + 1) else {
69
+ break;
70
+ };
71
+ if looks_like_soql(&content[idx + 1..end]) {
72
+ blocks.push(SoqlBlock {
73
+ start: idx,
74
+ end: end + 1,
75
+ });
76
+ }
77
+ idx = end + 1;
78
+ }
79
+ blocks
80
+ }
81
+
82
/// Returns the index of the first `]` at or after `start`, if there is one.
///
/// # Panics
///
/// Panics when `start` is greater than `bytes.len()`.
fn find_closing_bracket(bytes: &[u8], start: usize) -> Option<usize> {
    let tail = &bytes[start..];
    for (offset, byte) in tail.iter().enumerate() {
        if *byte == b']' {
            return Some(start + offset);
        }
    }
    None
}
88
+
89
/// Reports whether bracket contents start with a SOQL `SELECT` or SOSL `FIND` keyword.
///
/// Leading whitespace is ignored and the keyword comparison is ASCII
/// case-insensitive. The keyword must end at a word boundary: it has to be
/// followed by the end of the text or by a character that cannot continue an
/// identifier. This keeps ordinary index expressions such as `[selectedIndex]`
/// or `[findIndex(x)]` from being treated as embedded queries.
fn looks_like_soql(content: &str) -> bool {
    let trimmed = content.trim_start();
    ["select", "find"].iter().any(|keyword| {
        // `get` yields `None` when the text is too short or the cut would
        // split a multi-byte character, so this never panics.
        let Some(prefix) = trimmed.get(..keyword.len()) else {
            return false;
        };
        if !prefix.eq_ignore_ascii_case(keyword) {
            return false;
        }
        // `get` succeeded above, so `keyword.len()` is a valid char boundary.
        !trimmed[keyword.len()..]
            .chars()
            .next()
            .is_some_and(|next| next.is_alphanumeric() || next == '_')
    })
}
@@ -0,0 +1,532 @@
1
+ use std::collections::BTreeMap;
2
+
3
+ use crate::cli::Options;
4
+ use crate::formats;
5
+
6
+ use super::embedded::{
7
+ assign_sequential_positions, blank_ranges_preserve_newlines, offset_tokens,
8
+ tokenize_generic_with_whitespace,
9
+ };
10
+ use super::markup_attrs::{
11
+ append_inline_style_attr_tokens, find_inline_style_attrs, inline_style_attr_ranges,
12
+ };
13
+ use super::scan::line_spans;
14
+ use super::{
15
+ DetectionToken, LineIndex, TokenMap, find_ignore_regions, is_oxc_format, tokenize_generic,
16
+ tokenize_oxc_maps,
17
+ };
18
+
19
+ const MAX_BLOCK_SOURCE_LENGTH: usize = 5_000_000;
20
+
21
+ pub(super) fn tokenize_maps(
22
+ content: &str,
23
+ format: &str,
24
+ options: &Options,
25
+ ignore_regions: &[[usize; 2]],
26
+ ) -> Vec<TokenMap> {
27
+ if matches!(format, "svelte" | "astro") && content.len() > MAX_BLOCK_SOURCE_LENGTH {
28
+ return Vec::new();
29
+ }
30
+
31
+ match format {
32
+ "markup" => tokenize_markup_maps(content, options, ignore_regions),
33
+ "vue" => tokenize_vue_maps(content, options),
34
+ "svelte" => tokenize_svelte_maps(content, options, ignore_regions),
35
+ "astro" => tokenize_astro_maps(content, options, ignore_regions),
36
+ _ => Vec::new(),
37
+ }
38
+ }
39
+
40
+ fn tokenize_markup_maps(
41
+ content: &str,
42
+ options: &Options,
43
+ ignore_regions: &[[usize; 2]],
44
+ ) -> Vec<TokenMap> {
45
+ let blocks = find_tag_blocks(content, &["script", "style"]);
46
+ let inner_ranges = blocks
47
+ .iter()
48
+ .map(|block| [block.inner_start, block.inner_end])
49
+ .collect::<Vec<_>>();
50
+ let sanitized = blank_ranges_preserve_newlines(content, &inner_ranges);
51
+ let mut grouped = BTreeMap::<String, Vec<DetectionToken>>::new();
52
+ let line_index = LineIndex::new(content);
53
+ append_markup_fragment_tokens(
54
+ &mut grouped,
55
+ &sanitized,
56
+ options,
57
+ ignore_regions,
58
+ &line_index,
59
+ false,
60
+ );
61
+ for block in blocks {
62
+ let format = resolve_markup_block_format(&block);
63
+ append_offset_block_tokens(&mut grouped, content, &block, &format, options, &line_index);
64
+ }
65
+
66
+ grouped_maps(grouped)
67
+ }
68
+
69
+ fn tokenize_vue_maps(content: &str, options: &Options) -> Vec<TokenMap> {
70
+ let blocks = find_tag_blocks(content, &["template", "script", "style"]);
71
+ let mut grouped = BTreeMap::<String, Vec<DetectionToken>>::new();
72
+ let line_index = LineIndex::new(content);
73
+
74
+ for block in blocks {
75
+ let format = resolve_vue_block_format(&block);
76
+ append_offset_block_tokens(&mut grouped, content, &block, &format, options, &line_index);
77
+ }
78
+
79
+ grouped_maps(grouped)
80
+ }
81
+
82
+ fn tokenize_svelte_maps(
83
+ content: &str,
84
+ options: &Options,
85
+ ignore_regions: &[[usize; 2]],
86
+ ) -> Vec<TokenMap> {
87
+ let blocks = find_tag_blocks(content, &["script", "style"]);
88
+ let inner_ranges = blocks
89
+ .iter()
90
+ .map(|block| [block.inner_start, block.inner_end])
91
+ .collect::<Vec<_>>();
92
+ let sanitized = blank_ranges_preserve_newlines(content, &inner_ranges);
93
+ let mut grouped = BTreeMap::<String, Vec<DetectionToken>>::new();
94
+ let mut markup_tokens =
95
+ tokenize_generic_with_whitespace(&sanitized, "markup", options, ignore_regions);
96
+ grouped
97
+ .entry("markup".to_string())
98
+ .or_default()
99
+ .append(&mut markup_tokens);
100
+
101
+ let line_index = LineIndex::new(content);
102
+ for block in blocks {
103
+ let format = resolve_svelte_block_format(&block);
104
+ append_offset_block_tokens(&mut grouped, content, &block, &format, options, &line_index);
105
+ }
106
+
107
+ grouped_maps(grouped)
108
+ }
109
+
110
+ fn tokenize_astro_maps(
111
+ content: &str,
112
+ options: &Options,
113
+ ignore_regions: &[[usize; 2]],
114
+ ) -> Vec<TokenMap> {
115
+ let frontmatter = astro_frontmatter_block(content);
116
+ let blocks = find_tag_blocks(content, &["script", "style"]);
117
+ let mut grouped = BTreeMap::<String, Vec<DetectionToken>>::new();
118
+ let line_index = LineIndex::new(content);
119
+
120
+ if let Some(block) = &frontmatter {
121
+ append_offset_block_tokens(
122
+ &mut grouped,
123
+ content,
124
+ block,
125
+ "typescript",
126
+ options,
127
+ &line_index,
128
+ );
129
+ }
130
+
131
+ for block in &blocks {
132
+ let format = resolve_astro_block_format(block);
133
+ append_offset_block_tokens(&mut grouped, content, block, &format, options, &line_index);
134
+ }
135
+
136
+ let mut blank_ranges = blocks
137
+ .iter()
138
+ .map(|block| [block.inner_start, block.inner_end])
139
+ .collect::<Vec<_>>();
140
+ if let Some(block) = &frontmatter {
141
+ blank_ranges.push([block.block_start, block.block_end]);
142
+ }
143
+ let sanitized = blank_ranges_preserve_newlines(content, &blank_ranges);
144
+ let mut markup_tokens =
145
+ tokenize_generic_with_whitespace(&sanitized, "markup", options, ignore_regions);
146
+ trim_edge_whitespace_tokens(&mut markup_tokens, &sanitized);
147
+ grouped
148
+ .entry("markup".to_string())
149
+ .or_default()
150
+ .append(&mut markup_tokens);
151
+
152
+ grouped_maps(grouped)
153
+ }
154
+
155
+ fn append_offset_block_tokens(
156
+ grouped: &mut BTreeMap<String, Vec<DetectionToken>>,
157
+ content: &str,
158
+ block: &TagBlock,
159
+ format: &str,
160
+ options: &Options,
161
+ line_index: &LineIndex,
162
+ ) {
163
+ if block.inner_start >= block.inner_end {
164
+ return;
165
+ }
166
+ let inner = &content[block.inner_start..block.inner_end];
167
+ let inner_ignore_regions = find_ignore_regions(inner, options);
168
+ let inner_maps = if is_oxc_format(format) {
169
+ tokenize_oxc_maps(inner, format, options, &inner_ignore_regions)
170
+ } else if format == "markup" {
171
+ tokenize_markup_fragment_maps(inner, options, &inner_ignore_regions, true)
172
+ } else if css_like_block_format(format) {
173
+ vec![TokenMap {
174
+ format: format.to_string(),
175
+ tokens: tokenize_generic(inner, format, options, &inner_ignore_regions),
176
+ positions_assigned: false,
177
+ }]
178
+ } else {
179
+ vec![TokenMap {
180
+ format: format.to_string(),
181
+ tokens: tokenize_generic_with_whitespace(inner, format, options, &inner_ignore_regions),
182
+ positions_assigned: false,
183
+ }]
184
+ };
185
+ let inner_start = line_index.location(block.inner_start);
186
+ for mut map in inner_maps {
187
+ if map.format == format {
188
+ trim_edge_whitespace_tokens(&mut map.tokens, inner);
189
+ }
190
+ offset_tokens(&mut map.tokens, block.inner_start, &inner_start);
191
+ grouped.entry(map.format).or_default().extend(map.tokens);
192
+ }
193
+ }
194
+
195
+ fn trim_edge_whitespace_tokens(tokens: &mut Vec<DetectionToken>, content: &str) {
196
+ let Some(first_content) = tokens
197
+ .iter()
198
+ .position(|token| !token_slice(content, token).chars().all(char::is_whitespace))
199
+ else {
200
+ tokens.clear();
201
+ return;
202
+ };
203
+ let last_content = tokens
204
+ .iter()
205
+ .rposition(|token| !token_slice(content, token).chars().all(char::is_whitespace))
206
+ .unwrap_or(first_content);
207
+
208
+ if last_content + 1 < tokens.len() {
209
+ tokens.drain(last_content + 1..);
210
+ }
211
+ if first_content > 0 {
212
+ tokens.drain(..first_content);
213
+ }
214
+ }
215
+
216
+ fn token_slice<'a>(content: &'a str, token: &DetectionToken) -> &'a str {
217
+ &content[token.range[0]..token.range[1]]
218
+ }
219
+
220
/// Reports whether `format` is one of the stylesheet formats
/// (`css`, `less`, `sass`, `scss`, `stylus`). The comparison is case-sensitive.
fn css_like_block_format(format: &str) -> bool {
    const CSS_LIKE: [&str; 5] = ["css", "less", "sass", "scss", "stylus"];
    CSS_LIKE.contains(&format)
}
223
+
224
+ fn tokenize_markup_fragment_maps(
225
+ content: &str,
226
+ options: &Options,
227
+ ignore_regions: &[[usize; 2]],
228
+ keep_whitespace: bool,
229
+ ) -> Vec<TokenMap> {
230
+ let mut grouped = BTreeMap::<String, Vec<DetectionToken>>::new();
231
+ let line_index = LineIndex::new(content);
232
+ append_markup_fragment_tokens(
233
+ &mut grouped,
234
+ content,
235
+ options,
236
+ ignore_regions,
237
+ &line_index,
238
+ keep_whitespace,
239
+ );
240
+ grouped
241
+ .into_iter()
242
+ .filter_map(|(format, tokens)| {
243
+ (!tokens.is_empty()).then_some(TokenMap {
244
+ format,
245
+ tokens,
246
+ positions_assigned: false,
247
+ })
248
+ })
249
+ .collect()
250
+ }
251
+
252
+ fn append_markup_fragment_tokens(
253
+ grouped: &mut BTreeMap<String, Vec<DetectionToken>>,
254
+ content: &str,
255
+ options: &Options,
256
+ ignore_regions: &[[usize; 2]],
257
+ line_index: &LineIndex,
258
+ keep_whitespace: bool,
259
+ ) {
260
+ let style_attrs = find_inline_style_attrs(content);
261
+ let style_attr_ranges = inline_style_attr_ranges(&style_attrs);
262
+ let markup_sanitized = blank_ranges_preserve_newlines(content, &style_attr_ranges);
263
+ let mut markup_tokens = if keep_whitespace {
264
+ tokenize_generic_with_whitespace(&markup_sanitized, "markup", options, ignore_regions)
265
+ } else {
266
+ tokenize_generic(&markup_sanitized, "markup", options, ignore_regions)
267
+ };
268
+ grouped
269
+ .entry("markup".to_string())
270
+ .or_default()
271
+ .append(&mut markup_tokens);
272
+ append_inline_style_attr_tokens(
273
+ grouped,
274
+ content,
275
+ &style_attrs,
276
+ options,
277
+ ignore_regions,
278
+ line_index,
279
+ );
280
+ }
281
+
282
+ fn grouped_maps(grouped: BTreeMap<String, Vec<DetectionToken>>) -> Vec<TokenMap> {
283
+ grouped
284
+ .into_iter()
285
+ .filter_map(|(format, mut tokens)| {
286
+ if tokens.is_empty() {
287
+ return None;
288
+ }
289
+ tokens.sort_by_key(|token| (token.range[0], token.range[1]));
290
+ assign_sequential_positions(&mut tokens);
291
+ Some(TokenMap {
292
+ format,
293
+ tokens,
294
+ positions_assigned: true,
295
+ })
296
+ })
297
+ .collect()
298
+ }
299
+
300
+ fn resolve_vue_block_format(block: &TagBlock) -> String {
301
+ let lang = attr_value(&block.attrs, "lang").unwrap_or_default();
302
+ match block.tag.as_str() {
303
+ "template" => {
304
+ if !lang.is_empty() && formats::supported_formats().contains(&lang.as_str()) {
305
+ lang
306
+ } else {
307
+ "markup".to_string()
308
+ }
309
+ }
310
+ "script" => {
311
+ if matches!(lang.as_str(), "ts" | "typescript") {
312
+ "typescript".to_string()
313
+ } else {
314
+ "javascript".to_string()
315
+ }
316
+ }
317
+ "style" => match lang.as_str() {
318
+ "scss" => "scss".to_string(),
319
+ "less" => "less".to_string(),
320
+ _ => "css".to_string(),
321
+ },
322
+ _ => "markup".to_string(),
323
+ }
324
+ }
325
+
326
+ fn resolve_svelte_block_format(block: &TagBlock) -> String {
327
+ let lang = attr_value(&block.attrs, "lang").unwrap_or_default();
328
+ match block.tag.as_str() {
329
+ "script" => match lang.as_str() {
330
+ "ts" | "typescript" => "typescript".to_string(),
331
+ "" | "js" | "javascript" => "javascript".to_string(),
332
+ _ => "markup".to_string(),
333
+ },
334
+ "style" => match lang.as_str() {
335
+ "scss" | "sass" => "scss".to_string(),
336
+ "less" => "less".to_string(),
337
+ "" | "css" | "postcss" | "stylus" => "css".to_string(),
338
+ _ => "markup".to_string(),
339
+ },
340
+ _ => "markup".to_string(),
341
+ }
342
+ }
343
+
344
+ fn resolve_astro_block_format(block: &TagBlock) -> String {
345
+ let lang = attr_value(&block.attrs, "lang").unwrap_or_default();
346
+ match block.tag.as_str() {
347
+ "script" => {
348
+ if matches!(lang.as_str(), "ts" | "typescript") {
349
+ "typescript".to_string()
350
+ } else {
351
+ "javascript".to_string()
352
+ }
353
+ }
354
+ "style" => match lang.as_str() {
355
+ "scss" => "scss".to_string(),
356
+ "less" => "less".to_string(),
357
+ _ => "css".to_string(),
358
+ },
359
+ _ => "markup".to_string(),
360
+ }
361
+ }
362
+
363
+ fn resolve_markup_block_format(block: &TagBlock) -> String {
364
+ let lang = attr_value(&block.attrs, "lang")
365
+ .or_else(|| attr_value(&block.attrs, "language"))
366
+ .or_else(|| attr_value(&block.attrs, "type"))
367
+ .unwrap_or_default();
368
+ match block.tag.as_str() {
369
+ "script" => match lang.as_str() {
370
+ "ts" | "typescript" | "text/typescript" | "application/typescript" => {
371
+ "typescript".to_string()
372
+ }
373
+ _ => "javascript".to_string(),
374
+ },
375
+ "style" => match lang.as_str() {
376
+ "scss" | "text/scss" => "scss".to_string(),
377
+ "sass" | "text/sass" => "sass".to_string(),
378
+ "less" | "text/less" => "less".to_string(),
379
+ _ => "css".to_string(),
380
+ },
381
+ _ => "markup".to_string(),
382
+ }
383
+ }
384
+
385
/// A block of embedded source located inside a larger document.
///
/// All offsets are byte offsets into the document the block was found in.
/// The field notes describe blocks produced by `find_tag_blocks`.
#[derive(Debug, Clone)]
struct TagBlock {
    /// Tag name the block was matched with.
    tag: String,
    /// Text between the tag name and the `>` that ends the opening tag.
    attrs: String,
    /// Offset of the `<` that starts the opening tag.
    block_start: usize,
    /// Offset of the first byte after the opening tag.
    inner_start: usize,
    /// Offset where the closing tag begins (exclusive end of the body).
    inner_end: usize,
    /// Offset one past the end of the closing tag.
    block_end: usize,
}
394
+
395
+ fn find_tag_blocks(content: &str, tags: &[&'static str]) -> Vec<TagBlock> {
396
+ let lower = content.to_ascii_lowercase();
397
+ let mut blocks = Vec::new();
398
+ let mut cursor = 0usize;
399
+
400
+ while let Some(open_offset) = lower[cursor..].find('<') {
401
+ let block_start = cursor + open_offset;
402
+ if lower.as_bytes().get(block_start + 1) == Some(&b'/') {
403
+ cursor = block_start + 1;
404
+ continue;
405
+ }
406
+ let Some(tag) = opening_tag_at(&lower, block_start, tags) else {
407
+ cursor = block_start + 1;
408
+ continue;
409
+ };
410
+ let Some(open_tag_end) = lower[block_start..].find('>').map(|idx| block_start + idx) else {
411
+ break;
412
+ };
413
+ let inner_start = open_tag_end + 1;
414
+ let close_needle = format!("</{tag}");
415
+ let Some(close_offset) = lower[inner_start..].find(&close_needle) else {
416
+ cursor = inner_start;
417
+ continue;
418
+ };
419
+ let inner_end = inner_start + close_offset;
420
+ let close_start = inner_end;
421
+ let block_end = lower[close_start..]
422
+ .find('>')
423
+ .map(|idx| close_start + idx + 1)
424
+ .unwrap_or(close_start + close_needle.len());
425
+ let attrs_start = block_start + 1 + tag.len();
426
+ blocks.push(TagBlock {
427
+ tag: tag.to_string(),
428
+ attrs: content[attrs_start..open_tag_end].to_string(),
429
+ block_start,
430
+ inner_start,
431
+ inner_end,
432
+ block_end: block_end.min(content.len()),
433
+ });
434
+ cursor = block_end;
435
+ }
436
+
437
+ blocks
438
+ }
439
+
440
/// Returns the first entry of `tags` that names the tag opening at `block_start`.
///
/// `block_start` is the offset of the `<`. A name only matches when it is
/// directly followed by `>`, `/` or ASCII whitespace, so `<scripts>` does not
/// match `script` and a name that runs to the end of the input does not match.
fn opening_tag_at(lower: &str, block_start: usize, tags: &[&'static str]) -> Option<&'static str> {
    for &tag in tags {
        let Some(after_name) = lower[block_start + 1..].strip_prefix(tag) else {
            continue;
        };
        let terminated = matches!(
            after_name.bytes().next(),
            Some(b'>' | b'/' | b' ' | b'\t' | b'\n' | b'\r')
        );
        if terminated {
            return Some(tag);
        }
    }
    None
}
451
+
452
+ fn astro_frontmatter_block(content: &str) -> Option<TagBlock> {
453
+ if !(content.starts_with("---\n") || content.starts_with("---\r\n")) {
454
+ return None;
455
+ }
456
+ let lines = line_spans(content);
457
+ let close_idx = lines
458
+ .iter()
459
+ .enumerate()
460
+ .skip(1)
461
+ .find(|(_, span)| content[span.start..span.end].trim() == "---")
462
+ .map(|(idx, _)| idx)?;
463
+ let inner_start = lines.get(1)?.start;
464
+ let inner_end = content[..lines[close_idx].start]
465
+ .strip_suffix('\n')
466
+ .map(|prefix| prefix.len())
467
+ .unwrap_or(lines[close_idx].start);
468
+ Some(TagBlock {
469
+ tag: "script".to_string(),
470
+ attrs: "lang=\"ts\"".to_string(),
471
+ block_start: 0,
472
+ inner_start,
473
+ inner_end: inner_end.max(inner_start),
474
+ block_end: lines[close_idx].next_start.min(content.len()),
475
+ })
476
+ }
477
+
478
/// Looks up the quoted value of attribute `name` in the raw attribute text of a tag.
///
/// The name is matched ASCII case-insensitively and must stand alone (see
/// `attr_name_boundary`), so `lang` is not found inside `data-lang`. ASCII
/// whitespace is allowed around `=`. Only values wrapped in `'` or `"` are
/// recognized; an unquoted value is skipped and the search goes on.
///
/// Returns the value lowercased, or `None` when `name` is empty, when no
/// quoted occurrence exists, or when the text ends before the value or its
/// closing quote.
fn attr_value(attrs: &str, name: &str) -> Option<String> {
    // An empty needle matches at every position and would never advance the scan.
    if name.is_empty() {
        return None;
    }

    // ASCII lowercasing keeps byte offsets identical between `attrs` and `lower`.
    let lower = attrs.to_ascii_lowercase();
    let name = name.to_ascii_lowercase();
    let bytes = lower.as_bytes();
    let mut cursor = 0usize;

    while let Some(offset) = lower[cursor..].find(&name) {
        let start = cursor + offset;
        let end = start + name.len();
        if !attr_name_boundary(bytes, start, end) {
            cursor = end;
            continue;
        }

        let mut idx = skip_ascii_whitespace(bytes, end);
        if bytes.get(idx) != Some(&b'=') {
            cursor = end;
            continue;
        }
        idx = skip_ascii_whitespace(bytes, idx + 1);

        let quote = *attrs.as_bytes().get(idx)?;
        if !matches!(quote, b'\'' | b'"') {
            // Unquoted value: step over one whole character. Advancing by a
            // single byte could land inside a multi-byte character and make
            // the next slice panic.
            let step = attrs[idx..].chars().next().map_or(1, char::len_utf8);
            cursor = idx + step;
            continue;
        }

        let value_start = idx + 1;
        let value_end = attrs[value_start..]
            .bytes()
            .position(|byte| byte == quote)
            .map(|value_offset| value_start + value_offset)?;
        return Some(attrs[value_start..value_end].to_ascii_lowercase());
    }

    None
}

/// Reports whether `bytes[start..end]` is delimited like a whole attribute name.
///
/// The byte before `start` and the byte at `end` (where present) must not be a
/// lowercase ASCII letter, a digit, `_`, `-` or `:`. Callers pass lowercased
/// bytes, which is why uppercase letters are not listed.
fn attr_name_boundary(bytes: &[u8], start: usize, end: usize) -> bool {
    let is_name_byte = |byte: u8| matches!(byte, b'a'..=b'z' | b'0'..=b'9' | b'_' | b'-' | b':');
    let before_ok = start == 0 || !is_name_byte(bytes[start - 1]);
    let after_ok = end >= bytes.len() || !is_name_byte(bytes[end]);
    before_ok && after_ok
}

/// Returns the index of the first byte at or after `idx` that is not a space,
/// tab, line feed or carriage return (or the end of `bytes`).
fn skip_ascii_whitespace(bytes: &[u8], mut idx: usize) -> usize {
    while bytes
        .get(idx)
        .is_some_and(|byte| matches!(*byte, b' ' | b'\t' | b'\n' | b'\r'))
    {
        idx += 1;
    }
    idx
}