jscpd-rs 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. package/CHANGELOG.md +69 -0
  2. package/Cargo.lock +1323 -0
  3. package/Cargo.toml +54 -0
  4. package/LICENSE +21 -0
  5. package/README.md +372 -0
  6. package/docs/api-parity.md +49 -0
  7. package/docs/cloning-plan.md +281 -0
  8. package/docs/compat-baseline.md +535 -0
  9. package/docs/format-porting.md +86 -0
  10. package/docs/junior-task-template.md +62 -0
  11. package/docs/junior-workflow.md +87 -0
  12. package/docs/migrating-from-jscpd.md +193 -0
  13. package/docs/npm-release.md +116 -0
  14. package/docs/public-benchmark-suite.md +81 -0
  15. package/docs/release-checklist.md +200 -0
  16. package/docs/release-decisions.md +103 -0
  17. package/docs/release-readiness.md +51 -0
  18. package/docs/upstream-bugs.md +501 -0
  19. package/docs/upstream-issue-drafts.md +393 -0
  20. package/docs/user-guide.md +309 -0
  21. package/examples/dump_oxc_tokens.rs +112 -0
  22. package/examples/library_api.rs +42 -0
  23. package/npm/bin/jscpd-rs.js +6 -0
  24. package/npm/bin/jscpd-server.js +6 -0
  25. package/npm/lib/run-binary.js +68 -0
  26. package/npm/scripts/postinstall.js +50 -0
  27. package/package.json +53 -0
  28. package/skills/dry-refactoring/SKILL.md +63 -0
  29. package/skills/jscpd/SKILL.md +85 -0
  30. package/src/app.rs +512 -0
  31. package/src/bin/jscpd-server.rs +429 -0
  32. package/src/blame.rs +130 -0
  33. package/src/cli/config.rs +543 -0
  34. package/src/cli/parsing.rs +301 -0
  35. package/src/cli/tests.rs +543 -0
  36. package/src/cli.rs +671 -0
  37. package/src/detector/matching/secondary.rs +387 -0
  38. package/src/detector/matching.rs +274 -0
  39. package/src/detector/model.rs +190 -0
  40. package/src/detector/prepare.rs +71 -0
  41. package/src/detector/skip_local.rs +40 -0
  42. package/src/detector/statistics.rs +138 -0
  43. package/src/detector/store.rs +96 -0
  44. package/src/detector/tests.rs +238 -0
  45. package/src/detector.rs +265 -0
  46. package/src/files/discovery.rs +508 -0
  47. package/src/files/gitignore.rs +203 -0
  48. package/src/files/paths.rs +68 -0
  49. package/src/files/shebang.rs +106 -0
  50. package/src/files/tests.rs +523 -0
  51. package/src/files.rs +25 -0
  52. package/src/formats.rs +570 -0
  53. package/src/lib.rs +433 -0
  54. package/src/main.rs +26 -0
  55. package/src/report/ai.rs +125 -0
  56. package/src/report/badge.rs +238 -0
  57. package/src/report/console.rs +180 -0
  58. package/src/report/console_common.rs +37 -0
  59. package/src/report/console_full.rs +139 -0
  60. package/src/report/csv.rs +65 -0
  61. package/src/report/escape.rs +8 -0
  62. package/src/report/file_output.rs +28 -0
  63. package/src/report/html/assets.rs +47 -0
  64. package/src/report/html.rs +336 -0
  65. package/src/report/json.rs +119 -0
  66. package/src/report/markdown.rs +125 -0
  67. package/src/report/sarif.rs +302 -0
  68. package/src/report/silent.rs +22 -0
  69. package/src/report/source.rs +38 -0
  70. package/src/report/summary.rs +50 -0
  71. package/src/report/test_support.rs +133 -0
  72. package/src/report/threshold.rs +76 -0
  73. package/src/report/xcode.rs +90 -0
  74. package/src/report/xml.rs +119 -0
  75. package/src/report.rs +250 -0
  76. package/src/server/mcp.rs +942 -0
  77. package/src/server.rs +1081 -0
  78. package/src/tokenizer/apex.rs +97 -0
  79. package/src/tokenizer/blocks.rs +532 -0
  80. package/src/tokenizer/embedded.rs +106 -0
  81. package/src/tokenizer/generic.rs +511 -0
  82. package/src/tokenizer/hash.rs +27 -0
  83. package/src/tokenizer/ignore.rs +33 -0
  84. package/src/tokenizer/line_index.rs +33 -0
  85. package/src/tokenizer/markdown.rs +289 -0
  86. package/src/tokenizer/markup_attrs.rs +289 -0
  87. package/src/tokenizer/oxc/fallback.rs +275 -0
  88. package/src/tokenizer/oxc/jsx.rs +168 -0
  89. package/src/tokenizer/oxc/kind.rs +177 -0
  90. package/src/tokenizer/oxc/lexical.rs +67 -0
  91. package/src/tokenizer/oxc.rs +659 -0
  92. package/src/tokenizer/scan.rs +88 -0
  93. package/src/tokenizer/tap.rs +150 -0
  94. package/src/tokenizer/tests.rs +915 -0
  95. package/src/tokenizer.rs +328 -0
  96. package/src/verbose.rs +195 -0
@@ -0,0 +1,915 @@
1
+ use super::*;
2
+ use crate::cli::{Mode, Options};
3
+
4
+ fn token_slices<'a>(content: &'a str, tokens: &[DetectionToken]) -> Vec<&'a str> {
5
+ tokens
6
+ .iter()
7
+ .map(|token| &content[token.range[0]..token.range[1]])
8
+ .collect()
9
+ }
10
+
11
/// Detection tokens carry the line on which they start.
#[test]
fn tokenizes_non_whitespace_tokens_with_locations() {
    let source = "let a = 1;\nlet b = 2;";
    let detected = tokenize_for_detection(source, "javascript", &Options::default());

    // Index 0 is expected on line 1 and index 5 on line 2.
    assert_eq!(detected[0].start.line, 1);
    assert_eq!(detected[5].start.line, 2);
}
18
+
19
/// Exercises `Tokenizer::generate_maps` on a two-statement JavaScript snippet and
/// checks the metadata of the single map it returns.
#[test]
fn public_tokenizer_generates_source_maps_like_upstream_entrypoint() {
    let source = "const a = 1;\nconst b = 2;";
    let tokenizer = Tokenizer::new();
    let maps = tokenizer.generate_maps("snippet.js", source, "javascript");

    assert_eq!(maps.len(), 1);

    let map = &maps[0];
    assert_eq!(map.source_id, "snippet.js");
    assert_eq!(map.format, "javascript");
    assert_eq!(map.tokens.len(), 10);
    assert_eq!(map.lines, 1);
    assert_eq!(map.tokens[0].start.line, 1);
}
32
+
33
/// A tokenizer built with weak-mode options yields five tokens for a statement
/// followed by a line comment.
#[test]
fn public_tokenizer_uses_configured_options() {
    let weak_options = Options {
        mode: Mode::Weak,
        ..Options::default()
    };
    let tokenizer = Tokenizer::with_options(weak_options);

    let produced = tokenizer.tokenize("const a = 1; // comment\n", "javascript");

    assert_eq!(produced.len(), 5);
}
44
+
45
/// Only two tokens are expected when the middle of the input sits between
/// `jscpd:ignore-start` and `jscpd:ignore-end` markers.
#[test]
fn skips_ignore_regions() {
    let source = concat!(
        "keep\n",
        "// jscpd:ignore-start\n",
        "skip\n",
        "// jscpd:ignore-end\n",
        "keep2\n",
    );
    let kept = tokenize_for_detection(source, "javascript", &Options::default());

    assert_eq!(kept.len(), 2);
}
51
+
52
/// Checks the token count and start lines produced for two short `let` statements.
#[test]
fn detection_tokenizer_avoids_token_value_allocations() {
    let source = "let a = 1;\nlet b = 2;";
    let detected = tokenize_for_detection(source, "javascript", &Options::default());

    assert_eq!(detected.len(), 10);
    assert_eq!(detected[0].start.line, 1);
    assert_eq!(detected[5].start.line, 2);
}
60
+
61
/// With the `json` reporter enabled, every JS-like format reports the same
/// `start.position` values for the tokens at indices 0, 1 and 5.
#[test]
fn js_like_json_report_positions_count_prism_whitespace_tokens() {
    let json_options = Options {
        reporters: vec![String::from("json")],
        ..Options::default()
    };
    let source = "let a = 1;\nlet b = 2;";
    let js_like_formats = ["javascript", "typescript", "jsx", "tsx"];

    for format in js_like_formats {
        let detected = tokenize_for_detection(source, format, &json_options);

        assert_eq!(detected[0].start.position, 0);
        assert_eq!(detected[1].start.position, 2);
        assert_eq!(detected[5].start.position, 9);
    }
}
74
+
75
/// A JSX attribute expression yields a second map, in `javascript` format,
/// next to the `jsx` map.
#[test]
fn jsx_attribute_expression_emits_embedded_javascript_map() {
    let source = "const x = <div className={classNames(className, classes)} />;";
    let maps = tokenize_maps_for_detection(source, "jsx", &Options::default());

    assert_eq!(maps.len(), 2);
    assert_eq!(maps[0].format, "jsx");
    assert_eq!(maps[1].format, "javascript");

    let embedded = &maps[1].tokens;
    assert_eq!(embedded.len(), 9);

    // Both unwraps are safe: the length was just asserted to be 9.
    let last = embedded.last().unwrap();
    let first = embedded.first().unwrap();
    assert_eq!(last.end.position - first.start.position, 8);
}
93
+
94
/// The embedded `javascript` map of a TSX snippet must contain the whitespace
/// token that follows the opening brace of the nested object.
#[test]
fn jsx_embedded_javascript_keeps_nested_object_whitespace() {
    let content = "const x = <A p={{\n color: PRIMARY_COLOR\n}} />;";
    let maps = tokenize_maps_for_detection(content, "tsx", &Options::default());
    let embedded = maps
        .iter()
        .find(|map| map.format == "javascript")
        .expect("embedded javascript map");

    let has_whitespace_token = embedded
        .tokens
        .iter()
        .map(|token| &content[token.range[0]..token.range[1]])
        .any(|text| text == "\n ");
    assert!(has_whitespace_token);
}
110
+
111
/// JSX text is split into word, punctuation and quoted-string tokens.
#[test]
fn jsx_text_is_split_like_javascript_text() {
    let source = "const x = <div>Hello, \"Go\" this.</div>;";
    let detected = tokenize_for_detection(source, "javascript", &Options::default());
    let texts = token_slices(source, &detected);

    assert!(texts.contains(&"Hello"));
    assert!(texts.contains(&","));
    assert!(texts.contains(&"\"Go\""));
    assert!(texts.contains(&"this"));
}
122
+
123
/// A quote opened in JSX text and not closed on the same line must not swallow
/// the following line: the words after the line break are still separate tokens.
#[test]
fn jsx_text_unclosed_quote_stops_at_line_end() {
    let source = concat!(
        "const x = <div>\"Captured an\n",
        "error: null\". Clicking</div>;",
    );
    let detected = tokenize_for_detection(source, "javascript", &Options::default());
    let texts = token_slices(source, &detected);

    assert!(texts.contains(&"\"Captured"));
    assert!(texts.contains(&"error"));
    assert!(texts.contains(&"null"));
    assert!(texts.contains(&"\""));
    assert!(texts.contains(&"."));
}
135
+
136
/// A dashed JSX tag name is emitted as `name`, `-`, `name`, never as one token.
#[test]
fn jsx_dashed_identifiers_are_split_like_prism() {
    let source = "expect(root).toMatchRenderedOutput(<suspensey-thing src=\"A\" />);";
    let detected = tokenize_for_detection(source, "javascript", &Options::default());
    let texts = token_slices(source, &detected);

    assert!(texts.contains(&"suspensey"));
    assert!(texts.contains(&"-"));
    assert!(texts.contains(&"thing"));
    assert!(!texts.contains(&"suspensey-thing"));
}
147
+
148
/// The `...` token must hash the same as an operator token with that text.
#[test]
fn js_spread_token_is_operator_like_prism() {
    let content = "const next = [...items];";
    let detected = tokenize_for_detection(content, "javascript", &Options::default());

    let spread = detected
        .iter()
        .find(|token| {
            let text = &content[token.range[0]..token.range[1]];
            text == "..."
        })
        .expect("spread token");

    let operator_hash = hash_token(TokenKind::Operator, "...", false);
    assert_eq!(spread.hash, operator_hash);
}
159
+
160
/// Each listed format yields exactly one map, tagged with that format and
/// holding three tokens for a three-word input.
#[test]
fn generic_tokenizer_handles_common_non_native_formats() {
    let source = "alpha beta\n gamma";
    let generic_formats = ["css", "markup", "yaml", "toml", "python"];

    for format in generic_formats {
        let maps = tokenize_maps_for_detection(source, format, &Options::default());

        assert_eq!(maps.len(), 1);
        let only_map = &maps[0];
        assert_eq!(only_map.format, format);
        assert_eq!(only_map.tokens.len(), 3);
    }
}
170
+
171
/// Smoke test over every supported format: the sample content must yield at
/// least one token, and every map produced must itself be in a supported format.
#[test]
fn all_supported_formats_have_a_tokenizer_smoke_path() {
    for format in crate::formats::supported_formats() {
        let sample = smoke_content_for_format(format);
        let maps = tokenize_maps_for_detection(sample, format, &Options::default());

        let produced_tokens = maps.iter().any(|map| !map.tokens.is_empty());
        assert!(produced_tokens, "format {format} produced no tokens");

        let unsupported_map = maps
            .iter()
            .find(|map| !crate::formats::supported_formats().contains(&map.format.as_str()));
        assert!(
            unsupported_map.is_none(),
            "format {format} produced an unsupported embedded map"
        );
    }
}
187
+
188
/// Returns sample source text for the smoke test.
///
/// Seven formats have dedicated samples; every other format name, including an
/// unknown one, gets the same two-line plain-word fallback.
fn smoke_content_for_format(format: &str) -> &'static str {
    const FALLBACK: &str = "alpha beta gamma\nalpha beta delta\n";
    const DEDICATED: [(&str, &str); 7] = [
        (
            "astro",
            "---\nconst title = 'Demo';\n---\n<section>{title}</section>\n",
        ),
        ("jsx", "const view = <section>{title}</section>;\n"),
        ("markdown", "# Demo\n\n```js\nconst value = 1;\n```\n"),
        ("markup", "<section><span>alpha beta</span></section>\n"),
        (
            "svelte",
            "<script>let title = 'Demo';</script>\n<h1>{title}</h1>\n",
        ),
        (
            "tsx",
            "const view: JSX.Element = <section>{title}</section>;\n",
        ),
        (
            "vue",
            "<template>\n <section>{{ title }}</section>\n</template>\n",
        ),
    ];

    DEDICATED
        .iter()
        .find(|(name, _)| *name == format)
        .map_or(FALLBACK, |(_, sample)| *sample)
}
200
+
201
/// A HAML `-#` comment and the lines after it form one token spanning lines 3 to 5.
#[test]
fn haml_comment_block_is_single_comment_token() {
    let content = concat!(
        "%section\n",
        " %p Same\n",
        "-# File-specific comment\n",
        " .settings\n",
        " %h2 Title\n",
    );
    let detected = tokenize_for_detection(content, "haml", &Options::default());

    let comment = detected
        .iter()
        .find(|token| {
            let text = &content[token.range[0]..token.range[1]];
            text.starts_with("-#")
        })
        .expect("haml comment token");

    assert_eq!(comment.start.line, 3);
    assert_eq!(comment.end.line, 5);

    let comment_text = &content[comment.range[0]..comment.range[1]];
    assert_eq!(
        comment_text,
        "-# File-specific comment\n .settings\n %h2 Title"
    );
}
217
+
218
/// A Pug `style.` dot block is one token spanning lines 1 to 4; the `body`
/// line after it is not part of that token.
#[test]
fn pug_dot_block_is_single_plain_text_token() {
    let content = concat!(
        "style.\n",
        " .panel {\n",
        " color: red;\n",
        " }\n",
        "body\n",
    );
    let detected = tokenize_for_detection(content, "pug", &Options::default());

    let block = detected
        .iter()
        .find(|token| {
            let text = &content[token.range[0]..token.range[1]];
            text.starts_with("style.")
        })
        .expect("pug dot block token");

    assert_eq!(block.start.line, 1);
    assert_eq!(block.end.line, 4);

    let block_text = &content[block.range[0]..block.range[1]];
    assert_eq!(block_text, "style.\n .panel {\n color: red;\n }");
}
234
+
235
/// A ```` ```js ```` fence in Markdown produces a `javascript` map whose first
/// token is `function`, positioned in the coordinates of the whole document.
#[test]
fn markdown_fenced_javascript_emits_embedded_map() {
    let content = concat!(
        "# Demo\n",
        "\n",
        "```js\n",
        "function alpha() {\n",
        " return 42;\n",
        "}\n",
        "```\n",
    );
    let maps = tokenize_maps_for_detection(content, "markdown", &Options::default());

    assert!(maps.iter().any(|map| map.format == "markdown"));
    let javascript = maps
        .iter()
        .find(|map| map.format == "javascript")
        .expect("embedded javascript map");

    let first = &javascript.tokens[0];
    assert_eq!(first.start.line, 4);
    assert_eq!(first.start.column, 1);
    assert_eq!(&content[first.range[0]..first.range[1]], "function");
}
253
+
254
/// Text inside a fenced block must not appear in the `markdown` map, while the
/// prose around the fence must.
#[test]
fn markdown_fenced_code_is_removed_from_markdown_map() {
    let content = concat!(
        "before\n",
        "\n",
        "```ts\n",
        "const hidden = true;\n",
        "```\n",
        "\n",
        "after\n",
    );
    let maps = tokenize_maps_for_detection(content, "markdown", &Options::default());
    let markdown = maps
        .iter()
        .find(|map| map.format == "markdown")
        .expect("markdown map");

    let prose_tokens = token_slices(content, &markdown.tokens);

    assert!(prose_tokens.contains(&"before"));
    assert!(prose_tokens.contains(&"after"));
    assert!(!prose_tokens.contains(&"hidden"));
}
268
+
269
/// Tokens of the `markdown` map that start on the fence lines (2 to 4) must
/// each start and end on the same line, and at least one must sit on line 3.
#[test]
fn markdown_fenced_gap_tokens_stay_on_their_lines() {
    let content = concat!(
        "before\n",
        "```ts\n",
        "const hidden = true;\n",
        "```\n",
        "after\n",
    );
    let maps = tokenize_maps_for_detection(content, "markdown", &Options::default());
    let markdown = maps
        .iter()
        .find(|map| map.format == "markdown")
        .expect("markdown map");

    let fence_lines = 2..=4;
    let gap_tokens: Vec<_> = markdown
        .tokens
        .iter()
        .filter(|token| fence_lines.contains(&token.start.line))
        .collect();

    let any_on_line_three = gap_tokens.iter().any(|token| token.start.line == 3);
    assert!(any_on_line_three);

    let multi_line_token = gap_tokens
        .iter()
        .find(|token| token.start.line != token.end.line);
    assert!(multi_line_token.is_none());
}
290
+
291
/// A fence tagged `typescript` produces a map in the `typescript` format.
#[test]
fn markdown_fenced_typescript_uses_language_name() {
    let content = concat!("```typescript\n", "type Answer = number;\n", "```\n");
    let maps = tokenize_maps_for_detection(content, "markdown", &Options::default());

    let has_typescript_map = maps.iter().any(|map| map.format == "typescript");
    assert!(has_typescript_map);
}
298
+
299
/// Markdown front matter is emitted as a `yaml` map that starts on line 2
/// with the tokens `title` and `:`.
#[test]
fn markdown_front_matter_emits_yaml_map() {
    let content = concat!(
        "---\n",
        "title: Demo\n",
        "tags:\n",
        " - docs\n",
        "---\n",
        "# Demo\n",
    );
    let maps = tokenize_maps_for_detection(content, "markdown", &Options::default());
    let yaml = maps
        .iter()
        .find(|map| map.format == "yaml")
        .expect("front matter yaml map");

    let key = &yaml.tokens[0];
    assert_eq!(key.start.line, 2);
    assert_eq!(&content[key.range[0]..key.range[1]], "title");

    let separator = &yaml.tokens[1];
    assert_eq!(&content[separator.range[0]..separator.range[1]], ":");
}
318
+
319
/// The map for an embedded `coffeescript` fence must contain a newline token.
#[test]
fn markdown_embedded_generic_blocks_keep_whitespace_tokens() {
    let content = concat!(
        "```coffeescript\n",
        "jscpd = require 'jscpd'\n",
        "result = jscpd::run\n",
        " reporter: json\n",
        "```\n",
    );
    let maps = tokenize_maps_for_detection(content, "markdown", &Options::default());
    let coffeescript = maps
        .iter()
        .find(|map| map.format == "coffeescript")
        .expect("coffeescript map");

    let has_newline_token = coffeescript
        .tokens
        .iter()
        .map(|token| &content[token.range[0]..token.range[1]])
        .any(|text| text == "\n");
    assert!(has_newline_token);
}
336
+
337
/// `<script>` and `<style>` elements in markup produce `javascript` and `css`
/// maps whose tokens keep the line numbers of the enclosing document.
#[test]
fn markup_emits_embedded_script_and_style_maps() {
    let content = concat!(
        "<html>\n",
        "<script language=\"JavaScript\">\n",
        "function demo() { return 1; }\n",
        "</script>\n",
        "<style type=\"text/css\">\n",
        "body { color: red; }\n",
        "</style>\n",
        "</html>\n",
    );
    let maps = tokenize_maps_for_detection(content, "markup", &Options::default());

    assert!(maps.iter().any(|map| map.format == "markup"));
    let javascript = maps
        .iter()
        .find(|map| map.format == "javascript")
        .expect("embedded javascript map");
    let css = maps
        .iter()
        .find(|map| map.format == "css")
        .expect("embedded css map");

    let first_script_token = &javascript.tokens[0];
    assert_eq!(first_script_token.start.line, 3);
    assert_eq!(
        &content[first_script_token.range[0]..first_script_token.range[1]],
        "function"
    );

    let body = css
        .tokens
        .iter()
        .find(|token| {
            let text = &content[token.range[0]..token.range[1]];
            text == "body"
        })
        .expect("body selector token");
    assert_eq!(body.start.line, 6);
}
364
+
365
/// An inline `style="…"` attribute is moved into a `css` map, and the `style`
/// attribute name no longer appears in the `markup` map.
#[test]
fn markup_emits_inline_style_attr_css_map() {
    let content = "<h4 style=\"visibility: hidden\">Order Search</h4>\n";
    let maps = tokenize_maps_for_detection(content, "markup", &Options::default());

    let css = maps
        .iter()
        .find(|map| map.format == "css")
        .expect("inline style css map");
    let css_texts = token_slices(content, &css.tokens);

    assert_eq!(
        css_texts,
        vec![" ", "style", "=\"", "visibility", ":", " hidden", "\""]
    );
    let first_css_token = &css.tokens[0];
    assert_eq!(first_css_token.start.line, 1);
    assert_eq!(first_css_token.start.column, 4);

    let markup = maps
        .iter()
        .find(|map| map.format == "markup")
        .expect("markup map");
    let style_absent_from_markup = markup
        .tokens
        .iter()
        .all(|token| &content[token.range[0]..token.range[1]] != "style");
    assert!(style_absent_from_markup);
}
394
+
395
/// No `css` map is produced when the inline style sits between the
/// `jscpd:ignore-start` and `jscpd:ignore-end` comments.
#[test]
fn markup_inline_style_attr_respects_ignore_regions() {
    let content = concat!(
        "<!-- jscpd:ignore-start -->\n",
        "<h4 style=\"visibility: hidden\">Order Search</h4>\n",
        "<!-- jscpd:ignore-end -->\n",
    );
    let maps = tokenize_maps_for_detection(content, "markup", &Options::default());

    let emitted_css = maps.iter().any(|map| map.format == "css");
    assert!(!emitted_css);
}
402
+
403
/// A Vue single-file component produces `markup`, `scss` and `typescript` maps;
/// the script's first token is `const` on line 8 of the file.
#[test]
fn vue_sfc_emits_template_script_and_style_maps() {
    let content = concat!(
        "<template>\n",
        " <section>{{ title }}</section>\n",
        "</template>\n",
        "<style lang=\"scss\">\n",
        ".panel { color: red; }\n",
        "</style>\n",
        "<script setup lang=\"ts\">\n",
        "const title: string = 'Demo';\n",
        "</script>\n",
    );
    let maps = tokenize_maps_for_detection(content, "vue", &Options::default());

    assert!(maps.iter().any(|map| map.format == "markup"));
    assert!(maps.iter().any(|map| map.format == "scss"));
    let typescript = maps
        .iter()
        .find(|map| map.format == "typescript")
        .expect("typescript map");

    let first = &typescript.tokens[0];
    assert_eq!(first.start.line, 8);
    assert_eq!(&content[first.range[0]..first.range[1]], "const");
}
421
+
422
/// The `markup` and `scss` maps of a Vue file begin and end on the line that
/// holds the block's content, not on the lines of the surrounding tags.
#[test]
fn vue_sfc_trims_edge_whitespace_from_embedded_block_maps() {
    let content = concat!(
        "<template>\n",
        " <section>{{ title }}</section>\n",
        "</template>\n",
        "<style lang=\"scss\">\n",
        ".panel { color: red; }\n",
        "</style>\n",
    );
    let maps = tokenize_maps_for_detection(content, "vue", &Options::default());
    let markup = maps
        .iter()
        .find(|map| map.format == "markup")
        .expect("markup map");
    let scss = maps
        .iter()
        .find(|map| map.format == "scss")
        .expect("scss map");

    let template_line = 2;
    assert_eq!(markup.tokens[0].start.line, template_line);
    assert_eq!(markup.tokens.last().unwrap().end.line, template_line);

    let style_line = 5;
    assert_eq!(scss.tokens[0].start.line, style_line);
    assert_eq!(scss.tokens.last().unwrap().end.line, style_line);
}
440
+
441
/// No token in the `scss` map of a Vue `<style>` block may consist solely of
/// whitespace.
#[test]
fn vue_sfc_style_blocks_skip_internal_whitespace_tokens() {
    let content = concat!(
        "<style lang=\"scss\">\n",
        ".panel {\n",
        " color: red;\n",
        "}\n",
        "</style>\n",
    );
    let maps = tokenize_maps_for_detection(content, "vue", &Options::default());
    let scss = maps
        .iter()
        .find(|map| map.format == "scss")
        .expect("scss map");
    let texts = token_slices(content, &scss.tokens);

    // A token passes only if at least one of its characters is not whitespace
    // (an empty token therefore fails, as it did with the `all`-based check).
    let every_token_has_content = texts
        .iter()
        .all(|text| text.chars().any(|ch| !ch.is_whitespace()));
    assert!(every_token_has_content);
}
457
+
458
/// An inline `style` attribute inside a Vue `<template>` is emitted as a `css`
/// map positioned at the attribute's place in the file.
#[test]
fn vue_template_emits_inline_style_attr_css_map() {
    let content = concat!(
        "<template>\n",
        " <div style=\"color: red\">{{ title }}</div>\n",
        "</template>\n",
    );
    let maps = tokenize_maps_for_detection(content, "vue", &Options::default());

    let css = maps
        .iter()
        .find(|map| map.format == "css")
        .expect("inline style css map");
    let css_texts = token_slices(content, &css.tokens);

    assert_eq!(
        css_texts,
        vec![" ", "style", "=\"", "color", ":", " red", "\""]
    );
    let first_css_token = &css.tokens[0];
    assert_eq!(first_css_token.start.line, 2);
    assert_eq!(first_css_token.start.column, 7);
}
476
+
477
/// A Svelte component produces `markup`, `javascript` and `css` maps; the `h1`
/// selector keeps its line number from the whole file.
#[test]
fn svelte_sfc_emits_markup_script_and_style_maps() {
    let content = concat!(
        "<script>\n",
        "let title = 'Demo';\n",
        "</script>\n",
        "<h1>{title}</h1>\n",
        "<style>\n",
        "h1 { color: red; }\n",
        "</style>\n",
    );
    let maps = tokenize_maps_for_detection(content, "svelte", &Options::default());

    assert!(maps.iter().any(|map| map.format == "markup"));
    assert!(maps.iter().any(|map| map.format == "javascript"));
    let css = maps
        .iter()
        .find(|map| map.format == "css")
        .expect("css map");

    let h1 = css
        .tokens
        .iter()
        .find(|token| {
            let text = &content[token.range[0]..token.range[1]];
            text == "h1"
        })
        .expect("h1 selector token");

    assert_eq!(h1.start.line, 6);
}
496
+
497
/// An Astro component produces `markup`, `javascript`, `css` and `typescript`
/// maps; the `typescript` map starts with `const` on line 2.
#[test]
fn astro_sfc_emits_frontmatter_script_style_and_markup_maps() {
    let content = concat!(
        "---\n",
        "const title: string = 'Demo';\n",
        "---\n",
        "<article>{title}</article>\n",
        "<script>\n",
        "console.log(title);\n",
        "</script>\n",
        "<style>\n",
        "article { color: red; }\n",
        "</style>\n",
    );
    let maps = tokenize_maps_for_detection(content, "astro", &Options::default());

    assert!(maps.iter().any(|map| map.format == "markup"));
    assert!(maps.iter().any(|map| map.format == "javascript"));
    assert!(maps.iter().any(|map| map.format == "css"));
    let typescript = maps
        .iter()
        .find(|map| map.format == "typescript")
        .expect("frontmatter typescript map");

    let first = &typescript.tokens[0];
    assert_eq!(first.start.line, 2);
    assert_eq!(&content[first.range[0]..first.range[1]], "const");
}
516
+
517
/// The `markup` map of an Astro file starts at the first `<` after the front
/// matter (line 5), not in the space the front matter occupied.
#[test]
fn astro_markup_trims_blanked_frontmatter_whitespace() {
    let content = concat!(
        "---\n",
        "const title = 'Hello';\n",
        "---\n",
        "\n",
        "<main>{title}</main>\n",
    );
    let maps = tokenize_maps_for_detection(content, "astro", &Options::default());
    let markup = maps
        .iter()
        .find(|map| map.format == "markup")
        .expect("markup map");

    let first = &markup.tokens[0];
    assert_eq!(first.start.line, 5);
    assert_eq!(&content[first.range[0]..first.range[1]], "<");
}
532
+
533
/// A bracketed SOQL query in Apex is emitted as a `sql` map beginning at the
/// opening `[` on line 2, next to the `apex` map.
#[test]
fn apex_soql_blocks_emit_sql_map() {
    let content = concat!(
        "public class A {\n",
        " Account acc = [\n",
        " SELECT Id\n",
        " FROM Account\n",
        " ];\n",
        "}\n",
    );
    let maps = tokenize_maps_for_detection(content, "apex", &Options::default());

    assert!(maps.iter().any(|map| map.format == "apex"));
    let sql = maps
        .iter()
        .find(|map| map.format == "sql")
        .expect("sql map");
    let opening = sql.tokens.first().expect("sql token");

    assert_eq!(opening.start.line, 2);
    let opening_text = &content[opening.range[0]..opening.range[1]];
    assert_eq!(opening_text, "[");
}
548
+
549
/// A YAML-ish diagnostic block in TAP output is emitted as a `yaml` map that
/// starts at the `---` marker, next to the `tap` map.
#[test]
fn tap_yamlish_blocks_emit_yaml_map() {
    let content = concat!(
        "not ok 1 - failed\n",
        " ---\n",
        " message: Expected value\n",
        " actual: null\n",
        " ...\n",
    );
    let maps = tokenize_maps_for_detection(content, "tap", &Options::default());
    let yaml = maps
        .iter()
        .find(|map| map.format == "yaml")
        .expect("yaml map");

    let marker = &yaml.tokens[0];
    assert_eq!(marker.start.line, 2);
    assert_eq!(marker.start.column, 3);
    assert_eq!(&content[marker.range[0]..marker.range[1]], "---");

    let has_tap_map = maps.iter().any(|map| map.format == "tap");
    assert!(has_tap_map);
}
566
+
567
/// Weak mode yields fewer tokens than the default mode for YAML input that
/// contains `#` and `//` comment lines (3 versus 5).
#[test]
fn weak_mode_skips_generic_comments() {
    let content = concat!(
        "# first comment\n",
        "alpha beta\n",
        "// second comment\n",
        "gamma\n",
    );
    let weak_options = Options {
        mode: Mode::Weak,
        ..Options::default()
    };

    let default_tokens = tokenize_for_detection(content, "yaml", &Options::default());
    let weak_tokens = tokenize_for_detection(content, "yaml", &weak_options);

    assert_eq!(default_tokens.len(), 5);
    assert_eq!(weak_tokens.len(), 3);
}
581
+
582
/// A double-quoted YAML scalar stays one token, quotes included.
#[test]
fn yaml_quoted_scalars_are_single_string_tokens() {
    let source = "email: \"jane@example.com\"\n";
    let detected = tokenize_for_detection(source, "yaml", &Options::default());
    let texts = token_slices(source, &detected);

    let expected = vec!["email", ":", "\"jane@example.com\""];
    assert_eq!(texts, expected);
}
590
+
591
/// Strict mode keeps whitespace as tokens for generic formats; the default
/// mode yields only the three words.
#[test]
fn strict_mode_keeps_generic_whitespace_tokens() {
    let source = "alpha beta\ngamma";
    let strict_options = Options {
        mode: Mode::Strict,
        ..Options::default()
    };

    let default_tokens = tokenize_for_detection(source, "yaml", &Options::default());
    let strict_tokens = tokenize_for_detection(source, "yaml", &strict_options);
    let strict_texts = token_slices(source, &strict_tokens);

    assert_eq!(default_tokens.len(), 3);
    assert_eq!(strict_texts, vec!["alpha", " ", "beta", "\n", "gamma"]);
}
606
+
607
/// Strict mode yields more JavaScript tokens than the default mode, including
/// single-space and newline tokens.
#[test]
fn strict_mode_keeps_js_whitespace_tokens() {
    let source = "let a = 1;\nlet b = 2;";
    let strict_options = Options {
        mode: Mode::Strict,
        ..Options::default()
    };

    let default_tokens = tokenize_for_detection(source, "javascript", &Options::default());
    let strict_tokens = tokenize_for_detection(source, "javascript", &strict_options);
    let strict_texts = token_slices(source, &strict_tokens);

    assert_eq!(default_tokens.len(), 10);
    assert!(default_tokens.len() < strict_tokens.len());
    assert!(strict_texts.contains(&" "));
    assert!(strict_texts.contains(&"\n"));
}
624
+
625
/// Weak mode leaves only the SQL words when the input contains `--` comment
/// lines; the default mode yields six tokens.
#[test]
fn weak_mode_skips_generic_double_dash_comments() {
    let content = concat!(
        "-- first comment\n",
        "select one\n",
        "-- second comment\n",
        "from table\n",
    );
    let weak_options = Options {
        mode: Mode::Weak,
        ..Options::default()
    };

    let default_tokens = tokenize_for_detection(content, "sql", &Options::default());
    let weak_tokens = tokenize_for_detection(content, "sql", &weak_options);
    let weak_texts = token_slices(content, &weak_tokens);

    assert_eq!(default_tokens.len(), 6);
    assert_eq!(weak_texts, vec!["select", "one", "from", "table"]);
}
640
+
641
/// Weak mode leaves only section and key/value tokens when the INI input
/// contains `;` comment lines; the default mode yields eleven tokens.
#[test]
fn weak_mode_skips_generic_semicolon_comments() {
    let content = concat!(
        "; first comment\n",
        "[main]\n",
        "key=value\n",
        " ; second comment\n",
        "other=value\n",
    );
    let weak_options = Options {
        mode: Mode::Weak,
        ..Options::default()
    };

    let default_tokens = tokenize_for_detection(content, "ini", &Options::default());
    let weak_tokens = tokenize_for_detection(content, "ini", &weak_options);
    let weak_texts = token_slices(content, &weak_tokens);

    assert_eq!(default_tokens.len(), 11);
    assert_eq!(
        weak_texts,
        vec!["[", "main", "]", "key", "=", "value", "other", "=", "value"]
    );
}
659
+
660
/// Even in weak mode, a CSS selector that starts with `#` still yields both
/// of its tokens.
#[test]
fn generic_css_ids_are_not_treated_as_hash_comments() {
    let weak_options = Options {
        mode: Mode::Weak,
        ..Options::default()
    };

    let selector_tokens = tokenize_for_detection("#app .title\n", "css", &weak_options);

    assert_eq!(selector_tokens.len(), 2);
}
670
+
671
/// CSS-like input is split at punctuation, while `#app`, `.title`, `@base`
/// and `5%` each stay one token.
#[test]
fn css_like_tokenizer_splits_punctuation() {
    let source = "#app .title { color: saturate(@base, 5%); }";
    let detected = tokenize_for_detection(source, "css", &Options::default());
    let texts = token_slices(source, &detected);

    let expected = vec![
        "#app", ".title", "{", "color", ":", "saturate", "(", "@base", ",", "5%", ")", ";", "}",
    ];
    assert_eq!(texts, expected);
}
684
+
685
/// Code-like input is split at punctuation, with the two-character operators
/// `->` and `>=` each kept as one token.
#[test]
fn code_like_tokenizer_splits_punctuation_and_operators() {
    let source = "fn call<T>(x: i32) -> bool { x >= 1 }";
    let detected = tokenize_for_detection(source, "rust", &Options::default());
    let texts = token_slices(source, &detected);

    let expected = vec![
        "fn", "call", "<", "T", ">", "(", "x", ":", "i32", ")", "->", "bool", "{", "x", ">=",
        "1", "}",
    ];
    assert_eq!(texts, expected);
}
699
+
700
/// Every listed long-tail format splits a simple assignment-plus-call
/// statement into the same nine tokens.
#[test]
fn long_tail_code_like_formats_split_punctuation_and_operators() {
    let source = "value = call(item, 1);";
    let long_tail_formats = [
        "aspnet",
        "cfml",
        "cfscript",
        "clojure",
        "cmake",
        "coffeescript",
        "csv",
        "dot",
        "eiffel",
        "haml",
        "ini",
        "markup",
        "ocaml",
        "plsql",
        "purescript",
        "python",
        "qsharp",
        "rescript",
        "robotframework",
        "sparql",
        "tt2",
        "yaml",
    ];

    for format in long_tail_formats {
        let detected = tokenize_for_detection(source, format, &Options::default());
        let texts = token_slices(source, &detected);

        // The format name is the failure message, so a mismatch names the culprit.
        assert_eq!(
            texts,
            vec!["value", "=", "call", "(", "item", ",", "1", ")", ";"],
            "{format}"
        );
    }
}
737
+
738
/// In the default mode Twig keeps some whitespace tokens but never emits a
/// bare newline token.
#[test]
fn twig_mild_mode_keeps_prism_default_whitespace() {
    let source = concat!(
        "<p>a</p>\n",
        " <p>b</p>\n",
        "{% include \"helper.html\" %}\n",
    );
    let detected = tokenize_for_detection(source, "twig", &Options::default());
    let texts = token_slices(source, &detected);

    assert!(texts.contains(&"\n "));
    assert!(texts.contains(&" "));
    assert!(!texts.contains(&"\n"));
}
748
+
749
/// Weak mode yields strictly fewer tokens than the default mode for a
/// JavaScript statement followed by a line comment.
#[test]
fn weak_mode_skips_js_comments() {
    let source = "const a = 1; // comment\n";
    let weak_options = Options {
        mode: Mode::Weak,
        ..Options::default()
    };

    let default_tokens = tokenize_for_detection(source, "javascript", &Options::default());
    let weak_tokens = tokenize_for_detection(source, "javascript", &weak_options);

    assert!(weak_tokens.len() < default_tokens.len());
}
763
+
764
/// A JavaScript line comment is not one opaque token: `//` stands alone and
/// the words of the comment are tokens of their own.
#[test]
fn js_line_comments_split_into_comment_tokens() {
    let content = "// really an argument\nconst a = 1;\n";
    let detected = tokenize_for_detection(content, "javascript", &Options::default());

    let slash_prefixed: Vec<&str> = detected
        .iter()
        .map(|token| &content[token.range[0]..token.range[1]])
        .filter(|text| text.starts_with("//"))
        .collect();
    assert_eq!(slash_prefixed, vec!["//"]);

    let has_comment_word = detected
        .iter()
        .map(|token| &content[token.range[0]..token.range[1]])
        .any(|text| text == "really");
    assert!(has_comment_word);
}
783
+
784
/// A hashbang line is split into individual tokens; `#` hashes as a default
/// token and `!` as an operator.
#[test]
fn js_hashbang_splits_like_prism() {
    let content = "#!/usr/bin/env node\n'use strict';\n";
    let detected = tokenize_for_detection(content, "javascript", &Options::default());

    let all_texts = token_slices(content, &detected);
    let hashbang_texts: Vec<&str> = all_texts.iter().copied().take(9).collect();
    assert_eq!(
        hashbang_texts,
        vec!["#", "!", "/", "usr", "/", "bin", "/", "env", "node"]
    );

    assert_eq!(detected[0].hash, hash_token(TokenKind::Default, "#", false));
    assert_eq!(detected[1].hash, hash_token(TokenKind::Operator, "!", false));
}
800
+
801
/// Tokenizes a TypeScript template literal containing two `${...}`
/// interpolations with default options, then pins the total token count and
/// the start columns of selected tokens.
#[test]
fn splits_template_interpolation_like_prism() {
    let source = "const x = `a${b}c${d}e`;";
    let tokens = tokenize_for_detection(source, "typescript", &Options::default());
    let start_column = |index: usize| tokens[index].start.column;

    assert_eq!(tokens.len(), 13);
    assert_eq!(start_column(3), 11);
    assert_eq!(start_column(4), 13);
    assert_eq!(start_column(6), 16);
    assert_eq!(start_column(8), 18);
    assert_eq!(start_column(10), 21);
    assert_eq!(start_column(11), 22);
}
816
+
817
/// Tokenizes a TypeScript template literal whose interpolation holds a
/// ternary expression and checks that single-space slices sit between the
/// operands and the `?` / `:` operators in the token stream.
#[test]
fn keeps_template_interpolation_space_tokens_like_prism() {
    let source = "const x = `${store ? '[Store]' : '[No Store]'}`;";
    let tokens = tokenize_for_detection(source, "typescript", &Options::default());
    let slices = token_slices(source, &tokens);

    // True when the three given slices occur consecutively in the token stream.
    let contains_run = |expected: [&str; 3]| slices.windows(3).any(|run| run == expected);

    assert!(contains_run(["store", " ", "?"]));
    assert!(contains_run(["?", " ", "'[Store]'"]));
    assert!(contains_run(["'[Store]'", " ", ":"]));
    assert!(contains_run([":", " ", "'[No Store]'"]));
}
844
+
845
/// Tokenizes `a?.b` as TypeScript with default options and expects four
/// tokens, with tokens 1, 2 and 3 starting at columns 2, 3 and 4.
#[test]
fn splits_optional_chaining_like_prism() {
    let source = "a?.b";
    let tokens = tokenize_for_detection(source, "typescript", &Options::default());
    let start_column = |index: usize| tokens[index].start.column;

    assert_eq!(tokens.len(), 4);
    assert_eq!(start_column(1), 2);
    assert_eq!(start_column(2), 3);
    assert_eq!(start_column(3), 4);
}
853
+
854
/// Tokenizes a nested generic type alias as TypeScript with default options
/// and expects ten tokens, where token 8 spans columns 15 to 17 and token 9
/// starts at column 17.
// NOTE(review): going by the test name, token 8 is presumably the merged `>>`
// and token 9 the trailing `;` — confirm against the tokenizer.
#[test]
fn merges_adjacent_generic_closing_angles_like_prism() {
    let source = "type A = X<Y<Z>>;";
    let tokens = tokenize_for_detection(source, "typescript", &Options::default());

    assert_eq!(tokens.len(), 10);

    let closing = &tokens[8];
    assert_eq!(closing.start.column, 15);
    assert_eq!(closing.end.column, 17);
    assert_eq!(tokens[9].start.column, 17);
}
862
+
863
/// Tokenizes, as JavaScript, a snippet that starts with an `export type`
/// declaration and later contains a regex literal, and checks that the regex
/// literal is one token slice rather than a `/` slice followed by `Check`.
// NOTE(review): the `export type` line is presumably the "recoverable parse
// error" the test name refers to — confirm against the tokenizer.
#[test]
fn js_regex_after_recoverable_parse_error_stays_single_token() {
    let source = "export type Flags = {enabled: boolean};\n\
        export function normalize(str) {\n\
        return str.replace(/Check your code at .+?:\\d+/g, 'Check your code at **');\n\
        }\n";
    let tokens = tokenize_for_detection(source, "javascript", &Options::default());
    let slices = token_slices(source, &tokens);

    let whole_regex = "/Check your code at .+?:\\d+/g";
    assert!(slices.contains(&whole_regex));
    assert!(slices.windows(2).all(|pair| pair != ["/", "Check"]));
}
875
+
876
/// Tokenizes a TypeScript array literal holding a regex and checks that the
/// regex is split into six consecutive slices instead of appearing as a single
/// slice.
#[test]
fn typescript_array_regex_splits_like_prism() {
    let source = r#"const restrictions = [/\.css$/i];"#;
    let tokens = tokenize_for_detection(source, "typescript", &Options::default());
    let slices = token_slices(source, &tokens);

    let split_form = ["/", "\\", ".", "css$", "/", "i"];
    assert!(slices.windows(6).any(|run| run == split_form));

    let whole_regex = r#"/\.css$/i"#;
    assert!(slices.iter().all(|slice| *slice != whole_regex));
}
889
+
890
/// Tokenizes a JavaScript expression with two divisions and checks that `/`
/// appears as its own slice while `/ count /` never appears as a single slice.
#[test]
fn js_division_after_identifier_is_not_recovered_as_regex() {
    let source = "const ratio = total / count / scale;\n";
    let tokens = tokenize_for_detection(source, "javascript", &Options::default());
    let slices = token_slices(source, &tokens);

    let has_slice = |needle: &str| slices.iter().any(|slice| *slice == needle);

    assert!(has_slice("/"));
    assert!(!has_slice("/ count /"));
}
899
+
900
/// Tokenizes markup containing two `<!-- ... -->` comments with default
/// options and with `Mode::Weak`: the default run must yield five tokens, the
/// weak run three, and the weak slices must be exactly `alpha`, `beta`,
/// `gamma`.
#[test]
fn weak_mode_skips_generic_markup_comments() {
    let source = "<!-- comment -->\nalpha beta\n<!-- another -->\ngamma\n";
    let weak_options = Options {
        mode: crate::cli::Mode::Weak,
        ..Options::default()
    };
    let defaults = Options::default();

    let default_tokens = tokenize_for_detection(source, "markup", &defaults);
    let weak_tokens = tokenize_for_detection(source, "markup", &weak_options);

    assert_eq!(default_tokens.len(), 5);
    assert_eq!(weak_tokens.len(), 3);
    assert_eq!(token_slices(source, &weak_tokens), ["alpha", "beta", "gamma"]);
}