jscpd-rs 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. package/CHANGELOG.md +69 -0
  2. package/Cargo.lock +1323 -0
  3. package/Cargo.toml +54 -0
  4. package/LICENSE +21 -0
  5. package/README.md +372 -0
  6. package/docs/api-parity.md +49 -0
  7. package/docs/cloning-plan.md +281 -0
  8. package/docs/compat-baseline.md +535 -0
  9. package/docs/format-porting.md +86 -0
  10. package/docs/junior-task-template.md +62 -0
  11. package/docs/junior-workflow.md +87 -0
  12. package/docs/migrating-from-jscpd.md +193 -0
  13. package/docs/npm-release.md +116 -0
  14. package/docs/public-benchmark-suite.md +81 -0
  15. package/docs/release-checklist.md +200 -0
  16. package/docs/release-decisions.md +103 -0
  17. package/docs/release-readiness.md +51 -0
  18. package/docs/upstream-bugs.md +501 -0
  19. package/docs/upstream-issue-drafts.md +393 -0
  20. package/docs/user-guide.md +309 -0
  21. package/examples/dump_oxc_tokens.rs +112 -0
  22. package/examples/library_api.rs +42 -0
  23. package/npm/bin/jscpd-rs.js +6 -0
  24. package/npm/bin/jscpd-server.js +6 -0
  25. package/npm/lib/run-binary.js +68 -0
  26. package/npm/scripts/postinstall.js +50 -0
  27. package/package.json +53 -0
  28. package/skills/dry-refactoring/SKILL.md +63 -0
  29. package/skills/jscpd/SKILL.md +85 -0
  30. package/src/app.rs +512 -0
  31. package/src/bin/jscpd-server.rs +429 -0
  32. package/src/blame.rs +130 -0
  33. package/src/cli/config.rs +543 -0
  34. package/src/cli/parsing.rs +301 -0
  35. package/src/cli/tests.rs +543 -0
  36. package/src/cli.rs +671 -0
  37. package/src/detector/matching/secondary.rs +387 -0
  38. package/src/detector/matching.rs +274 -0
  39. package/src/detector/model.rs +190 -0
  40. package/src/detector/prepare.rs +71 -0
  41. package/src/detector/skip_local.rs +40 -0
  42. package/src/detector/statistics.rs +138 -0
  43. package/src/detector/store.rs +96 -0
  44. package/src/detector/tests.rs +238 -0
  45. package/src/detector.rs +265 -0
  46. package/src/files/discovery.rs +508 -0
  47. package/src/files/gitignore.rs +203 -0
  48. package/src/files/paths.rs +68 -0
  49. package/src/files/shebang.rs +106 -0
  50. package/src/files/tests.rs +523 -0
  51. package/src/files.rs +25 -0
  52. package/src/formats.rs +570 -0
  53. package/src/lib.rs +433 -0
  54. package/src/main.rs +26 -0
  55. package/src/report/ai.rs +125 -0
  56. package/src/report/badge.rs +238 -0
  57. package/src/report/console.rs +180 -0
  58. package/src/report/console_common.rs +37 -0
  59. package/src/report/console_full.rs +139 -0
  60. package/src/report/csv.rs +65 -0
  61. package/src/report/escape.rs +8 -0
  62. package/src/report/file_output.rs +28 -0
  63. package/src/report/html/assets.rs +47 -0
  64. package/src/report/html.rs +336 -0
  65. package/src/report/json.rs +119 -0
  66. package/src/report/markdown.rs +125 -0
  67. package/src/report/sarif.rs +302 -0
  68. package/src/report/silent.rs +22 -0
  69. package/src/report/source.rs +38 -0
  70. package/src/report/summary.rs +50 -0
  71. package/src/report/test_support.rs +133 -0
  72. package/src/report/threshold.rs +76 -0
  73. package/src/report/xcode.rs +90 -0
  74. package/src/report/xml.rs +119 -0
  75. package/src/report.rs +250 -0
  76. package/src/server/mcp.rs +942 -0
  77. package/src/server.rs +1081 -0
  78. package/src/tokenizer/apex.rs +97 -0
  79. package/src/tokenizer/blocks.rs +532 -0
  80. package/src/tokenizer/embedded.rs +106 -0
  81. package/src/tokenizer/generic.rs +511 -0
  82. package/src/tokenizer/hash.rs +27 -0
  83. package/src/tokenizer/ignore.rs +33 -0
  84. package/src/tokenizer/line_index.rs +33 -0
  85. package/src/tokenizer/markdown.rs +289 -0
  86. package/src/tokenizer/markup_attrs.rs +289 -0
  87. package/src/tokenizer/oxc/fallback.rs +275 -0
  88. package/src/tokenizer/oxc/jsx.rs +168 -0
  89. package/src/tokenizer/oxc/kind.rs +177 -0
  90. package/src/tokenizer/oxc/lexical.rs +67 -0
  91. package/src/tokenizer/oxc.rs +659 -0
  92. package/src/tokenizer/scan.rs +88 -0
  93. package/src/tokenizer/tap.rs +150 -0
  94. package/src/tokenizer/tests.rs +915 -0
  95. package/src/tokenizer.rs +328 -0
  96. package/src/verbose.rs +195 -0
@@ -0,0 +1,106 @@
1
+ use crate::cli::Options;
2
+
3
+ use super::generic::{generic_comment_span_end, scan_punctuation_split_token};
4
+ use super::{ByteSpan, DetectionToken, LineIndex, Location, TokenContext, TokenKind, push_token};
5
+
6
+ pub(super) fn blank_ranges_preserve_newlines(content: &str, ranges: &[[usize; 2]]) -> String {
7
+ if ranges.is_empty() {
8
+ return content.to_string();
9
+ }
10
+ let mut bytes = content.as_bytes().to_vec();
11
+ for [start, end] in ranges {
12
+ for byte in &mut bytes[*start..(*end).min(content.len())] {
13
+ if !matches!(*byte, b'\n' | b'\r') {
14
+ *byte = b' ';
15
+ }
16
+ }
17
+ }
18
+ String::from_utf8(bytes).unwrap_or_else(|_| content.to_string())
19
+ }
20
+
21
+ pub(super) fn offset_tokens(
22
+ tokens: &mut [DetectionToken],
23
+ offset: usize,
24
+ start_location: &Location,
25
+ ) {
26
+ for token in tokens {
27
+ offset_location(&mut token.start, offset, start_location);
28
+ offset_location(&mut token.end, offset, start_location);
29
+ token.range[0] += offset;
30
+ token.range[1] += offset;
31
+ }
32
+ }
33
+
34
+ pub(super) fn assign_sequential_positions(tokens: &mut [DetectionToken]) {
35
+ for (position, token) in tokens.iter_mut().enumerate() {
36
+ token.start.position = position;
37
+ token.end.position = position;
38
+ }
39
+ }
40
+
41
+ pub(super) fn tokenize_generic_with_whitespace(
42
+ content: &str,
43
+ format: &str,
44
+ options: &Options,
45
+ ignore_regions: &[[usize; 2]],
46
+ ) -> Vec<DetectionToken> {
47
+ let context = TokenContext {
48
+ content,
49
+ options,
50
+ ignore_regions,
51
+ };
52
+ let line_index = LineIndex::new(content);
53
+ let mut tokens = Vec::new();
54
+ let mut start_byte = 0usize;
55
+
56
+ while start_byte < content.len() {
57
+ let ch = content[start_byte..].chars().next().unwrap_or('\0');
58
+ let (end_byte, kind) = if ch.is_whitespace() {
59
+ (scan_whitespace(content, start_byte), TokenKind::Default)
60
+ } else if let Some(comment_end) =
61
+ generic_comment_span_end(content, format, start_byte, content.len())
62
+ {
63
+ (comment_end, TokenKind::Comment)
64
+ } else {
65
+ scan_punctuation_split_token(content, format, start_byte)
66
+ };
67
+ push_token(
68
+ &mut tokens,
69
+ &context,
70
+ kind,
71
+ ByteSpan {
72
+ start: start_byte,
73
+ end: end_byte,
74
+ },
75
+ line_index.location(start_byte),
76
+ line_index.location(end_byte),
77
+ );
78
+ start_byte = end_byte.max(start_byte + ch.len_utf8());
79
+ }
80
+
81
+ tokens
82
+ }
83
+
84
+ fn offset_location(location: &mut Location, offset: usize, start_location: &Location) {
85
+ if location.line == 1 {
86
+ location.column += start_location.column.saturating_sub(1);
87
+ }
88
+ location.line += start_location.line.saturating_sub(1);
89
+ location.position += offset;
90
+ }
91
+
92
/// Returns the end offset of the whitespace token beginning at `start`.
///
/// A `\n` at `start` is a token of its own. Otherwise the token extends over
/// consecutive whitespace characters and stops before the next `\n` or the
/// first non-whitespace character.
fn scan_whitespace(content: &str, start: usize) -> usize {
    if content.as_bytes()[start] == b'\n' {
        return start + 1;
    }
    let run_len: usize = content[start..]
        .chars()
        .take_while(|ch| *ch != '\n' && ch.is_whitespace())
        .map(char::len_utf8)
        .sum();
    start + run_len
}
@@ -0,0 +1,511 @@
1
+ use crate::cli::{Mode, Options};
2
+
3
+ use super::scan::scan_block_comment;
4
+ use super::{
5
+ ByteSpan, DetectionToken, LineIndex, TokenContext, TokenKind, push_strict_whitespace_tokens,
6
+ push_token,
7
+ };
8
+
9
+ pub(super) fn tokenize_generic(
10
+ content: &str,
11
+ format: &str,
12
+ options: &Options,
13
+ ignore_regions: &[[usize; 2]],
14
+ ) -> Vec<DetectionToken> {
15
+ let context = TokenContext {
16
+ content,
17
+ options,
18
+ ignore_regions,
19
+ };
20
+ let line_index = LineIndex::new(content);
21
+ let mut tokens = Vec::new();
22
+ let mut start_byte = 0usize;
23
+
24
+ while start_byte < content.len() {
25
+ let ch = content[start_byte..].chars().next().unwrap_or('\0');
26
+ if ch.is_whitespace() {
27
+ let whitespace_end = scan_whitespace(content, start_byte);
28
+ if options.mode == Mode::Strict {
29
+ push_strict_whitespace_tokens(
30
+ &mut tokens,
31
+ &context,
32
+ ByteSpan {
33
+ start: start_byte,
34
+ end: whitespace_end,
35
+ },
36
+ &line_index,
37
+ );
38
+ } else if format == "twig"
39
+ && twig_keeps_mild_whitespace(content, start_byte, whitespace_end)
40
+ {
41
+ // Prism's Twig grammar labels these spans as `default`, so
42
+ // upstream mild mode keeps them while filtering empty/new_line.
43
+ push_token(
44
+ &mut tokens,
45
+ &context,
46
+ TokenKind::Default,
47
+ ByteSpan {
48
+ start: start_byte,
49
+ end: whitespace_end,
50
+ },
51
+ line_index.location(start_byte),
52
+ line_index.location(whitespace_end),
53
+ );
54
+ }
55
+ start_byte = whitespace_end.max(start_byte + ch.len_utf8());
56
+ continue;
57
+ }
58
+
59
+ let (end_byte, kind) = if let Some((special_end, special_kind)) =
60
+ generic_multiline_span_end(content, format, start_byte, content.len())
61
+ {
62
+ (special_end, special_kind)
63
+ } else if let Some(comment_end) =
64
+ generic_comment_span_end(content, format, start_byte, content.len())
65
+ {
66
+ (comment_end, TokenKind::Comment)
67
+ } else if format == "yaml" && matches!(ch, '"' | '\'') {
68
+ (scan_quoted_string(content, start_byte), TokenKind::String)
69
+ } else if punctuation_split_format(format) {
70
+ scan_punctuation_split_token(content, format, start_byte)
71
+ } else {
72
+ (scan_generic_token(content, start_byte), TokenKind::Default)
73
+ };
74
+ push_token(
75
+ &mut tokens,
76
+ &context,
77
+ kind,
78
+ ByteSpan {
79
+ start: start_byte,
80
+ end: end_byte,
81
+ },
82
+ line_index.location(start_byte),
83
+ line_index.location(end_byte),
84
+ );
85
+ start_byte = end_byte.max(start_byte + ch.len_utf8());
86
+ }
87
+
88
+ tokens
89
+ }
90
+
91
+ pub(super) fn scan_generic_token(content: &str, start: usize) -> usize {
92
+ let mut end = start;
93
+ while end < content.len() {
94
+ let ch = content[end..].chars().next().unwrap_or('\0');
95
+ if ch.is_whitespace() {
96
+ break;
97
+ }
98
+ end += ch.len_utf8();
99
+ }
100
+ end
101
+ }
102
+
103
+ pub(super) fn scan_punctuation_split_token(
104
+ content: &str,
105
+ format: &str,
106
+ start: usize,
107
+ ) -> (usize, TokenKind) {
108
+ let ch = content[start..].chars().next().unwrap_or('\0');
109
+ if is_split_punctuation(format, ch) {
110
+ return (start + ch.len_utf8(), TokenKind::Punctuation);
111
+ }
112
+ if code_like_format(format) && is_operator_start(ch) {
113
+ return (scan_operator_token(content, start), TokenKind::Operator);
114
+ }
115
+
116
+ let mut end = start;
117
+ while end < content.len() {
118
+ let ch = content[end..].chars().next().unwrap_or('\0');
119
+ if ch.is_whitespace()
120
+ || is_split_punctuation(format, ch)
121
+ || (code_like_format(format) && is_operator_start(ch))
122
+ {
123
+ break;
124
+ }
125
+ end += ch.len_utf8();
126
+ }
127
+ (end, TokenKind::Default)
128
+ }
129
+
130
+ fn scan_operator_token(content: &str, start: usize) -> usize {
131
+ let mut end = start;
132
+ while end < content.len() {
133
+ let ch = content[end..].chars().next().unwrap_or('\0');
134
+ if !is_operator_start(ch) {
135
+ break;
136
+ }
137
+ end += ch.len_utf8();
138
+ }
139
+ end
140
+ }
141
+
142
/// Returns the end offset of the quoted string whose opening quote is the
/// character at `start`.
///
/// A backslash makes the following character literal. The scan stops just
/// after the first unescaped matching quote, `\n`, or `\r` (that character
/// is included), or at the end of `content` when none is found.
fn scan_quoted_string(content: &str, start: usize) -> usize {
    let quote = content[start..].chars().next().unwrap_or('\0');
    let body_start = start + quote.len_utf8();
    if body_start >= content.len() {
        return body_start;
    }

    let mut skip_next = false;
    for (offset, ch) in content[body_start..].char_indices() {
        if skip_next {
            // This character was escaped by the preceding backslash.
            skip_next = false;
            continue;
        }
        match ch {
            '\\' => skip_next = true,
            terminator if terminator == quote || terminator == '\n' || terminator == '\r' => {
                return body_start + offset + terminator.len_utf8();
            }
            _ => {}
        }
    }
    content.len()
}
163
+
164
+ fn generic_multiline_span_end(
165
+ content: &str,
166
+ format: &str,
167
+ start: usize,
168
+ limit: usize,
169
+ ) -> Option<(usize, TokenKind)> {
170
+ match format {
171
+ "haml" => haml_multiline_comment_span_end(content, start, limit)
172
+ .map(|end| (end, TokenKind::Comment)),
173
+ "pug" => pug_dot_block_span_end(content, start, limit).map(|end| (end, TokenKind::Default)),
174
+ _ => None,
175
+ }
176
+ }
177
+
178
+ fn haml_multiline_comment_span_end(content: &str, start: usize, limit: usize) -> Option<usize> {
179
+ let bytes = content.as_bytes();
180
+ let line_start = line_start(bytes, start);
181
+ if !line_prefix_is_indent(bytes, line_start, start) {
182
+ return None;
183
+ }
184
+
185
+ let rest = &bytes[start..limit];
186
+ if !(rest.starts_with(b"-#") || rest.starts_with(b"/")) {
187
+ return None;
188
+ }
189
+
190
+ Some(scan_indented_block_end(
191
+ bytes, line_start, start, limit, false,
192
+ ))
193
+ }
194
+
195
+ fn pug_dot_block_span_end(content: &str, start: usize, limit: usize) -> Option<usize> {
196
+ let bytes = content.as_bytes();
197
+ let line_start = line_start(bytes, start);
198
+ if !line_prefix_is_indent(bytes, line_start, start) {
199
+ return None;
200
+ }
201
+
202
+ let line_end = line_content_end(bytes, start, limit);
203
+ if !is_pug_dot_block_opener(&content[start..line_end]) {
204
+ return None;
205
+ }
206
+
207
+ let end = scan_indented_block_end(bytes, line_start, start, limit, true);
208
+ (end > line_end).then_some(end)
209
+ }
210
+
211
+ fn scan_indented_block_end(
212
+ bytes: &[u8],
213
+ line_start: usize,
214
+ start: usize,
215
+ limit: usize,
216
+ include_blank_lines: bool,
217
+ ) -> usize {
218
+ let base_indent = start.saturating_sub(line_start);
219
+ let mut end = line_content_end(bytes, start, limit);
220
+ let mut next_start = next_line_start(bytes, end, limit);
221
+
222
+ while next_start < limit {
223
+ let line_end = line_content_end(bytes, next_start, limit);
224
+ let indent_end = scan_indent(bytes, next_start, line_end);
225
+ let is_blank = indent_end == line_end;
226
+ let is_child = indent_end.saturating_sub(next_start) > base_indent;
227
+ if is_child || (include_blank_lines && is_blank) {
228
+ end = line_end;
229
+ next_start = next_line_start(bytes, line_end, limit);
230
+ } else {
231
+ break;
232
+ }
233
+ }
234
+
235
+ end
236
+ }
237
+
238
/// Reports whether `line` opens a Pug dot block.
///
/// After trailing spaces and tabs are removed the line must end with `.`.
/// The text before that dot must be non-empty, must not be `script`
/// (compared ASCII case-insensitively), and may contain only ASCII
/// alphanumerics and the characters `_`, `-`, `#`, `.`.
fn is_pug_dot_block_opener(line: &str) -> bool {
    let opener = line.trim_end_matches(|ch: char| ch == ' ' || ch == '\t');
    match opener.strip_suffix('.') {
        None => false,
        Some(head) if head.is_empty() || head.eq_ignore_ascii_case("script") => false,
        Some(head) => head
            .bytes()
            .all(|byte| matches!(byte, b'_' | b'-' | b'#' | b'.') || byte.is_ascii_alphanumeric()),
    }
}
249
+
250
+ fn is_split_punctuation(format: &str, ch: char) -> bool {
251
+ matches!(ch, '{' | '}' | '(' | ')' | '[' | ']' | ':' | ';' | ',')
252
+ || (code_like_format(format) && ch == '.')
253
+ }
254
+
255
/// Reports whether `ch` is one of the operator characters
/// `+ - * / % = ! < > & | ^ ~ ?`.
fn is_operator_start(ch: char) -> bool {
    const OPERATOR_CHARS: &str = "+-*/%=!<>&|^~?";
    OPERATOR_CHARS.contains(ch)
}
261
+
262
+ pub(super) fn scan_whitespace(content: &str, start: usize) -> usize {
263
+ let mut end = start;
264
+ while end < content.len() {
265
+ let ch = content[end..].chars().next().unwrap_or('\0');
266
+ if !ch.is_whitespace() {
267
+ break;
268
+ }
269
+ end += ch.len_utf8();
270
+ }
271
+ end
272
+ }
273
+
274
+ fn twig_keeps_mild_whitespace(content: &str, start: usize, end: usize) -> bool {
275
+ if start >= end {
276
+ return false;
277
+ }
278
+
279
+ let has_newline = content[start..end].bytes().any(|byte| byte == b'\n');
280
+ if !has_newline {
281
+ return previous_non_whitespace(content, start).is_some()
282
+ && next_non_whitespace(content, end).is_some();
283
+ }
284
+
285
+ matches!(
286
+ (
287
+ previous_non_whitespace(content, start),
288
+ next_non_whitespace(content, end)
289
+ ),
290
+ (Some(b'>'), Some(b'<'))
291
+ )
292
+ }
293
+
294
/// Returns the last byte of `content[..end]` that is not ASCII whitespace,
/// or `None` when there is no such byte.
fn previous_non_whitespace(content: &str, end: usize) -> Option<u8> {
    let head = &content[..end];
    head.bytes().rfind(|byte| !byte.is_ascii_whitespace())
}
300
+
301
/// Returns the first byte of `content[start..]` that is not ASCII
/// whitespace, or `None` when there is no such byte.
fn next_non_whitespace(content: &str, start: usize) -> Option<u8> {
    let tail = &content[start..];
    tail.as_bytes()
        .iter()
        .copied()
        .find(|byte| !byte.is_ascii_whitespace())
}
306
+
307
+ pub(super) fn generic_comment_span_end(
308
+ content: &str,
309
+ format: &str,
310
+ start: usize,
311
+ limit: usize,
312
+ ) -> Option<usize> {
313
+ let bytes = content.as_bytes();
314
+ let rest = &bytes[start..limit];
315
+ if rest.starts_with(b"<!--") {
316
+ return Some(scan_html_comment(bytes, start, limit));
317
+ }
318
+ if rest.starts_with(b"/*") {
319
+ return Some(scan_block_comment(bytes, start, limit));
320
+ }
321
+ if rest.starts_with(b"//") {
322
+ return Some(scan_to_line_end(bytes, start, limit));
323
+ }
324
+ if rest.starts_with(b"--") && generic_double_dash_comment_format(format) {
325
+ return Some(scan_to_line_end(bytes, start, limit));
326
+ }
327
+ if bytes[start] == b'#' && generic_hash_comment_format(format) {
328
+ return Some(scan_to_line_end(bytes, start, limit));
329
+ }
330
+ if bytes[start] == b';' && generic_semicolon_comment_format(format) {
331
+ return Some(scan_to_line_end(bytes, start, limit));
332
+ }
333
+ None
334
+ }
335
+
336
/// Reports whether `format` treats `#` as the start of a line comment.
fn generic_hash_comment_format(format: &str) -> bool {
    const HASH_COMMENT_FORMATS: &[&str] = &[
        "apacheconf",
        "applescript",
        "bash",
        "cmake",
        "docker",
        "editorconfig",
        "git",
        "ignore",
        "ini",
        "julia",
        "makefile",
        "nginx",
        "nix",
        "perl",
        "powershell",
        "properties",
        "python",
        "r",
        "ruby",
        "shell-session",
        "tcl",
        "toml",
        "vim",
        "yaml",
    ];
    HASH_COMMENT_FORMATS.contains(&format)
}
365
+
366
/// Reports whether `format` treats `--` as the start of a line comment.
fn generic_double_dash_comment_format(format: &str) -> bool {
    const DOUBLE_DASH_FORMATS: &[&str] = &[
        "ada",
        "applescript",
        "elm",
        "haskell",
        "lua",
        "plsql",
        "sql",
    ];
    DOUBLE_DASH_FORMATS.contains(&format)
}
372
+
373
/// Reports whether `format` treats `;` as the start of a line comment.
fn generic_semicolon_comment_format(format: &str) -> bool {
    const SEMICOLON_FORMATS: &[&str] = &[
        "asm6502",
        "autoit",
        "autohotkey",
        "clojure",
        "ini",
        "lisp",
        "llvm",
        "nasm",
        "racket",
        "scheme",
    ];
    SEMICOLON_FORMATS.contains(&format)
}
388
+
389
+ fn punctuation_split_format(format: &str) -> bool {
390
+ css_like_format(format) || code_like_format(format)
391
+ }
392
+
393
/// Reports whether `format` is one of the stylesheet formats
/// `css`, `less`, `sass`, `scss`, `stylus`.
fn css_like_format(format: &str) -> bool {
    const CSS_LIKE_FORMATS: &[&str] = &["css", "less", "sass", "scss", "stylus"];
    CSS_LIKE_FORMATS.contains(&format)
}
396
+
397
/// Reports whether `format` is in the list of formats that the generic
/// tokenizer treats as code-like (operators and `.` split tokens).
fn code_like_format(format: &str) -> bool {
    const CODE_LIKE_FORMATS: &[&str] = &[
        "ada",
        "apex",
        "aspnet",
        "c",
        "c-header",
        "clike",
        "clojure",
        "cmake",
        "coffeescript",
        "cpp",
        "cpp-header",
        "csharp",
        "csv",
        "cfml",
        "cfscript",
        "dart",
        "dot",
        "eiffel",
        "go",
        "haml",
        "ini",
        "java",
        "kotlin",
        "haxe",
        "markup",
        "objectivec",
        "ocaml",
        "perl",
        "php",
        "plsql",
        "properties",
        "purescript",
        "python",
        "qsharp",
        "r",
        "rescript",
        "robotframework",
        "rust",
        "scala",
        "solidity",
        "sparql",
        "swift",
        "tcl",
        "tt2",
        "turtle",
        "twig",
        "verilog",
        "wgsl",
        "yaml",
        "zig",
    ];
    CODE_LIKE_FORMATS.contains(&format)
}
452
+
453
/// Returns the offset of the first `\n` in `start..limit`, or `limit` when
/// there is none. When `start` is already past `limit`, `start` is returned.
fn scan_to_line_end(bytes: &[u8], start: usize, limit: usize) -> usize {
    (start..limit)
        .find(|&idx| bytes[idx] == b'\n')
        .unwrap_or(start.max(limit))
}
460
+
461
/// Returns the offset at which the line containing `start` begins: one past
/// the nearest `\n` or `\r` before `start`, or 0 when there is none.
fn line_start(bytes: &[u8], start: usize) -> usize {
    (0..start)
        .rev()
        .find(|&idx| matches!(bytes[idx], b'\n' | b'\r'))
        .map_or(0, |terminator| terminator + 1)
}
468
+
469
/// Reports whether `bytes[line_start..start]` consists solely of spaces and
/// tabs (an empty prefix counts as indentation).
fn line_prefix_is_indent(bytes: &[u8], line_start: usize, start: usize) -> bool {
    let prefix = &bytes[line_start..start];
    !prefix.iter().any(|&byte| byte != b' ' && byte != b'\t')
}
474
+
475
/// Returns the offset of the first `\n` or `\r` in `start..limit`, or
/// `limit` when there is none. When `start` is already past `limit`, `start`
/// is returned.
fn line_content_end(bytes: &[u8], start: usize, limit: usize) -> usize {
    (start..limit)
        .find(|&idx| matches!(bytes[idx], b'\n' | b'\r'))
        .unwrap_or(start.max(limit))
}
482
+
483
/// Returns the offset of the line following the terminator at `line_end`.
///
/// A `\r\n` pair lying within `limit` is skipped as a unit; any other byte
/// at `line_end` is skipped alone. When `line_end` is at or past `limit`,
/// `limit` is returned.
fn next_line_start(bytes: &[u8], line_end: usize, limit: usize) -> usize {
    if line_end >= limit {
        return limit;
    }
    let is_crlf =
        bytes[line_end] == b'\r' && line_end + 1 < limit && bytes[line_end + 1] == b'\n';
    line_end + 1 + usize::from(is_crlf)
}
493
+
494
/// Returns the offset of the first byte in `start..limit` that is neither a
/// space nor a tab, or `limit` when there is none. When `start` is already
/// past `limit`, `start` is returned.
fn scan_indent(bytes: &[u8], start: usize, limit: usize) -> usize {
    (start..limit)
        .find(|&idx| !matches!(bytes[idx], b' ' | b'\t'))
        .unwrap_or(start.max(limit))
}
501
+
502
/// Returns the offset just past the `-->` that closes the HTML comment
/// opened at `start`, or `limit` when no terminator lies fully within
/// `limit`.
///
/// The search begins after the four-byte `<!--` opener.
fn scan_html_comment(bytes: &[u8], start: usize, limit: usize) -> usize {
    let body_start = start + 4;
    // Last index at which a complete three-byte "-->" still fits in `limit`.
    let search_end = limit.saturating_sub(2);
    (body_start..search_end)
        .find(|&idx| bytes[idx] == b'-' && bytes[idx + 1] == b'-' && bytes[idx + 2] == b'>')
        .map_or(limit, |idx| idx + 3)
}
@@ -0,0 +1,27 @@
1
+ use xxhash_rust::xxh3::xxh3_64;
2
+
3
+ use super::TokenKind;
4
+
5
+ pub(super) fn hash_token(kind: TokenKind, value: &str, ignore_case: bool) -> u64 {
6
+ let kind_hash = match kind {
7
+ TokenKind::Comment => 0x01_u64,
8
+ TokenKind::Constant => 0x08_u64,
9
+ TokenKind::Empty => 0x09_u64,
10
+ TokenKind::Keyword => 0x02_u64,
11
+ TokenKind::NewLine => 0x0a_u64,
12
+ TokenKind::Number => 0x03_u64,
13
+ TokenKind::Operator => 0x04_u64,
14
+ TokenKind::Punctuation => 0x05_u64,
15
+ TokenKind::String => 0x06_u64,
16
+ TokenKind::Default => 0x07_u64,
17
+ };
18
+ hash_value(value, ignore_case) ^ kind_hash
19
+ }
20
+
21
+ fn hash_value(value: &str, ignore_case: bool) -> u64 {
22
+ if ignore_case {
23
+ xxh3_64(value.to_lowercase().as_bytes())
24
+ } else {
25
+ xxh3_64(value.as_bytes())
26
+ }
27
+ }
@@ -0,0 +1,33 @@
1
+ use crate::cli::Options;
2
+
3
+ pub(super) fn find_ignore_regions(content: &str, options: &Options) -> Vec<[usize; 2]> {
4
+ let mut regions = Vec::new();
5
+ let start_marker = "jscpd:ignore-start";
6
+ let end_marker = "jscpd:ignore-end";
7
+ let mut search_from = 0;
8
+
9
+ while let Some(marker_start) = content[search_from..].find(start_marker) {
10
+ let marker_start = search_from + marker_start;
11
+ let line_start = content[..marker_start]
12
+ .rfind('\n')
13
+ .map(|idx| idx + 1)
14
+ .unwrap_or(0);
15
+ let after_start = marker_start + start_marker.len();
16
+ let Some(marker_end_rel) = content[after_start..].find(end_marker) else {
17
+ break;
18
+ };
19
+ let marker_end = after_start + marker_end_rel;
20
+ let line_end = content[marker_end..]
21
+ .find('\n')
22
+ .map(|idx| marker_end + idx)
23
+ .unwrap_or(content.len());
24
+ regions.push([line_start, line_end]);
25
+ search_from = line_end;
26
+ }
27
+
28
+ for pattern in &options.ignore_pattern {
29
+ regions.extend(pattern.find_iter(content).map(|m| [m.start(), m.end()]));
30
+ }
31
+
32
+ regions
33
+ }
@@ -0,0 +1,33 @@
1
+ use super::Location;
2
+
3
+ pub(super) struct LineIndex {
4
+ newlines: Vec<usize>,
5
+ }
6
+
7
+ impl LineIndex {
8
+ pub(super) fn new(content: &str) -> Self {
9
+ Self {
10
+ newlines: content
11
+ .bytes()
12
+ .enumerate()
13
+ .filter_map(|(idx, byte)| (byte == b'\n').then_some(idx))
14
+ .collect(),
15
+ }
16
+ }
17
+
18
+ pub(super) fn location(&self, offset: usize) -> Location {
19
+ let previous_newlines = self
20
+ .newlines
21
+ .partition_point(|newline_offset| *newline_offset < offset);
22
+ let line_start = if previous_newlines == 0 {
23
+ 0
24
+ } else {
25
+ self.newlines[previous_newlines - 1] + 1
26
+ };
27
+ Location {
28
+ line: previous_newlines + 1,
29
+ column: offset - line_start + 1,
30
+ position: offset,
31
+ }
32
+ }
33
+ }