jscpd-rs 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. package/CHANGELOG.md +69 -0
  2. package/Cargo.lock +1323 -0
  3. package/Cargo.toml +54 -0
  4. package/LICENSE +21 -0
  5. package/README.md +372 -0
  6. package/docs/api-parity.md +49 -0
  7. package/docs/cloning-plan.md +281 -0
  8. package/docs/compat-baseline.md +535 -0
  9. package/docs/format-porting.md +86 -0
  10. package/docs/junior-task-template.md +62 -0
  11. package/docs/junior-workflow.md +87 -0
  12. package/docs/migrating-from-jscpd.md +193 -0
  13. package/docs/npm-release.md +116 -0
  14. package/docs/public-benchmark-suite.md +81 -0
  15. package/docs/release-checklist.md +200 -0
  16. package/docs/release-decisions.md +103 -0
  17. package/docs/release-readiness.md +51 -0
  18. package/docs/upstream-bugs.md +501 -0
  19. package/docs/upstream-issue-drafts.md +393 -0
  20. package/docs/user-guide.md +309 -0
  21. package/examples/dump_oxc_tokens.rs +112 -0
  22. package/examples/library_api.rs +42 -0
  23. package/npm/bin/jscpd-rs.js +6 -0
  24. package/npm/bin/jscpd-server.js +6 -0
  25. package/npm/lib/run-binary.js +68 -0
  26. package/npm/scripts/postinstall.js +50 -0
  27. package/package.json +53 -0
  28. package/skills/dry-refactoring/SKILL.md +63 -0
  29. package/skills/jscpd/SKILL.md +85 -0
  30. package/src/app.rs +512 -0
  31. package/src/bin/jscpd-server.rs +429 -0
  32. package/src/blame.rs +130 -0
  33. package/src/cli/config.rs +543 -0
  34. package/src/cli/parsing.rs +301 -0
  35. package/src/cli/tests.rs +543 -0
  36. package/src/cli.rs +671 -0
  37. package/src/detector/matching/secondary.rs +387 -0
  38. package/src/detector/matching.rs +274 -0
  39. package/src/detector/model.rs +190 -0
  40. package/src/detector/prepare.rs +71 -0
  41. package/src/detector/skip_local.rs +40 -0
  42. package/src/detector/statistics.rs +138 -0
  43. package/src/detector/store.rs +96 -0
  44. package/src/detector/tests.rs +238 -0
  45. package/src/detector.rs +265 -0
  46. package/src/files/discovery.rs +508 -0
  47. package/src/files/gitignore.rs +203 -0
  48. package/src/files/paths.rs +68 -0
  49. package/src/files/shebang.rs +106 -0
  50. package/src/files/tests.rs +523 -0
  51. package/src/files.rs +25 -0
  52. package/src/formats.rs +570 -0
  53. package/src/lib.rs +433 -0
  54. package/src/main.rs +26 -0
  55. package/src/report/ai.rs +125 -0
  56. package/src/report/badge.rs +238 -0
  57. package/src/report/console.rs +180 -0
  58. package/src/report/console_common.rs +37 -0
  59. package/src/report/console_full.rs +139 -0
  60. package/src/report/csv.rs +65 -0
  61. package/src/report/escape.rs +8 -0
  62. package/src/report/file_output.rs +28 -0
  63. package/src/report/html/assets.rs +47 -0
  64. package/src/report/html.rs +336 -0
  65. package/src/report/json.rs +119 -0
  66. package/src/report/markdown.rs +125 -0
  67. package/src/report/sarif.rs +302 -0
  68. package/src/report/silent.rs +22 -0
  69. package/src/report/source.rs +38 -0
  70. package/src/report/summary.rs +50 -0
  71. package/src/report/test_support.rs +133 -0
  72. package/src/report/threshold.rs +76 -0
  73. package/src/report/xcode.rs +90 -0
  74. package/src/report/xml.rs +119 -0
  75. package/src/report.rs +250 -0
  76. package/src/server/mcp.rs +942 -0
  77. package/src/server.rs +1081 -0
  78. package/src/tokenizer/apex.rs +97 -0
  79. package/src/tokenizer/blocks.rs +532 -0
  80. package/src/tokenizer/embedded.rs +106 -0
  81. package/src/tokenizer/generic.rs +511 -0
  82. package/src/tokenizer/hash.rs +27 -0
  83. package/src/tokenizer/ignore.rs +33 -0
  84. package/src/tokenizer/line_index.rs +33 -0
  85. package/src/tokenizer/markdown.rs +289 -0
  86. package/src/tokenizer/markup_attrs.rs +289 -0
  87. package/src/tokenizer/oxc/fallback.rs +275 -0
  88. package/src/tokenizer/oxc/jsx.rs +168 -0
  89. package/src/tokenizer/oxc/kind.rs +177 -0
  90. package/src/tokenizer/oxc/lexical.rs +67 -0
  91. package/src/tokenizer/oxc.rs +659 -0
  92. package/src/tokenizer/scan.rs +88 -0
  93. package/src/tokenizer/tap.rs +150 -0
  94. package/src/tokenizer/tests.rs +915 -0
  95. package/src/tokenizer.rs +328 -0
  96. package/src/verbose.rs +195 -0
@@ -0,0 +1,238 @@
1
+ use std::collections::BTreeSet;
2
+
3
+ use crate::cli::Options;
4
+ use crate::files::SourceFile;
5
+ use crate::tokenizer::Location;
6
+
7
+ use super::{
8
+ CloneMatch, Fragment, dedup_exact_clones, detect, detect_prepared_drafts, prepare_source_drafts,
9
+ };
10
+
11
+ fn detection_options() -> Options {
12
+ Options {
13
+ min_tokens: 3,
14
+ min_lines: 0,
15
+ ..Options::default()
16
+ }
17
+ }
18
+
19
/// The same line appearing in two JavaScript sources must yield at least one clone.
#[test]
fn detects_cross_file_duplicates() {
    let content = "alpha beta gamma delta epsilon\n";
    // The second file embeds the shared line between unrelated lines.
    let wrapped = format!("prefix\n{content}\nsuffix\n");
    let inputs = vec![source("a.js", content), source("b.js", &wrapped)];

    let outcome = detect(inputs, &detection_options());

    assert!(!outcome.clones.is_empty());
}
31
+
32
/// Same scenario as the cross-file test, but with both sources tagged as `css`.
#[test]
fn detects_generic_format_duplicates() {
    let content = "alpha beta gamma delta epsilon\n";
    let wrapped = format!("prefix\n{content}\nsuffix\n");
    let inputs = vec![
        source_with_format("a.css", "css", content),
        source_with_format("b.css", "css", &wrapped),
    ];

    let outcome = detect(inputs, &detection_options());

    assert!(!outcome.clones.is_empty());
}
44
+
45
/// With `skip_local` enabled and a single configured path, identical files in
/// two sub-directories of that path are expected to produce no clones.
#[test]
fn skip_local_skips_clones_inside_same_configured_root() {
    let content = "alpha beta gamma delta epsilon\n";
    let inputs = vec![
        source("project/dir1/a.js", content),
        source("project/dir2/b.js", content),
    ];
    let options = Options {
        paths: vec!["project".into()],
        skip_local: true,
        min_tokens: 3,
        min_lines: 0,
        ..Options::default()
    };

    let outcome = detect(inputs, &options);

    assert!(outcome.clones.is_empty());
}
64
+
65
/// With `skip_local` enabled and two configured paths, identical files that
/// live under different paths are still expected to be reported.
#[test]
fn skip_local_keeps_clones_across_configured_roots() {
    let content = "alpha beta gamma delta epsilon\n";
    let left = source("left/a.js", content);
    let right = source("right/b.js", content);
    let options = Options {
        paths: vec!["left".into(), "right".into()],
        skip_local: true,
        min_tokens: 3,
        min_lines: 0,
        ..Options::default()
    };

    let outcome = detect(vec![left, right], &options);

    assert!(!outcome.clones.is_empty());
}
81
+
82
/// The fixture wraps its only content in `jscpd:ignore-start` /
/// `jscpd:ignore-end` comments; the test expects it to contribute neither a
/// source summary nor a source count.
#[test]
fn skips_empty_token_sources_in_statistics() {
    let content = "// jscpd:ignore-start\nignored\n// jscpd:ignore-end\n";
    let inputs = vec![source("ignored.js", content)];
    let defaults = Options::default();

    let outcome = detect(inputs, &defaults);

    assert_eq!(outcome.sources.len(), 0);
    assert_eq!(outcome.statistics.total.sources, 0);
}
91
+
92
/// Running `detect` directly and running `detect_prepared_drafts` on the output
/// of `prepare_source_drafts` must agree on clone count, source/clone totals,
/// and the set of source-content keys.
#[test]
fn prepared_drafts_detection_matches_direct_detection() {
    let content = "alpha beta gamma delta epsilon\n";
    let wrapped = format!("prefix\n{content}\nsuffix\n");
    let files = vec![source("a.js", content), source("b.js", &wrapped)];
    let options = Options {
        reporters: vec!["json".to_string()],
        ..detection_options()
    };

    let direct = detect(files.clone(), &options);
    let drafts = prepare_source_drafts(files, &options);
    let prepared = detect_prepared_drafts(drafts, &options);

    assert_eq!(prepared.clones.len(), direct.clones.len());

    let prepared_totals = &prepared.statistics.total;
    let direct_totals = &direct.statistics.total;
    assert_eq!(prepared_totals.sources, direct_totals.sources);
    assert_eq!(prepared_totals.clones, direct_totals.clones);

    // Compare key sets through an ordered set so map iteration order is irrelevant.
    let prepared_keys: BTreeSet<_> = prepared.source_contents.keys().collect();
    let direct_keys: BTreeSet<_> = direct.source_contents.keys().collect();
    assert_eq!(prepared_keys, direct_keys);
}
121
+
122
// Single-file TypeScript fixture with two near-identical functions that differ
// in the `console.log` / `console.error` member name and the `'log'` / `'error'`
// method string. Detection runs with min_tokens = 50 and min_lines = 5.
+ #[test]
123
+ fn detects_typescript_template_tail_clone_before_member_name_difference() {
124
+ let options = Options {
125
+ min_tokens: 50,
126
+ min_lines: 5,
127
+ ..Options::default()
128
+ };
129
+ let content = r#"
130
+ function first(workUnitAsyncStorage, reportResult) {
131
+ console.log = function (...args: Array<any>) {
132
+ const store = workUnitAsyncStorage.getStore()
133
+ reportResult({
134
+ type: 'console-call',
135
+ method: 'log',
136
+ input: `${store ? '[Store]' : '[No Store]'}: ${args.join(' ')}`,
137
+ })
138
+ }
139
+
140
+ require('next/dist/server/node-environment-extensions/console-exit')
141
+
142
+ workUnitAsyncStorage.run({ type: 'request' } as WorkUnitStore, () => {
143
+ console.log('inside')
144
+ })
145
+ }
146
+
147
+ function second(workUnitAsyncStorage, reportResult) {
148
+ console.error = function (...args: Array<any>) {
149
+ const store = workUnitAsyncStorage.getStore()
150
+ reportResult({
151
+ type: 'console-call',
152
+ method: 'error',
153
+ input: `${store ? '[Store]' : '[No Store]'}: ${args.join(' ')}`,
154
+ })
155
+ }
156
+
157
+ require('next/dist/server/node-environment-extensions/console-exit')
158
+
159
+ workUnitAsyncStorage.run({ type: 'request' } as WorkUnitStore, () => {
160
+ console.error('inside')
161
+ })
162
+ }
163
+ "#;
164
+
165
+ let result = detect(
166
+ vec![source_with_format("console.ts", "typescript", content)],
167
+ &options,
168
+ );
169
+
// Counting the empty line right after the opening `r#"` as fixture line 1
// (assumes 1-based line numbers — confirm against `Location`), the
// `method: 'log'` / `console.log('inside')` lines are 7 and 15, and the
// `method: 'error'` / `console.error('inside')` lines are 24 and 32.
// The test passes if any clone has side A covering 24..=32 and side B covering 7..=15.
170
+ assert!(result.clones.iter().any(|clone| {
171
+ clone.duplication_a.start.line <= 24
172
+ && clone.duplication_a.end.line >= 32
173
+ && clone.duplication_b.start.line <= 7
174
+ && clone.duplication_b.end.line >= 15
175
+ }));
176
+ }
177
+
178
/// Two identical clone records collapse into one; a record with different
/// line ranges survives, and the original order is kept.
#[test]
fn deduplicates_exact_clone_records() {
    let repeated = || clone_with_lines("javascript", "a.js", 1, 5, "b.js", 1, 5);
    let distinct = clone_with_lines("javascript", "a.js", 6, 10, "b.js", 6, 10);
    let mut clones = vec![repeated(), repeated(), distinct];

    dedup_exact_clones(&mut clones);

    assert_eq!(clones.len(), 2);
    let first_survivor = &clones[0];
    let second_survivor = &clones[1];
    assert_eq!(first_survivor.duplication_a.start.line, 1);
    assert_eq!(second_survivor.duplication_a.start.line, 6);
}
192
+
193
+ fn source(path: &str, content: &str) -> SourceFile {
194
+ source_with_format(path, "javascript", content)
195
+ }
196
+
197
+ fn source_with_format(path: &str, format: &str, content: &str) -> SourceFile {
198
+ SourceFile {
199
+ source_id: path.to_string(),
200
+ format: format.to_string(),
201
+ content: content.to_string(),
202
+ }
203
+ }
204
+
205
+ fn clone_with_lines(
206
+ format: &str,
207
+ source_a: &str,
208
+ start_a: usize,
209
+ end_a: usize,
210
+ source_b: &str,
211
+ start_b: usize,
212
+ end_b: usize,
213
+ ) -> CloneMatch {
214
+ CloneMatch {
215
+ format: format.to_string(),
216
+ duplication_a: fragment(source_a, start_a, end_a),
217
+ duplication_b: fragment(source_b, start_b, end_b),
218
+ tokens: 20,
219
+ }
220
+ }
221
+
222
+ fn fragment(source_id: &str, start: usize, end: usize) -> Fragment {
223
+ Fragment {
224
+ source_id: source_id.to_string(),
225
+ start: location(start, 1, start),
226
+ end: location(end, 1, end),
227
+ range: [start, end],
228
+ blame: None,
229
+ }
230
+ }
231
+
232
+ fn location(line: usize, column: usize, position: usize) -> Location {
233
+ Location {
234
+ line,
235
+ column,
236
+ position,
237
+ }
238
+ }
@@ -0,0 +1,265 @@
1
+ use std::collections::HashMap;
2
+
3
+ use rayon::prelude::*;
4
+ use rustc_hash::FxHashSet;
5
+
6
+ use crate::cli::Options;
7
+ use crate::files::SourceFile;
8
+
9
+ mod matching;
10
+ mod model;
11
+ mod prepare;
12
+ mod skip_local;
13
+ mod statistics;
14
+ mod store;
15
+ #[cfg(test)]
16
+ mod tests;
17
+
18
+ #[cfg(test)]
19
+ pub use model::FormatStatistic;
20
+ pub(crate) use model::PreparedSourceDraft;
21
+ pub use model::{
22
+ BlamedLine, BlamedLines, CloneMatch, DetectionResult, Fragment, SkippedClone, SourceSummary,
23
+ StatisticRow, Statistics,
24
+ };
25
+ pub use statistics::{Statistic, clone_lines};
26
+ pub use store::{MemoryStore, MemoryStoreError};
27
+
28
+ use matching::detect_format;
29
+ use model::{FormatId, PreparedSource, SourceId, TokenStream};
30
+ use prepare::{assign_formats, prepare_file_maps};
31
+ use statistics::{finalize_percentages, update_clone_statistics, update_source_statistics};
32
+
33
+ /// Incremental detector facade for native integrations.
34
+ ///
35
+ /// For one-shot detection, prefer `detect_source_files` or
36
+ /// `detect_clones_and_statistics`. Use this type when an integration wants to
37
+ /// keep options and previously submitted in-memory sources together.
38
+ #[derive(Clone, Debug)]
39
+ pub struct Detector {
40
+ options: Options,
41
+ sources: Vec<SourceFile>,
42
+ }
43
+
44
+ impl Detector {
45
+ /// Create an empty detector with the provided options.
46
+ pub fn new(options: Options) -> Self {
47
+ Self {
48
+ options,
49
+ sources: Vec::new(),
50
+ }
51
+ }
52
+
53
+ /// Create a detector preloaded with in-memory sources.
54
+ pub fn with_sources(options: Options, sources: Vec<SourceFile>) -> Self {
55
+ Self { options, sources }
56
+ }
57
+
58
+ /// Return detector options.
59
+ pub fn options(&self) -> &Options {
60
+ &self.options
61
+ }
62
+
63
+ /// Mutably access detector options.
64
+ pub fn options_mut(&mut self) -> &mut Options {
65
+ &mut self.options
66
+ }
67
+
68
+ /// Return sources currently held by this detector.
69
+ pub fn sources(&self) -> &[SourceFile] {
70
+ &self.sources
71
+ }
72
+
73
+ /// Remove all remembered sources.
74
+ pub fn clear(&mut self) {
75
+ self.sources.clear();
76
+ }
77
+
78
+ /// Add one source and return clones involving that new source.
79
+ pub fn detect(
80
+ &mut self,
81
+ source_id: impl Into<String>,
82
+ text: impl Into<String>,
83
+ format: impl Into<String>,
84
+ ) -> Vec<CloneMatch> {
85
+ self.detect_source_file(SourceFile {
86
+ source_id: source_id.into(),
87
+ format: format.into(),
88
+ content: text.into(),
89
+ })
90
+ }
91
+
92
+ /// Add one prepared source and return clones involving that new source.
93
+ pub fn detect_source_file(&mut self, source: SourceFile) -> Vec<CloneMatch> {
94
+ let source_id = source.source_id.clone();
95
+ self.sources.push(source);
96
+ let result = detect(self.sources.clone(), &self.options);
97
+ result
98
+ .clones
99
+ .into_iter()
100
+ .filter(|clone| {
101
+ clone.duplication_a.source_id == source_id
102
+ || clone.duplication_b.source_id == source_id
103
+ })
104
+ .collect()
105
+ }
106
+
107
+ /// Run one-shot detection against the provided prepared sources.
108
+ pub fn detect_files(&self, files: Vec<SourceFile>) -> DetectionResult {
109
+ detect(files, &self.options)
110
+ }
111
+ }
112
+
113
+ pub fn detect(files: Vec<SourceFile>, options: &Options) -> DetectionResult {
114
+ detect_prepared_drafts(prepare_source_drafts(files, options), options)
115
+ }
116
+
117
+ pub(crate) fn prepare_source_drafts(
118
+ files: Vec<SourceFile>,
119
+ options: &Options,
120
+ ) -> Vec<PreparedSourceDraft> {
121
+ files
122
+ .into_par_iter()
123
+ .map(|file| prepare_file_maps(file, options))
124
+ .collect::<Vec<_>>()
125
+ .into_iter()
126
+ .flatten()
127
+ .collect::<Vec<_>>()
128
+ }
129
+
130
+ pub(crate) fn detect_prepared_drafts(
131
+ prepared_drafts: Vec<PreparedSourceDraft>,
132
+ options: &Options,
133
+ ) -> DetectionResult {
134
+ let (format_ids, format_names) = assign_formats(&prepared_drafts);
135
+ let prepared_files = prepared_drafts
136
+ .into_iter()
137
+ .enumerate()
138
+ .map(|(idx, draft)| PreparedSource {
139
+ meta: draft.meta,
140
+ stream: TokenStream {
141
+ source_id: SourceId(idx),
142
+ format_id: format_ids[idx],
143
+ hashes: draft.hashes,
144
+ spans: draft.spans,
145
+ },
146
+ })
147
+ .collect::<Vec<_>>();
148
+
149
+ let mut statistics = Statistics::default();
150
+ let mut sources = Vec::new();
151
+ let mut source_contents = HashMap::new();
152
+ let mut source_indices_by_format = vec![Vec::new(); format_names.len()];
153
+ let include_source_contents = options
154
+ .reporters
155
+ .iter()
156
+ .any(|reporter| matches!(reporter.as_str(), "json" | "xml" | "html" | "consoleFull"));
157
+
158
+ for (idx, prepared) in prepared_files.iter().enumerate() {
159
+ if prepared.stream.spans.is_empty() {
160
+ continue;
161
+ }
162
+ update_source_statistics(
163
+ &mut statistics,
164
+ &prepared.meta.source_id,
165
+ &prepared.meta.format,
166
+ prepared.meta.lines,
167
+ prepared.meta.tokens,
168
+ );
169
+ sources.push(SourceSummary {
170
+ path: prepared.meta.source_id.clone(),
171
+ format: prepared.meta.format.clone(),
172
+ lines: prepared.meta.lines,
173
+ tokens: prepared.meta.tokens,
174
+ });
175
+ if include_source_contents {
176
+ source_contents.insert(
177
+ prepared.meta.source_id.clone(),
178
+ prepared.meta.content.clone(),
179
+ );
180
+ }
181
+ source_indices_by_format[prepared.stream.format_id.0].push(idx);
182
+ }
183
+
184
+ let format_results = source_indices_by_format
185
+ .par_iter()
186
+ .enumerate()
187
+ .map(|(format_id, source_indices)| {
188
+ detect_format(
189
+ FormatId(format_id),
190
+ source_indices,
191
+ &prepared_files,
192
+ &format_names,
193
+ options,
194
+ )
195
+ })
196
+ .collect::<Vec<_>>();
197
+
198
+ let mut clones = Vec::new();
199
+ let mut skipped_clones = Vec::new();
200
+ for format_result in format_results {
201
+ clones.extend(format_result.clones);
202
+ skipped_clones.extend(format_result.skipped_clones);
203
+ }
204
+ dedup_exact_clones(&mut clones);
205
+ for clone in &clones {
206
+ update_clone_statistics(&mut statistics, clone);
207
+ }
208
+
209
+ finalize_percentages(&mut statistics);
210
+
211
+ DetectionResult {
212
+ clones,
213
+ skipped_clones,
214
+ statistics,
215
+ sources,
216
+ source_contents,
217
+ }
218
+ }
219
+
220
+ fn dedup_exact_clones(clones: &mut Vec<CloneMatch>) {
221
+ let mut seen = FxHashSet::default();
222
+ clones.retain(|clone| seen.insert(CloneDedupKey::from(clone)));
223
+ }
224
+
225
+ #[derive(Hash, Eq, PartialEq)]
226
+ struct CloneDedupKey {
227
+ format: String,
228
+ duplication_a: FragmentDedupKey,
229
+ duplication_b: FragmentDedupKey,
230
+ tokens: usize,
231
+ }
232
+
233
+ impl From<&CloneMatch> for CloneDedupKey {
234
+ fn from(clone: &CloneMatch) -> Self {
235
+ Self {
236
+ format: clone.format.clone(),
237
+ duplication_a: FragmentDedupKey::from(&clone.duplication_a),
238
+ duplication_b: FragmentDedupKey::from(&clone.duplication_b),
239
+ tokens: clone.tokens,
240
+ }
241
+ }
242
+ }
243
+
244
+ #[derive(Hash, Eq, PartialEq)]
245
+ struct FragmentDedupKey {
246
+ source_id: String,
247
+ start_line: usize,
248
+ start_column: usize,
249
+ end_line: usize,
250
+ end_column: usize,
251
+ range: [usize; 2],
252
+ }
253
+
254
+ impl From<&Fragment> for FragmentDedupKey {
255
+ fn from(fragment: &Fragment) -> Self {
256
+ Self {
257
+ source_id: fragment.source_id.clone(),
258
+ start_line: fragment.start.line,
259
+ start_column: fragment.start.column,
260
+ end_line: fragment.end.line,
261
+ end_column: fragment.end.column,
262
+ range: fragment.range,
263
+ }
264
+ }
265
+ }