npm - jscpd-rs - Versions diffs - 0.1.0 - Mend

jscpd-rs 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96)

package/CHANGELOG.md +69 -0
package/Cargo.lock +1323 -0
package/Cargo.toml +54 -0
package/LICENSE +21 -0
package/README.md +372 -0
package/docs/api-parity.md +49 -0
package/docs/cloning-plan.md +281 -0
package/docs/compat-baseline.md +535 -0
package/docs/format-porting.md +86 -0
package/docs/junior-task-template.md +62 -0
package/docs/junior-workflow.md +87 -0
package/docs/migrating-from-jscpd.md +193 -0
package/docs/npm-release.md +116 -0
package/docs/public-benchmark-suite.md +81 -0
package/docs/release-checklist.md +200 -0
package/docs/release-decisions.md +103 -0
package/docs/release-readiness.md +51 -0
package/docs/upstream-bugs.md +501 -0
package/docs/upstream-issue-drafts.md +393 -0
package/docs/user-guide.md +309 -0
package/examples/dump_oxc_tokens.rs +112 -0
package/examples/library_api.rs +42 -0
package/npm/bin/jscpd-rs.js +6 -0
package/npm/bin/jscpd-server.js +6 -0
package/npm/lib/run-binary.js +68 -0
package/npm/scripts/postinstall.js +50 -0
package/package.json +53 -0
package/skills/dry-refactoring/SKILL.md +63 -0
package/skills/jscpd/SKILL.md +85 -0
package/src/app.rs +512 -0
package/src/bin/jscpd-server.rs +429 -0
package/src/blame.rs +130 -0
package/src/cli/config.rs +543 -0
package/src/cli/parsing.rs +301 -0
package/src/cli/tests.rs +543 -0
package/src/cli.rs +671 -0
package/src/detector/matching/secondary.rs +387 -0
package/src/detector/matching.rs +274 -0
package/src/detector/model.rs +190 -0
package/src/detector/prepare.rs +71 -0
package/src/detector/skip_local.rs +40 -0
package/src/detector/statistics.rs +138 -0
package/src/detector/store.rs +96 -0
package/src/detector/tests.rs +238 -0
package/src/detector.rs +265 -0
package/src/files/discovery.rs +508 -0
package/src/files/gitignore.rs +203 -0
package/src/files/paths.rs +68 -0
package/src/files/shebang.rs +106 -0
package/src/files/tests.rs +523 -0
package/src/files.rs +25 -0
package/src/formats.rs +570 -0
package/src/lib.rs +433 -0
package/src/main.rs +26 -0
package/src/report/ai.rs +125 -0
package/src/report/badge.rs +238 -0
package/src/report/console.rs +180 -0
package/src/report/console_common.rs +37 -0
package/src/report/console_full.rs +139 -0
package/src/report/csv.rs +65 -0
package/src/report/escape.rs +8 -0
package/src/report/file_output.rs +28 -0
package/src/report/html/assets.rs +47 -0
package/src/report/html.rs +336 -0
package/src/report/json.rs +119 -0
package/src/report/markdown.rs +125 -0
package/src/report/sarif.rs +302 -0
package/src/report/silent.rs +22 -0
package/src/report/source.rs +38 -0
package/src/report/summary.rs +50 -0
package/src/report/test_support.rs +133 -0
package/src/report/threshold.rs +76 -0
package/src/report/xcode.rs +90 -0
package/src/report/xml.rs +119 -0
package/src/report.rs +250 -0
package/src/server/mcp.rs +942 -0
package/src/server.rs +1081 -0
package/src/tokenizer/apex.rs +97 -0
package/src/tokenizer/blocks.rs +532 -0
package/src/tokenizer/embedded.rs +106 -0
package/src/tokenizer/generic.rs +511 -0
package/src/tokenizer/hash.rs +27 -0
package/src/tokenizer/ignore.rs +33 -0
package/src/tokenizer/line_index.rs +33 -0
package/src/tokenizer/markdown.rs +289 -0
package/src/tokenizer/markup_attrs.rs +289 -0
package/src/tokenizer/oxc/fallback.rs +275 -0
package/src/tokenizer/oxc/jsx.rs +168 -0
package/src/tokenizer/oxc/kind.rs +177 -0
package/src/tokenizer/oxc/lexical.rs +67 -0
package/src/tokenizer/oxc.rs +659 -0
package/src/tokenizer/scan.rs +88 -0
package/src/tokenizer/tap.rs +150 -0
package/src/tokenizer/tests.rs +915 -0
package/src/tokenizer.rs +328 -0
package/src/verbose.rs +195 -0

package/src/tokenizer.rs ADDED Viewed

@@ -0,0 +1,328 @@
+mod apex;
+mod blocks;
+mod embedded;
+mod generic;
+mod hash;
+mod ignore;
+mod line_index;
+mod markdown;
+mod markup_attrs;
+mod oxc;
+mod scan;
+mod tap;
+use serde::Serialize;
+use crate::cli::{Mode, Options};
+use generic::tokenize_generic;
+use hash::hash_token;
+use ignore::find_ignore_regions;
+use line_index::LineIndex;
+use oxc::{is_oxc_format, tokenize_oxc_maps};
+use scan::count_prism_whitespace_tokens;
+/// One-based source location used in tokens, fragments, and reports.
+///
+/// Derives `Serialize`, so the field declaration order below is also the
+/// order in which the fields are written by serializers.
+#[derive(Clone, Debug, Serialize)]
+pub struct Location {
+    /// One-based line number.
+    pub line: usize,
+    /// Zero-based column number.
+    pub column: usize,
+    /// Position of the token within its source.
+    ///
+    /// NOTE(review): this was documented as a zero-based byte position, but
+    /// `assign_token_positions` in this module overwrites it with a token
+    /// ordinal (optionally advanced by whitespace-token counts) for every map
+    /// whose positions were not already assigned by its tokenizer. Byte
+    /// offsets are carried by `DetectionToken::range` instead. Confirm which
+    /// meaning consumers of this field expect.
+    pub position: usize,
+}
+/// Detection token after mode filtering and jscpd-compatible hashing.
+///
+/// Tokens are created by `push_token`, which drops comment tokens in weak
+/// mode and tokens overlapping an ignore region before they reach this type.
+#[derive(Clone, Debug)]
+pub struct DetectionToken {
+    /// Stable token hash used by the duplicate detector.
+    ///
+    /// Computed by `hash_token` from the token kind, the token text, and the
+    /// `ignore_case` option.
+    pub hash: u64,
+    /// Start location of the token.
+    pub start: Location,
+    /// End location of the token.
+    pub end: Location,
+    /// Byte range in the original source text, stored as `[start, end]`.
+    ///
+    /// The token text is sliced as `content[start..end]`, so `end` is
+    /// exclusive.
+    pub range: [usize; 2],
+}
+/// Token map for a single detected format block.
+///
+/// Embedded formats can produce more than one map for one source document, for
+/// example script/style blocks extracted from markup-like files.
+#[derive(Clone, Debug)]
+pub struct TokenMap {
+    /// Format name associated with this token map.
+    pub format: String,
+    /// Detection tokens in source order.
+    pub tokens: Vec<DetectionToken>,
+    // Private flag read by `tokenize_maps_for_detection`: when `false`, that
+    // function runs `assign_token_positions` over `tokens` before returning
+    // the map; when `true`, the positions are left as the tokenizer set them.
+    positions_assigned: bool,
+}
+/// Token map associated with a source identifier and line count.
+///
+/// Produced by `Tokenizer::generate_maps`, one per `TokenMap` returned for
+/// the source.
+#[derive(Clone, Debug)]
+pub struct SourceTokenMap {
+    /// Stable source identifier, usually a file path.
+    pub source_id: String,
+    /// Format name associated with this token map.
+    pub format: String,
+    /// Detection tokens in source order.
+    pub tokens: Vec<DetectionToken>,
+    /// Total source lines represented by this map.
+    ///
+    /// As filled in by `Tokenizer::generate_maps`, this is the end line of the
+    /// last token minus the start line of the first token (saturating at
+    /// zero), and `0` when the map has no tokens. A map whose tokens all sit
+    /// on one line therefore reports `0`.
+    pub lines: usize,
+}
+/// Native tokenizer used by the detector.
+///
+/// JS/TS/JSX/TSX formats use Oxc-backed tokenization. Long-tail formats use
+/// the generic native tokenizer unless a format has a dedicated implementation.
+#[derive(Clone, Debug)]
+pub struct Tokenizer {
+    // Detector options forwarded to `tokenize_maps_for_detection` on every
+    // tokenize call; exposed through `options()` / `options_mut()`.
+    options: Options,
+}
+/// `Tokenizer::default()` is the same tokenizer that `Tokenizer::new()` builds.
+impl Default for Tokenizer {
+    fn default() -> Self {
+        Tokenizer::new()
+    }
+}
+impl Tokenizer {
+    /// Build a tokenizer configured with the default detector options.
+    pub fn new() -> Self {
+        Self::with_options(Options::default())
+    }
+    /// Build a tokenizer that uses the options supplied by the caller.
+    pub fn with_options(options: Options) -> Self {
+        Self { options }
+    }
+    /// Borrow the options this tokenizer was configured with.
+    pub fn options(&self) -> &Options {
+        &self.options
+    }
+    /// Borrow the options mutably so they can be adjusted in place.
+    pub fn options_mut(&mut self) -> &mut Options {
+        &mut self.options
+    }
+    /// Tokenize a source string and return only the first token stream.
+    ///
+    /// Returns an empty vector when no token map is produced. Use
+    /// [`Tokenizer::tokenize_maps`] when a format can produce multiple
+    /// embedded token maps.
+    pub fn tokenize(&self, content: &str, format: &str) -> Vec<DetectionToken> {
+        match self.tokenize_maps(content, format).into_iter().next() {
+            Some(first_map) => first_map.tokens,
+            None => Vec::new(),
+        }
+    }
+    /// Tokenize source text into one or more format-specific token maps.
+    pub fn tokenize_maps(&self, content: &str, format: &str) -> Vec<TokenMap> {
+        tokenize_maps_for_detection(content, format, &self.options)
+    }
+    /// Tokenize source text and tag every generated map with `source_id`.
+    ///
+    /// Each resulting map also records its line count as computed by
+    /// `token_map_line_count`.
+    pub fn generate_maps(
+        &self,
+        source_id: impl Into<String>,
+        content: &str,
+        format: &str,
+    ) -> Vec<SourceTokenMap> {
+        let source_id = source_id.into();
+        let token_maps = self.tokenize_maps(content, format);
+        let mut source_maps = Vec::with_capacity(token_maps.len());
+        for token_map in token_maps {
+            // Measure the line span before the tokens are moved out of the map.
+            let lines = token_map_line_count(&token_map.tokens);
+            source_maps.push(SourceTokenMap {
+                source_id: source_id.clone(),
+                format: token_map.format,
+                tokens: token_map.tokens,
+                lines,
+            });
+        }
+        source_maps
+    }
+}
+/// Internal classification of a token.
+///
+/// The kind is passed to `hash_token` together with the token text, and
+/// `push_token` uses it to drop comments in weak mode.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum TokenKind {
+    /// Comment text; `push_token` discards these when the mode is `Mode::Weak`.
+    Comment,
+    Constant,
+    /// Whitespace other than a line feed, as returned by
+    /// `scan_whitespace_token`.
+    Empty,
+    Keyword,
+    /// A single `\n` byte, as returned by `scan_whitespace_token`.
+    NewLine,
+    Number,
+    Operator,
+    Punctuation,
+    String,
+    Default,
+}
+/// Byte range into the source text being tokenized.
+///
+/// `TokenContext::slice` reads it as `content[start..end]`, so `start` is
+/// inclusive and `end` is exclusive.
+#[derive(Clone, Copy)]
+struct ByteSpan {
+    // Inclusive start byte offset.
+    start: usize,
+    // Exclusive end byte offset.
+    end: usize,
+}
+/// Borrowed state shared by the token-pushing helpers in this module.
+struct TokenContext<'a> {
+    // Full source text; `ByteSpan` offsets index into this string.
+    content: &'a str,
+    // Detector options (`mode` and `ignore_case` are read by `push_token`).
+    options: &'a Options,
+    // `[start, end]` byte ranges; tokens overlapping any of them are dropped.
+    ignore_regions: &'a [[usize; 2]],
+}
+impl TokenContext<'_> {
+    /// Return the source text covered by `span`.
+    ///
+    /// Panics, like any `str` range index, if the span is out of bounds or
+    /// does not fall on character boundaries.
+    fn slice(&self, span: ByteSpan) -> &str {
+        let ByteSpan { start, end } = span;
+        &self.content[start..end]
+    }
+    /// Report whether `span` intersects any configured ignore region.
+    ///
+    /// Both ranges are treated as half-open: spans that merely touch a region
+    /// at its boundary do not count as overlapping.
+    fn overlaps_ignore_region(&self, span: ByteSpan) -> bool {
+        for region in self.ignore_regions {
+            let [region_start, region_end] = *region;
+            if span.start < region_end && span.end > region_start {
+                return true;
+            }
+        }
+        false
+    }
+}
+/// Test-only helper: tokenize `content` and keep just the first map's tokens.
+///
+/// Yields an empty vector when no token map is produced.
+#[cfg(test)]
+fn tokenize_for_detection(content: &str, format: &str, options: &Options) -> Vec<DetectionToken> {
+    let first_map = tokenize_maps_for_detection(content, format, options)
+        .into_iter()
+        .next();
+    match first_map {
+        Some(map) => map.tokens,
+        None => Vec::new(),
+    }
+}
+/// Tokenize `content` as `format` and return one token map per detected block.
+///
+/// Ignore regions are computed once from `content` and handed to whichever
+/// tokenizer handles the format: dedicated tokenizers for `markdown`, `apex`
+/// and `tap`, the block tokenizer for `markup`/`vue`/`svelte`/`astro`, the
+/// Oxc tokenizer for formats accepted by `is_oxc_format`, and the generic
+/// tokenizer for everything else. Maps that did not assign token positions
+/// themselves get them from `assign_token_positions` before being returned.
+pub fn tokenize_maps_for_detection(
+    content: &str,
+    format: &str,
+    options: &Options,
+) -> Vec<TokenMap> {
+    let ignore_regions = find_ignore_regions(content, options);
+    let mut maps = match format {
+        "markdown" => markdown::tokenize_maps(content, options, &ignore_regions),
+        "apex" => apex::tokenize_maps(content, options, &ignore_regions),
+        "tap" => tap::tokenize_maps(content, options, &ignore_regions),
+        "markup" | "vue" | "svelte" | "astro" => {
+            blocks::tokenize_maps(content, format, options, &ignore_regions)
+        }
+        other if is_oxc_format(other) => {
+            tokenize_oxc_maps(content, other, options, &ignore_regions)
+        }
+        other => {
+            let tokens = tokenize_generic(content, other, options, &ignore_regions);
+            vec![TokenMap {
+                format: other.to_string(),
+                tokens,
+                positions_assigned: false,
+            }]
+        }
+    };
+    maps.iter_mut()
+        .filter(|map| !map.positions_assigned)
+        .for_each(|map| assign_token_positions(content, &map.format, options, &mut map.tokens));
+    maps
+}
+/// Line span covered by `tokens`: the last token's end line minus the first
+/// token's start line, saturating at zero. An empty slice yields `0`.
+fn token_map_line_count(tokens: &[DetectionToken]) -> usize {
+    tokens
+        .first()
+        .zip(tokens.last())
+        .map_or(0, |(first, last)| {
+            last.end.line.saturating_sub(first.start.line)
+        })
+}
+/// Fill in `start.position` / `end.position` for every token in `tokens`.
+///
+/// For `javascript`, `typescript`, `jsx` and `tsx`, when the `json` reporter is
+/// enabled or output is not silent, positions also count the whitespace
+/// tokens reported by `count_prism_whitespace_tokens` for the gaps between
+/// tokens. In every other case the position is simply the token's index in
+/// the slice.
+fn assign_token_positions(
+    content: &str,
+    format: &str,
+    options: &Options,
+    tokens: &mut [DetectionToken],
+) {
+    let needs_report_positions =
+        options.reporters.iter().any(|reporter| reporter == "json") || !options.silent;
+    let is_script_format = matches!(format, "javascript" | "typescript" | "jsx" | "tsx");
+    if needs_report_positions && is_script_format {
+        let mut next_position = 0usize;
+        // Highest byte offset covered so far; gaps are measured from here.
+        let mut covered_until = 0usize;
+        for token in tokens.iter_mut() {
+            let [token_start, token_end] = token.range;
+            if token_start > covered_until {
+                next_position +=
+                    count_prism_whitespace_tokens(content, covered_until, token_start);
+            }
+            token.start.position = next_position;
+            token.end.position = next_position;
+            next_position += 1;
+            covered_until = covered_until.max(token_end);
+        }
+    } else {
+        for (ordinal, token) in tokens.iter_mut().enumerate() {
+            token.start.position = ordinal;
+            token.end.position = ordinal;
+        }
+    }
+}
+/// Hash the text under `span` and append it to `tokens` as a detection token.
+///
+/// Nothing is appended when the token is a comment in weak mode, or when the
+/// span overlaps one of the context's ignore regions.
+fn push_token(
+    tokens: &mut Vec<DetectionToken>,
+    context: &TokenContext<'_>,
+    kind: TokenKind,
+    span: ByteSpan,
+    start: Location,
+    end: Location,
+) {
+    let weak_mode_comment = context.options.mode == Mode::Weak && kind == TokenKind::Comment;
+    if weak_mode_comment || context.overlaps_ignore_region(span) {
+        return;
+    }
+    let text = context.slice(span);
+    let hash = hash_token(kind, text, context.options.ignore_case);
+    let token = DetectionToken {
+        hash,
+        start,
+        end,
+        range: [span.start, span.end],
+    };
+    tokens.push(token);
+}
+/// Emit whitespace tokens for `span` when the detector runs in strict mode.
+///
+/// Outside `Mode::Strict` this is a no-op. In strict mode the span is walked
+/// left to right: each step asks `scan_whitespace_token` for the next
+/// whitespace token (a single line feed, or a run of other whitespace) and
+/// forwards it to `push_token` with locations looked up in `line_index`.
+fn push_strict_whitespace_tokens(
+    tokens: &mut Vec<DetectionToken>,
+    context: &TokenContext<'_>,
+    span: ByteSpan,
+    line_index: &LineIndex,
+) {
+    if context.options.mode != Mode::Strict {
+        return;
+    }
+    let mut start = span.start;
+    while start < span.end {
+        let (end, kind) = scan_whitespace_token(context.content, start, span.end);
+        push_token(
+            tokens,
+            context,
+            kind,
+            ByteSpan { start, end },
+            line_index.location(start),
+            line_index.location(end),
+        );
+        start = if end > start {
+            end
+        } else {
+            // The scanner made no progress, which happens when `start` sits on
+            // a non-whitespace character. Step over the whole character rather
+            // than a single byte: landing inside a multi-byte UTF-8 sequence
+            // would make the next scan slice the string off a char boundary
+            // and panic. For ASCII this is still `start + 1`.
+            let char_width = context.content[start..]
+                .chars()
+                .next()
+                .map_or(1, char::len_utf8);
+            start + char_width
+        };
+    }
+}
+/// Scan one whitespace token beginning at byte offset `start`.
+///
+/// Returns the exclusive end offset together with the token kind:
+/// a line feed at `start` yields a one-byte `TokenKind::NewLine`; otherwise
+/// the scan consumes consecutive non-newline whitespace characters that begin
+/// before `limit` and yields `TokenKind::Empty`. The returned end equals
+/// `start` when the character at `start` is not whitespace.
+///
+/// Panics if `start` is out of bounds for `content`.
+fn scan_whitespace_token(content: &str, start: usize, limit: usize) -> (usize, TokenKind) {
+    if content.as_bytes()[start] == b'\n' {
+        return (start + 1, TokenKind::NewLine);
+    }
+    let mut cursor = start;
+    while cursor < limit {
+        match content[cursor..].chars().next() {
+            Some(ch) if ch != '\n' && ch.is_whitespace() => cursor += ch.len_utf8(),
+            // End of text, a line feed, or a non-whitespace character.
+            _ => break,
+        }
+    }
+    (cursor, TokenKind::Empty)
+}
+#[cfg(test)]
+mod tests;

package/src/verbose.rs ADDED Viewed

@@ -0,0 +1,195 @@
+use std::time::{SystemTime, UNIX_EPOCH};
+use serde::Serialize;
+use crate::detector::{CloneMatch, DetectionResult, Fragment, SkippedClone};
+use crate::tokenizer::Location;
+// ANSI SGR escape sequences used to colour the verbose event output.
+// Foreground colour 90 (bright black, shown as grey) for event details.
+const GREY: &str = "\x1b[90m";
+// Foreground colour 33 (yellow) for event names.
+const YELLOW: &str = "\x1b[33m";
+// Code 39 restores the default foreground colour only.
+const RESET_COLOR: &str = "\x1b[39m";
+/// Print the verbose detection event log for `result` to standard output.
+///
+/// Clone events are stamped starting from the current wall-clock time in
+/// milliseconds.
+pub fn write_detection_events(result: &DetectionResult) {
+    let rendered = detection_events_output(result, current_time_millis());
+    print!("{}", rendered);
+}
+/// Render the verbose detection event log for `result` as a single string.
+///
+/// Sources are visited from last to first. Each one gets a `START_DETECTION`
+/// header followed by every not-yet-reported clone and skipped clone whose
+/// format and `duplication_a` source match it. Clones and skipped clones that
+/// match no source are appended at the end. A clone's `foundDate` is
+/// `found_date` plus the clone's index in `result.clones`.
+fn detection_events_output(result: &DetectionResult, found_date: u128) -> String {
+    let mut clone_reported = vec![false; result.clones.len()];
+    let mut skipped_reported = vec![false; result.skipped_clones.len()];
+    let mut rendered = String::new();
+    for source in result.sources.iter().rev() {
+        rendered.push_str(&format!("{YELLOW}START_DETECTION{RESET_COLOR}\n"));
+        rendered.push_str(&format!(
+            "{GREY}Start detection for source id={} format={}{RESET_COLOR}\n",
+            source.path, source.format
+        ));
+        let clones = result.clones.iter().zip(clone_reported.iter_mut());
+        for (idx, (clone, reported)) in clones.enumerate() {
+            let belongs_to_source = clone.format == source.format
+                && clone.duplication_a.source_id == source.path;
+            if belongs_to_source && !*reported {
+                push_clone_found(&mut rendered, clone, found_date + idx as u128);
+                *reported = true;
+            }
+        }
+        let skipped_clones = result
+            .skipped_clones
+            .iter()
+            .zip(skipped_reported.iter_mut());
+        for (skipped, reported) in skipped_clones {
+            let belongs_to_source = skipped.clone.format == source.format
+                && skipped.clone.duplication_a.source_id == source.path;
+            if belongs_to_source && !*reported {
+                push_clone_skipped(&mut rendered, skipped);
+                *reported = true;
+            }
+        }
+    }
+    // Anything left over matched no listed source; report it after the sources.
+    for (idx, clone) in result.clones.iter().enumerate() {
+        if clone_reported[idx] {
+            continue;
+        }
+        push_clone_found(&mut rendered, clone, found_date + idx as u128);
+    }
+    for (skipped, reported) in result.skipped_clones.iter().zip(&skipped_reported) {
+        if !*reported {
+            push_clone_skipped(&mut rendered, skipped);
+        }
+    }
+    rendered
+}
+/// Append a `CLONE_FOUND` event for `clone` to `output`.
+///
+/// The header line is followed by the clone rendered as pretty-printed JSON,
+/// one grey-coloured line per JSON line. If serialization fails, only the
+/// header is written.
+fn push_clone_found(output: &mut String, clone: &CloneMatch, found_date: u128) {
+    output.push_str(&format!("{YELLOW}CLONE_FOUND{RESET_COLOR}\n"));
+    let pretty = match serde_json::to_string_pretty(&VerboseClone::new(clone, found_date)) {
+        Ok(pretty) => pretty,
+        Err(_) => return,
+    };
+    for json_line in pretty.lines() {
+        output.extend([GREY, json_line, RESET_COLOR, "\n"]);
+    }
+}
+/// Append a `CLONE_SKIPPED` event to `output`.
+///
+/// The detail line joins the skip messages with single spaces.
+fn push_clone_skipped(output: &mut String, skipped: &SkippedClone) {
+    let reason = skipped.message.join(" ");
+    let header = format!("{YELLOW}CLONE_SKIPPED{RESET_COLOR}\n");
+    let detail = format!("{GREY}Clone skipped: {}{RESET_COLOR}\n", reason);
+    output.push_str(&header);
+    output.push_str(&detail);
+}
+/// Milliseconds elapsed since the Unix epoch according to the system clock.
+///
+/// Returns `0` if the clock reports a time before the epoch.
+fn current_time_millis() -> u128 {
+    match SystemTime::now().duration_since(UNIX_EPOCH) {
+        Ok(since_epoch) => since_epoch.as_millis(),
+        Err(_) => 0,
+    }
+}
+/// Serializable view of a clone for the verbose `CLONE_FOUND` event.
+///
+/// Field names are serialized in camelCase (`foundDate`, `duplicationA`,
+/// `duplicationB`). The clone's token count is intentionally not part of
+/// this view; the module's test asserts that no `"tokens"` key is emitted.
+#[derive(Serialize)]
+#[serde(rename_all = "camelCase")]
+struct VerboseClone<'a> {
+    // Format name borrowed from the clone.
+    format: &'a str,
+    // Timestamp value supplied by the caller of `VerboseClone::new`.
+    found_date: u128,
+    // First fragment of the duplicated pair.
+    duplication_a: VerboseFragment<'a>,
+    // Second fragment of the duplicated pair.
+    duplication_b: VerboseFragment<'a>,
+}
+impl<'a> VerboseClone<'a> {
+    /// Build the serializable view of `clone`, stamped with `found_date`.
+    fn new(clone: &'a CloneMatch, found_date: u128) -> Self {
+        let duplication_a = VerboseFragment::new(&clone.duplication_a);
+        let duplication_b = VerboseFragment::new(&clone.duplication_b);
+        VerboseClone {
+            format: clone.format.as_str(),
+            found_date,
+            duplication_a,
+            duplication_b,
+        }
+    }
+}
+/// Serializable view of one side of a clone for verbose output.
+///
+/// Only `source_id` is renamed (to `sourceId`); the other fields keep their
+/// Rust names when serialized.
+#[derive(Serialize)]
+struct VerboseFragment<'a> {
+    // Identifier of the source the fragment belongs to.
+    #[serde(rename = "sourceId")]
+    source_id: &'a str,
+    // Location where the fragment starts.
+    start: &'a Location,
+    // Location where the fragment ends.
+    end: &'a Location,
+    // The fragment's `range` pair, copied from the fragment as-is.
+    range: [usize; 2],
+}
+impl<'a> VerboseFragment<'a> {
+    /// Build the serializable view of `fragment`, borrowing its id and
+    /// locations and copying its range.
+    fn new(fragment: &'a Fragment) -> Self {
+        let source_id: &'a str = &fragment.source_id;
+        let (start, end) = (&fragment.start, &fragment.end);
+        VerboseFragment {
+            source_id,
+            start,
+            end,
+            range: fragment.range,
+        }
+    }
+}
+// Unit tests for the verbose event renderer.
+#[cfg(test)]
+mod tests {
+    use std::collections::HashMap;
+    use crate::detector::{
+        CloneMatch, DetectionResult, Fragment, SkippedClone, SourceSummary, Statistics,
+    };
+    use crate::tokenizer::Location;
+    use super::detection_events_output;
+    // Renders one reported clone and one skipped clone for a single source and
+    // checks the event names, the detail lines, and the JSON keys emitted.
+    #[test]
+    fn verbose_events_match_upstream_event_shape() {
+        let result = DetectionResult {
+            clones: vec![CloneMatch {
+                format: "javascript".to_string(),
+                duplication_a: fragment("src/a.js", 2),
+                duplication_b: fragment("src/b.js", 8),
+                tokens: 6,
+            }],
+            skipped_clones: vec![SkippedClone {
+                clone: CloneMatch {
+                    format: "javascript".to_string(),
+                    duplication_a: fragment("src/a.js", 20),
+                    duplication_b: fragment("src/b.js", 30),
+                    tokens: 3,
+                },
+                message: vec!["Lines of code less than limit (2 < 5)".to_string()],
+            }],
+            statistics: Statistics::default(),
+            sources: vec![SourceSummary {
+                path: "src/a.js".to_string(),
+                format: "javascript".to_string(),
+                lines: 10,
+                tokens: 20,
+            }],
+            source_contents: HashMap::new(),
+        };
+        // A fixed timestamp keeps the rendered `foundDate` deterministic.
+        let output = detection_events_output(&result, 123);
+        assert!(output.contains("START_DETECTION"));
+        assert!(output.contains("Start detection for source id=src/a.js format=javascript"));
+        assert!(output.contains("CLONE_FOUND"));
+        assert!(output.contains("CLONE_SKIPPED"));
+        assert!(output.contains("Clone skipped: Lines of code less than limit (2 < 5)"));
+        // The only clone has index 0, so its date is the base value unchanged.
+        assert!(output.contains(r#""foundDate": 123"#));
+        assert!(output.contains(r#""sourceId": "src/a.js""#));
+        // The verbose clone view must not expose the token count.
+        assert!(!output.contains(r#""tokens""#));
+    }
+    // Builds a fragment starting at `line` and ending three lines later, with
+    // a fixed `[0, 6]` range and no blame data.
+    fn fragment(source_id: &str, line: usize) -> Fragment {
+        Fragment {
+            source_id: source_id.to_string(),
+            start: location(line, 1, 0),
+            end: location(line + 3, 1, 6),
+            range: [0, 6],
+            blame: None,
+        }
+    }
+    // Shorthand constructor for a `Location`.
+    fn location(line: usize, column: usize, position: usize) -> Location {
+        Location {
+            line,
+            column,
+            position,
+        }
+    }
+}