npm - jscpd-rs - Versions diffs - 0.1.0 - Mend

jscpd-rs 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

package/CHANGELOG.md +69 -0
package/Cargo.lock +1323 -0
package/Cargo.toml +54 -0
package/LICENSE +21 -0
package/README.md +372 -0
package/docs/api-parity.md +49 -0
package/docs/cloning-plan.md +281 -0
package/docs/compat-baseline.md +535 -0
package/docs/format-porting.md +86 -0
package/docs/junior-task-template.md +62 -0
package/docs/junior-workflow.md +87 -0
package/docs/migrating-from-jscpd.md +193 -0
package/docs/npm-release.md +116 -0
package/docs/public-benchmark-suite.md +81 -0
package/docs/release-checklist.md +200 -0
package/docs/release-decisions.md +103 -0
package/docs/release-readiness.md +51 -0
package/docs/upstream-bugs.md +501 -0
package/docs/upstream-issue-drafts.md +393 -0
package/docs/user-guide.md +309 -0
package/examples/dump_oxc_tokens.rs +112 -0
package/examples/library_api.rs +42 -0
package/npm/bin/jscpd-rs.js +6 -0
package/npm/bin/jscpd-server.js +6 -0
package/npm/lib/run-binary.js +68 -0
package/npm/scripts/postinstall.js +50 -0
package/package.json +53 -0
package/skills/dry-refactoring/SKILL.md +63 -0
package/skills/jscpd/SKILL.md +85 -0
package/src/app.rs +512 -0
package/src/bin/jscpd-server.rs +429 -0
package/src/blame.rs +130 -0
package/src/cli/config.rs +543 -0
package/src/cli/parsing.rs +301 -0
package/src/cli/tests.rs +543 -0
package/src/cli.rs +671 -0
package/src/detector/matching/secondary.rs +387 -0
package/src/detector/matching.rs +274 -0
package/src/detector/model.rs +190 -0
package/src/detector/prepare.rs +71 -0
package/src/detector/skip_local.rs +40 -0
package/src/detector/statistics.rs +138 -0
package/src/detector/store.rs +96 -0
package/src/detector/tests.rs +238 -0
package/src/detector.rs +265 -0
package/src/files/discovery.rs +508 -0
package/src/files/gitignore.rs +203 -0
package/src/files/paths.rs +68 -0
package/src/files/shebang.rs +106 -0
package/src/files/tests.rs +523 -0
package/src/files.rs +25 -0
package/src/formats.rs +570 -0
package/src/lib.rs +433 -0
package/src/main.rs +26 -0
package/src/report/ai.rs +125 -0
package/src/report/badge.rs +238 -0
package/src/report/console.rs +180 -0
package/src/report/console_common.rs +37 -0
package/src/report/console_full.rs +139 -0
package/src/report/csv.rs +65 -0
package/src/report/escape.rs +8 -0
package/src/report/file_output.rs +28 -0
package/src/report/html/assets.rs +47 -0
package/src/report/html.rs +336 -0
package/src/report/json.rs +119 -0
package/src/report/markdown.rs +125 -0
package/src/report/sarif.rs +302 -0
package/src/report/silent.rs +22 -0
package/src/report/source.rs +38 -0
package/src/report/summary.rs +50 -0
package/src/report/test_support.rs +133 -0
package/src/report/threshold.rs +76 -0
package/src/report/xcode.rs +90 -0
package/src/report/xml.rs +119 -0
package/src/report.rs +250 -0
package/src/server/mcp.rs +942 -0
package/src/server.rs +1081 -0
package/src/tokenizer/apex.rs +97 -0
package/src/tokenizer/blocks.rs +532 -0
package/src/tokenizer/embedded.rs +106 -0
package/src/tokenizer/generic.rs +511 -0
package/src/tokenizer/hash.rs +27 -0
package/src/tokenizer/ignore.rs +33 -0
package/src/tokenizer/line_index.rs +33 -0
package/src/tokenizer/markdown.rs +289 -0
package/src/tokenizer/markup_attrs.rs +289 -0
package/src/tokenizer/oxc/fallback.rs +275 -0
package/src/tokenizer/oxc/jsx.rs +168 -0
package/src/tokenizer/oxc/kind.rs +177 -0
package/src/tokenizer/oxc/lexical.rs +67 -0
package/src/tokenizer/oxc.rs +659 -0
package/src/tokenizer/scan.rs +88 -0
package/src/tokenizer/tap.rs +150 -0
package/src/tokenizer/tests.rs +915 -0
package/src/tokenizer.rs +328 -0
package/src/verbose.rs +195 -0

package/src/tokenizer/oxc.rs ADDED Viewed

@@ -0,0 +1,659 @@
+use std::path::Path;
+use oxc_allocator::Allocator;
+use oxc_parser::{Kind, Parser, Token as OxcToken, config::TokensParserConfig};
+use oxc_span::SourceType;
+use crate::cli::{Mode, Options};
+use super::scan::{has_code_in_gap, scan_block_comment, scan_line_comment};
+use super::{
+    ByteSpan, DetectionToken, LineIndex, TokenContext, TokenKind, TokenMap, hash_token,
+    push_strict_whitespace_tokens, push_token,
+};
+mod fallback;
+mod jsx;
+mod kind;
+mod lexical;
+use fallback::tokenize_js_like_range;
+use jsx::{jsx_attribute_script_groups, tokenize_jsx_attribute_scripts};
+use kind::oxc_token_kind;
+/// Minimal copy of a parser token: only its kind and its byte span.
+///
+/// Instances are built by `raw_oxc_token`, which caps both span offsets at the
+/// length of the source text.
+#[derive(Clone, Copy)]
+struct RawOxcToken {
+    // Token kind as reported by the oxc parser.
+    kind: Kind,
+    // Start and end byte offsets of the token in the source text.
+    span: ByteSpan,
+}
+/// Reports whether `format` is one of the formats handled by this tokenizer:
+/// `javascript`, `typescript`, `jsx`, `tsx`, or `json`. The comparison is exact
+/// and case-sensitive.
+pub(super) fn is_oxc_format(format: &str) -> bool {
+    const OXC_FORMATS: [&str; 5] = ["javascript", "typescript", "jsx", "tsx", "json"];
+    OXC_FORMATS.contains(&format)
+}
+/// Tokenizes `content` with the oxc parser and returns the resulting token maps.
+///
+/// The first map always carries `format` and the tokens collected from the whole
+/// input (`positions_assigned: false`). For `jsx`/`tsx` a second map with format
+/// `javascript` (`positions_assigned: true`) is appended when
+/// `tokenize_jsx_attribute_scripts` yields any tokens.
+///
+/// `ignore_regions` is stored in the `TokenContext` that the push helpers consult
+/// before emitting a token.
+pub(super) fn tokenize_oxc_maps(
+    content: &str,
+    format: &str,
+    options: &Options,
+    ignore_regions: &[[usize; 2]],
+) -> Vec<TokenMap> {
+    let context = TokenContext {
+        content,
+        options,
+        ignore_regions,
+    };
+    let allocator = Allocator::new();
+    let source_type = source_type_for_format(format);
+    let parser_return = Parser::new(&allocator, content, source_type)
+        .with_config(TokensParserConfig)
+        .parse();
+    let line_index = LineIndex::new(content);
+    // Capacity guess: one token per six bytes of source.
+    let mut tokens = Vec::with_capacity(content.len().saturating_div(6));
+    // End offset of the furthest source byte already handled.
+    let mut previous_end = 0usize;
+    let parser_tokens = parser_return.tokens;
+    // JSX/TSX only: keep a clamped copy of every parser token for the
+    // attribute-script pass at the end of this function.
+    let raw_jsx_tokens = if matches!(format, "jsx" | "tsx") {
+        Some(
+            parser_tokens
+                .iter()
+                .map(|token| raw_oxc_token(token, content.len()))
+                .collect::<Vec<_>>(),
+        )
+    } else {
+        None
+    };
+    let jsx_script_groups = if let Some(parser_tokens) = raw_jsx_tokens.as_deref() {
+        jsx_attribute_script_groups(parser_tokens)
+    } else {
+        Vec::new()
+    };
+    let mut idx = 0usize;
+    // Raised on every TemplateHead and lowered on every TemplateTail (see the
+    // match below); a non-zero value changes how gap whitespace is emitted.
+    let mut template_expression_depth = 0usize;
+    while idx < parser_tokens.len() {
+        let token = raw_oxc_token(&parser_tokens[idx], content.len());
+        let start_byte = token.span.start;
+        let mut end_byte = token.span.end;
+        // Source text between the previously handled end and this token is
+        // handed to the gap scanner.
+        if start_byte > previous_end {
+            push_comments_in_gap(
+                &mut tokens,
+                &context,
+                previous_end,
+                start_byte,
+                &line_index,
+                template_expression_depth > 0,
+            );
+        }
+        // Fold a run of directly adjacent `>` tokens into one span.
+        if token.kind == Kind::RAngle {
+            while idx + 1 < parser_tokens.len() {
+                let next = raw_oxc_token(&parser_tokens[idx + 1], content.len());
+                if next.kind != Kind::RAngle || next.span.start != end_byte {
+                    break;
+                }
+                idx += 1;
+                end_byte = next.span.end;
+            }
+        }
+        let span = ByteSpan {
+            start: start_byte,
+            end: end_byte,
+        };
+        // A lone `/` that `scan_regex_literal_end` accepts as the start of a
+        // regex literal is emitted as a single String token covering the literal.
+        if token.kind == Kind::Slash
+            && context.slice(span) == "/"
+            && let Some(regex_end) = scan_regex_literal_end(content, start_byte, content.len())
+        {
+            push_token_part(
+                &mut tokens,
+                &context,
+                TokenKind::String,
+                ByteSpan {
+                    start: start_byte,
+                    end: regex_end,
+                },
+                &line_index,
+            );
+            previous_end = previous_end.max(regex_end);
+            idx += 1;
+            // Skip every parser token that starts before the end of the literal.
+            while idx < parser_tokens.len() {
+                let skipped = raw_oxc_token(&parser_tokens[idx], content.len());
+                if skipped.span.start >= regex_end {
+                    break;
+                }
+                previous_end = previous_end.max(skipped.span.end);
+                idx += 1;
+            }
+            continue;
+        }
+        push_oxc_token(&mut tokens, &context, token.kind, span, &line_index);
+        match token.kind {
+            Kind::TemplateHead => template_expression_depth += 1,
+            Kind::TemplateTail => {
+                template_expression_depth = template_expression_depth.saturating_sub(1);
+            }
+            _ => {}
+        }
+        previous_end = previous_end.max(end_byte);
+        idx += 1;
+    }
+    // Source text left after the last parser token: run the fallback tokenizer
+    // when `has_code_in_gap` reports code there, otherwise scan it as a gap.
+    if previous_end < content.len() {
+        if has_code_in_gap(content, previous_end, content.len()) {
+            tokenize_js_like_range(
+                &mut tokens,
+                &context,
+                previous_end,
+                content.len(),
+                &line_index,
+            );
+        } else {
+            push_comments_in_gap(
+                &mut tokens,
+                &context,
+                previous_end,
+                content.len(),
+                &line_index,
+                false,
+            );
+        }
+    }
+    let mut maps = vec![TokenMap {
+        format: format.to_string(),
+        tokens,
+        positions_assigned: false,
+    }];
+    // JSX/TSX only: add a separate `javascript` map for attribute scripts.
+    if matches!(format, "jsx" | "tsx") {
+        let parser_tokens = raw_jsx_tokens.as_deref().unwrap_or_default();
+        let embedded = tokenize_jsx_attribute_scripts(
+            parser_tokens,
+            &jsx_script_groups,
+            &context,
+            &line_index,
+        );
+        if !embedded.is_empty() {
+            maps.push(TokenMap {
+                format: "javascript".to_string(),
+                tokens: embedded,
+                positions_assigned: true,
+            });
+        }
+    }
+    maps
+}
+/// Converts a parser token into a `RawOxcToken`, capping both span offsets at
+/// `content_len`.
+fn raw_oxc_token(token: &OxcToken, content_len: usize) -> RawOxcToken {
+    let kind = token.kind();
+    let clamp = |offset: usize| offset.min(content_len);
+    let span = ByteSpan {
+        start: clamp(token.start() as usize),
+        end: clamp(token.end() as usize),
+    };
+    RawOxcToken { kind, span }
+}
+/// Picks the oxc `SourceType` for `format` by mapping it to a representative
+/// file name and resolving that path.
+///
+/// `javascript` and `jsx` both resolve through `input.jsx`; any format without
+/// its own entry resolves through `input.js`. When the path lookup fails, the
+/// default source type is returned.
+fn source_type_for_format(format: &str) -> SourceType {
+    let filename = match format {
+        "javascript" | "jsx" => "input.jsx",
+        "typescript" => "input.ts",
+        "tsx" => "input.tsx",
+        _ => "input.js",
+    };
+    match SourceType::from_path(Path::new(filename)) {
+        Ok(source_type) => source_type,
+        Err(_) => SourceType::default(),
+    }
+}
+/// Appends the detection token(s) for one parser token covering `span`.
+///
+/// Handling order:
+/// 1. Empty spans are ignored.
+/// 2. Text starting with `//`, `#!`, `/*` or `<!--` is routed to the comment or
+///    hashbang helpers; comments are dropped when the mode is `Mode::Weak`.
+/// 3. `Kind::Skip` emits nothing. JSX text, identifiers containing `-`, and
+///    regex tokens in a position `regex_literal_allowed_at` rejects are
+///    re-tokenized with `tokenize_js_like_range`.
+/// 4. Template pieces and `?.` are split into several parts.
+/// 5. Anything else becomes a single token, unless it overlaps an ignore region.
+fn push_oxc_token(
+    tokens: &mut Vec<DetectionToken>,
+    context: &TokenContext<'_>,
+    kind: Kind,
+    span: ByteSpan,
+    line_index: &LineIndex,
+) {
+    if span.start >= span.end {
+        return;
+    }
+    let text = context.slice(span);
+    let keep_comments = context.options.mode != Mode::Weak;
+
+    // Comment-like and hashbang text is recognised by prefix, whatever the kind.
+    if text.starts_with("//") {
+        if keep_comments {
+            push_line_comment_tokens(tokens, context, span, line_index);
+        }
+        return;
+    }
+    if text.starts_with("#!") {
+        push_hashbang_tokens(tokens, context, span, line_index);
+        return;
+    }
+    if text.starts_with("/*") || text.starts_with("<!--") {
+        if keep_comments {
+            push_comment_token(tokens, context, span, line_index);
+        }
+        return;
+    }
+
+    // Kinds that are either dropped or handed to the fallback tokenizer.
+    let use_fallback = match kind {
+        Kind::Skip => return,
+        Kind::JSXText => true,
+        Kind::Ident => text.contains('-'),
+        Kind::RegExp => !regex_literal_allowed_at(context.content, span.start),
+        _ => false,
+    };
+    if use_fallback {
+        tokenize_js_like_range(tokens, context, span.start, span.end, line_index);
+        return;
+    }
+
+    // Kinds that are split into several detection tokens.
+    match kind {
+        Kind::TemplateHead | Kind::TemplateMiddle | Kind::TemplateTail => {
+            push_template_token_parts(tokens, context, kind, span, line_index);
+            return;
+        }
+        Kind::QuestionDot if text == "?." => {
+            let dot_start = span.start + 1;
+            let question = ByteSpan {
+                start: span.start,
+                end: dot_start,
+            };
+            let dot = ByteSpan {
+                start: dot_start,
+                end: span.end,
+            };
+            push_token_part(tokens, context, TokenKind::Operator, question, line_index);
+            push_token_part(tokens, context, TokenKind::Punctuation, dot, line_index);
+            return;
+        }
+        _ => {}
+    }
+
+    if context.overlaps_ignore_region(span) {
+        return;
+    }
+    let hash = hash_token(
+        oxc_token_kind(kind, text),
+        text,
+        context.options.ignore_case,
+    );
+    tokens.push(DetectionToken {
+        hash,
+        start: line_index.location(span.start),
+        end: line_index.location(span.end),
+        range: [span.start, span.end],
+    });
+}
+/// Scans a regex literal that opens with the `/` at byte offset `slash_start`.
+///
+/// Returns the byte offset just past the closing `/` and any ASCII-letter flags
+/// that follow it. Returns `None` when:
+/// - the byte at `slash_start` is not `/`, or the next byte is `/` or `*`
+///   (a comment opener);
+/// - `regex_literal_allowed_at` rejects the position;
+/// - a line break is reached before the closing `/`;
+/// - the literal has an empty body;
+/// - no closing `/` is found before `limit` or the end of `content`.
+///
+/// A `/` inside a `[...]` character class or after a backslash does not close
+/// the literal.
+pub(super) fn scan_regex_literal_end(
+    content: &str,
+    slash_start: usize,
+    limit: usize,
+) -> Option<usize> {
+    let bytes = content.as_bytes();
+    // Validate the opening byte before looking at the preceding text. The
+    // context check slices `content[..slash_start]`, which panics for an offset
+    // past the end or inside a multi-byte character. Once the byte is known to
+    // be an ASCII `/`, the offset is in range and on a character boundary.
+    if bytes.get(slash_start) != Some(&b'/')
+        || matches!(bytes.get(slash_start + 1), Some(b'/' | b'*'))
+    {
+        return None;
+    }
+    if !regex_literal_allowed_at(content, slash_start) {
+        return None;
+    }
+    let scan_end = bytes.len().min(limit);
+    let mut idx = slash_start + 1;
+    let mut escaped = false;
+    let mut in_class = false;
+    let mut saw_body = false;
+    while idx < scan_end {
+        let byte = bytes[idx];
+        // Regex literals cannot span lines, even after a backslash.
+        if byte == b'\n' || byte == b'\r' {
+            return None;
+        }
+        if escaped {
+            escaped = false;
+            saw_body = true;
+            idx += 1;
+            continue;
+        }
+        match byte {
+            b'\\' => {
+                escaped = true;
+                saw_body = true;
+            }
+            b'[' => {
+                in_class = true;
+                saw_body = true;
+            }
+            b']' => {
+                in_class = false;
+                saw_body = true;
+            }
+            b'/' if !in_class => {
+                if !saw_body {
+                    return None;
+                }
+                // Consume the trailing flag letters.
+                idx += 1;
+                while idx < scan_end && bytes[idx].is_ascii_alphabetic() {
+                    idx += 1;
+                }
+                return Some(idx);
+            }
+            _ => {
+                saw_body = true;
+            }
+        }
+        idx += 1;
+    }
+    None
+}
+/// Decides, from the text before `slash_start`, whether a `/` at that offset may
+/// begin a regex literal.
+///
+/// - Only whitespace (or nothing) before the slash: allowed.
+/// - Preceded by `!` that itself follows `#` (a hashbang): not allowed.
+/// - Preceded by one of `( { = : , ; ! ? & | + - * ~ ^ < >`: allowed.
+/// - Otherwise allowed only when the preceding word is one of `return`, `throw`,
+///   `case`, `delete`, `typeof`, `void`, `new`, `yield`, `await`.
+///
+/// Whitespace between the slash and the preceding character is ignored.
+fn regex_literal_allowed_at(content: &str, slash_start: usize) -> bool {
+    const OPERATOR_LIKE: &str = "({=:,;!?&|+-*~^<>";
+    const KEYWORDS: [&str; 9] = [
+        "return", "throw", "case", "delete", "typeof", "void", "new", "yield", "await",
+    ];
+
+    let before = &content[..slash_start];
+    let last_significant = before
+        .char_indices()
+        .rev()
+        .find(|(_, ch)| !ch.is_whitespace());
+    let (idx, previous) = match last_significant {
+        Some(found) => found,
+        None => return true,
+    };
+
+    let head = &content[..idx];
+    if previous == '!' && head.trim_end().ends_with('#') {
+        return false;
+    }
+    if OPERATOR_LIKE.contains(previous) {
+        return true;
+    }
+
+    // Extend backwards from `previous` over identifier characters to find the
+    // word that ends right before the slash.
+    let is_word_char = |ch: char| ch.is_ascii_alphanumeric() || ch == '_' || ch == '$';
+    let word_start = head
+        .char_indices()
+        .rev()
+        .take_while(|&(_, ch)| is_word_char(ch))
+        .last()
+        .map_or(idx, |(position, _)| position);
+    let word = &content[word_start..idx + previous.len_utf8()];
+    KEYWORDS.contains(&word)
+}
+/// Splits one template-literal token into string and punctuation parts.
+///
+/// - `TemplateHead`: everything up to the last two bytes is a String part; the
+///   last two bytes are a Punctuation part.
+/// - `TemplateMiddle`: first byte Punctuation, inner text String, last two bytes
+///   Punctuation.
+/// - `TemplateTail`: first byte Punctuation, the remainder String.
+///
+/// Other kinds emit nothing. Empty parts and parts overlapping an ignore region
+/// are dropped by `push_token_part`.
+fn push_template_token_parts(
+    tokens: &mut Vec<DetectionToken>,
+    context: &TokenContext<'_>,
+    kind: Kind,
+    span: ByteSpan,
+    line_index: &LineIndex,
+) {
+    let mut emit = |part_kind: TokenKind, start: usize, end: usize| {
+        push_token_part(
+            tokens,
+            context,
+            part_kind,
+            ByteSpan { start, end },
+            line_index,
+        );
+    };
+    // Offset just past the leading byte of a middle/tail piece.
+    let body_start = span.start.saturating_add(1);
+    // Offset where the trailing two-byte marker of a head/middle piece begins.
+    let interpolation_start = span.end.saturating_sub(2);
+    match kind {
+        Kind::TemplateHead => {
+            emit(TokenKind::String, span.start, interpolation_start);
+            emit(TokenKind::Punctuation, interpolation_start, span.end);
+        }
+        Kind::TemplateMiddle => {
+            emit(TokenKind::Punctuation, span.start, body_start);
+            emit(TokenKind::String, body_start, interpolation_start);
+            emit(TokenKind::Punctuation, interpolation_start, span.end);
+        }
+        Kind::TemplateTail => {
+            emit(TokenKind::Punctuation, span.start, body_start);
+            emit(TokenKind::String, body_start, span.end);
+        }
+        _ => {}
+    }
+}
+/// Emits a token of `kind` for `span` through `push_token`, with start and end
+/// locations looked up in `line_index`.
+///
+/// Nothing is emitted for an empty span or one that overlaps an ignore region.
+fn push_token_part(
+    tokens: &mut Vec<DetectionToken>,
+    context: &TokenContext<'_>,
+    kind: TokenKind,
+    span: ByteSpan,
+    line_index: &LineIndex,
+) {
+    if span.start < span.end && !context.overlaps_ignore_region(span) {
+        let start_location = line_index.location(span.start);
+        let end_location = line_index.location(span.end);
+        push_token(tokens, context, kind, span, start_location, end_location);
+    }
+}
+/// Scans the source bytes in `gap_start..gap_end` for whitespace, comments and a
+/// leading hashbang, and emits tokens for them.
+///
+/// Whitespace runs become a single `Default` token when
+/// `preserve_whitespace_as_default` is set, and are otherwise passed to
+/// `push_strict_whitespace_tokens`. Comments are skipped without emitting tokens
+/// when the mode is `Mode::Weak`; a hashbang is emitted in every mode. Any other
+/// character in the gap is stepped over.
+fn push_comments_in_gap(
+    tokens: &mut Vec<DetectionToken>,
+    context: &TokenContext<'_>,
+    gap_start: usize,
+    gap_end: usize,
+    line_index: &LineIndex,
+    preserve_whitespace_as_default: bool,
+) {
+    if gap_start >= gap_end {
+        return;
+    }
+    let bytes = context.content.as_bytes();
+    let mut idx = gap_start;
+    while idx < gap_end {
+        let ch = context.content[idx..].chars().next().unwrap_or('\0');
+        if ch.is_whitespace() {
+            let whitespace_end = scan_whitespace(context.content, idx, gap_end);
+            let span = ByteSpan {
+                start: idx,
+                end: whitespace_end,
+            };
+            if preserve_whitespace_as_default {
+                push_token_part(tokens, context, TokenKind::Default, span, line_index);
+            } else {
+                push_strict_whitespace_tokens(tokens, context, span, line_index);
+            }
+            // Always advance by at least the current character.
+            idx = whitespace_end.max(idx + ch.len_utf8());
+            continue;
+        }
+        // The comment and hashbang checks below read two bytes; stop when
+        // fewer than two remain in the gap.
+        if idx + 1 >= gap_end {
+            break;
+        }
+        // A hashbang is only recognised at the very start of the content.
+        let is_hashbang = idx == 0 && bytes[idx] == b'#' && bytes[idx + 1] == b'!';
+        // `<!--` is scanned to its end the same way as a `//` comment.
+        let is_line_comment = (bytes[idx] == b'/' && bytes[idx + 1] == b'/')
+            || bytes[idx..gap_end].starts_with(b"<!--");
+        let comment_end = if is_line_comment || is_hashbang {
+            Some(scan_line_comment(bytes, idx, gap_end))
+        } else if bytes[idx] == b'/' && bytes[idx + 1] == b'*' {
+            Some(scan_block_comment(bytes, idx, gap_end))
+        } else {
+            None
+        };
+        if let Some(comment_end) = comment_end {
+            if is_hashbang {
+                let span = ByteSpan {
+                    start: idx,
+                    end: comment_end,
+                };
+                push_hashbang_tokens(tokens, context, span, line_index);
+            } else if context.options.mode != Mode::Weak {
+                let span = ByteSpan {
+                    start: idx,
+                    end: comment_end,
+                };
+                // `//` comments are split into per-word tokens; `/* */` and
+                // `<!--` comments are emitted as one token.
+                if bytes[idx] == b'/' && bytes[idx + 1] == b'/' {
+                    push_line_comment_tokens(tokens, context, span, line_index);
+                } else {
+                    push_comment_token(tokens, context, span, line_index);
+                }
+            }
+            // Step past the comment, moving forward by at least one byte.
+            idx = comment_end.max(idx + 1);
+        } else {
+            // Neither whitespace nor a comment opener: step over the character.
+            idx += ch.len_utf8();
+        }
+    }
+}
+/// Emits tokens for a hashbang line covering `span`: the first byte becomes a
+/// `Default` token and the rest of the span is tokenized with
+/// `tokenize_js_like_range`.
+fn push_hashbang_tokens(
+    tokens: &mut Vec<DetectionToken>,
+    context: &TokenContext<'_>,
+    span: ByteSpan,
+    line_index: &LineIndex,
+) {
+    let after_hash = span.start + 1;
+    let leading_hash = ByteSpan {
+        start: span.start,
+        end: after_hash,
+    };
+    push_token_part(tokens, context, TokenKind::Default, leading_hash, line_index);
+    tokenize_js_like_range(tokens, context, after_hash, span.end, line_index);
+}
+/// Splits the text of `span` on whitespace and emits one comment token per
+/// non-whitespace run, in source order.
+pub(super) fn push_line_comment_tokens(
+    tokens: &mut Vec<DetectionToken>,
+    context: &TokenContext<'_>,
+    span: ByteSpan,
+    line_index: &LineIndex,
+) {
+    // Absolute byte offset at which the run currently being read began.
+    let mut run_start: Option<usize> = None;
+    for (offset, ch) in context.slice(span).char_indices() {
+        let position = span.start + offset;
+        match (ch.is_whitespace(), run_start) {
+            // Whitespace closes the open run.
+            (true, Some(start)) => {
+                run_start = None;
+                let word = ByteSpan {
+                    start,
+                    end: position,
+                };
+                push_comment_token(tokens, context, word, line_index);
+            }
+            // First non-whitespace character opens a new run.
+            (false, None) => run_start = Some(position),
+            _ => {}
+        }
+    }
+    // A run still open at the end of the span extends to `span.end`.
+    if let Some(start) = run_start {
+        let word = ByteSpan {
+            start,
+            end: span.end,
+        };
+        push_comment_token(tokens, context, word, line_index);
+    }
+}
+/// Returns the byte offset at which the whitespace run beginning at `start`
+/// ends.
+///
+/// Characters are consumed while they are whitespace and begin before `limit`.
+/// When `start >= limit`, `start` is returned unchanged.
+fn scan_whitespace(content: &str, start: usize, limit: usize) -> usize {
+    if start >= limit {
+        return start;
+    }
+    content[start..]
+        .char_indices()
+        .map(|(offset, ch)| (start + offset, ch))
+        .take_while(|&(position, ch)| position < limit && ch.is_whitespace())
+        .last()
+        .map_or(start, |(position, ch)| position + ch.len_utf8())
+}
+/// Emits one `Comment` token covering `span`, hashed from the span's text.
+///
+/// Nothing is emitted for an empty span or one that overlaps an ignore region.
+fn push_comment_token(
+    tokens: &mut Vec<DetectionToken>,
+    context: &TokenContext<'_>,
+    span: ByteSpan,
+    line_index: &LineIndex,
+) {
+    let ByteSpan { start, end } = span;
+    if start < end && !context.overlaps_ignore_region(span) {
+        let hash = hash_token(
+            TokenKind::Comment,
+            context.slice(span),
+            context.options.ignore_case,
+        );
+        tokens.push(DetectionToken {
+            hash,
+            start: line_index.location(start),
+            end: line_index.location(end),
+            range: [start, end],
+        });
+    }
+}