npm - jscpd-rs - Versions diffs - 0.1.0 - Mend

jscpd-rs 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

package/CHANGELOG.md +69 -0
package/Cargo.lock +1323 -0
package/Cargo.toml +54 -0
package/LICENSE +21 -0
package/README.md +372 -0
package/docs/api-parity.md +49 -0
package/docs/cloning-plan.md +281 -0
package/docs/compat-baseline.md +535 -0
package/docs/format-porting.md +86 -0
package/docs/junior-task-template.md +62 -0
package/docs/junior-workflow.md +87 -0
package/docs/migrating-from-jscpd.md +193 -0
package/docs/npm-release.md +116 -0
package/docs/public-benchmark-suite.md +81 -0
package/docs/release-checklist.md +200 -0
package/docs/release-decisions.md +103 -0
package/docs/release-readiness.md +51 -0
package/docs/upstream-bugs.md +501 -0
package/docs/upstream-issue-drafts.md +393 -0
package/docs/user-guide.md +309 -0
package/examples/dump_oxc_tokens.rs +112 -0
package/examples/library_api.rs +42 -0
package/npm/bin/jscpd-rs.js +6 -0
package/npm/bin/jscpd-server.js +6 -0
package/npm/lib/run-binary.js +68 -0
package/npm/scripts/postinstall.js +50 -0
package/package.json +53 -0
package/skills/dry-refactoring/SKILL.md +63 -0
package/skills/jscpd/SKILL.md +85 -0
package/src/app.rs +512 -0
package/src/bin/jscpd-server.rs +429 -0
package/src/blame.rs +130 -0
package/src/cli/config.rs +543 -0
package/src/cli/parsing.rs +301 -0
package/src/cli/tests.rs +543 -0
package/src/cli.rs +671 -0
package/src/detector/matching/secondary.rs +387 -0
package/src/detector/matching.rs +274 -0
package/src/detector/model.rs +190 -0
package/src/detector/prepare.rs +71 -0
package/src/detector/skip_local.rs +40 -0
package/src/detector/statistics.rs +138 -0
package/src/detector/store.rs +96 -0
package/src/detector/tests.rs +238 -0
package/src/detector.rs +265 -0
package/src/files/discovery.rs +508 -0
package/src/files/gitignore.rs +203 -0
package/src/files/paths.rs +68 -0
package/src/files/shebang.rs +106 -0
package/src/files/tests.rs +523 -0
package/src/files.rs +25 -0
package/src/formats.rs +570 -0
package/src/lib.rs +433 -0
package/src/main.rs +26 -0
package/src/report/ai.rs +125 -0
package/src/report/badge.rs +238 -0
package/src/report/console.rs +180 -0
package/src/report/console_common.rs +37 -0
package/src/report/console_full.rs +139 -0
package/src/report/csv.rs +65 -0
package/src/report/escape.rs +8 -0
package/src/report/file_output.rs +28 -0
package/src/report/html/assets.rs +47 -0
package/src/report/html.rs +336 -0
package/src/report/json.rs +119 -0
package/src/report/markdown.rs +125 -0
package/src/report/sarif.rs +302 -0
package/src/report/silent.rs +22 -0
package/src/report/source.rs +38 -0
package/src/report/summary.rs +50 -0
package/src/report/test_support.rs +133 -0
package/src/report/threshold.rs +76 -0
package/src/report/xcode.rs +90 -0
package/src/report/xml.rs +119 -0
package/src/report.rs +250 -0
package/src/server/mcp.rs +942 -0
package/src/server.rs +1081 -0
package/src/tokenizer/apex.rs +97 -0
package/src/tokenizer/blocks.rs +532 -0
package/src/tokenizer/embedded.rs +106 -0
package/src/tokenizer/generic.rs +511 -0
package/src/tokenizer/hash.rs +27 -0
package/src/tokenizer/ignore.rs +33 -0
package/src/tokenizer/line_index.rs +33 -0
package/src/tokenizer/markdown.rs +289 -0
package/src/tokenizer/markup_attrs.rs +289 -0
package/src/tokenizer/oxc/fallback.rs +275 -0
package/src/tokenizer/oxc/jsx.rs +168 -0
package/src/tokenizer/oxc/kind.rs +177 -0
package/src/tokenizer/oxc/lexical.rs +67 -0
package/src/tokenizer/oxc.rs +659 -0
package/src/tokenizer/scan.rs +88 -0
package/src/tokenizer/tap.rs +150 -0
package/src/tokenizer/tests.rs +915 -0
package/src/tokenizer.rs +328 -0
package/src/verbose.rs +195 -0

package/src/tokenizer/embedded.rs ADDED Viewed

@@ -0,0 +1,106 @@
+use crate::cli::Options;
+use super::generic::{generic_comment_span_end, scan_punctuation_split_token};
+use super::{ByteSpan, DetectionToken, LineIndex, Location, TokenContext, TokenKind, push_token};
+/// Returns a copy of `content` in which every byte covered by `ranges` is
+/// replaced with an ASCII space, except `\n` and `\r`, which are kept so that
+/// line positions in the result still line up with the original text.
+///
+/// Each range is a half-open `[start, end)` pair of byte offsets. Both ends
+/// are clamped to `content.len()`, and empty or inverted ranges are skipped
+/// instead of panicking on an out-of-order slice index.
+///
+/// If blanking leaves invalid UTF-8 behind (a range cut through a multi-byte
+/// character), the original `content` is returned unchanged.
+pub(super) fn blank_ranges_preserve_newlines(content: &str, ranges: &[[usize; 2]]) -> String {
+    if ranges.is_empty() {
+        return content.to_string();
+    }
+    let mut bytes = content.as_bytes().to_vec();
+    for &[start, end] in ranges {
+        let end = end.min(bytes.len());
+        // A start past the clamped end would make the slice below panic.
+        let start = start.min(end);
+        for byte in &mut bytes[start..end] {
+            if !matches!(*byte, b'\n' | b'\r') {
+                *byte = b' ';
+            }
+        }
+    }
+    String::from_utf8(bytes).unwrap_or_else(|_| content.to_string())
+}
+/// Shifts every token in `tokens` by `offset` bytes and by `start_location`.
+///
+/// Both the start and end locations of each token are adjusted through
+/// `offset_location`, and both ends of the token's byte range are advanced by
+/// `offset`.
+pub(super) fn offset_tokens(
+    tokens: &mut [DetectionToken],
+    offset: usize,
+    start_location: &Location,
+) {
+    tokens.iter_mut().for_each(|token| {
+        for location in [&mut token.start, &mut token.end] {
+            offset_location(location, offset, start_location);
+        }
+        token.range[0] += offset;
+        token.range[1] += offset;
+    });
+}
+/// Overwrites the `position` of each token's start and end location with the
+/// token's zero-based index in `tokens`.
+pub(super) fn assign_sequential_positions(tokens: &mut [DetectionToken]) {
+    (0usize..)
+        .zip(tokens.iter_mut())
+        .for_each(|(index, token)| {
+            token.start.position = index;
+            token.end.position = index;
+        });
+}
+/// Tokenizes `content` without discarding whitespace.
+///
+/// Starting at byte 0, each step classifies the span at the cursor:
+/// a whitespace run (pushed as `TokenKind::Default`), a comment recognised by
+/// `generic_comment_span_end` (pushed as `TokenKind::Comment`), or whatever
+/// `scan_punctuation_split_token` returns. Every span is handed to
+/// `push_token` together with its start and end locations.
+pub(super) fn tokenize_generic_with_whitespace(
+    content: &str,
+    format: &str,
+    options: &Options,
+    ignore_regions: &[[usize; 2]],
+) -> Vec<DetectionToken> {
+    let line_index = LineIndex::new(content);
+    let context = TokenContext {
+        content,
+        options,
+        ignore_regions,
+    };
+    let total = content.len();
+    let mut tokens = Vec::new();
+    let mut cursor = 0usize;
+    while cursor < total {
+        let first = content[cursor..].chars().next().unwrap_or('\0');
+        let (stop, kind) = if first.is_whitespace() {
+            (scan_whitespace(content, cursor), TokenKind::Default)
+        } else {
+            match generic_comment_span_end(content, format, cursor, total) {
+                Some(comment_end) => (comment_end, TokenKind::Comment),
+                None => scan_punctuation_split_token(content, format, cursor),
+            }
+        };
+        let span = ByteSpan {
+            start: cursor,
+            end: stop,
+        };
+        push_token(
+            &mut tokens,
+            &context,
+            kind,
+            span,
+            line_index.location(cursor),
+            line_index.location(stop),
+        );
+        // Always move forward by at least one character, even if a scanner
+        // reported an empty span.
+        cursor = stop.max(cursor + first.len_utf8());
+    }
+    tokens
+}
+/// Shifts `location` by `offset` bytes and by the 1-based `start_location`.
+///
+/// The line grows by `start_location.line - 1`. The column grows by
+/// `start_location.column - 1` only when `location` was on line 1 before the
+/// shift. `position` grows by `offset`.
+fn offset_location(location: &mut Location, offset: usize, start_location: &Location) {
+    let line_shift = start_location.line.saturating_sub(1);
+    let column_shift = start_location.column.saturating_sub(1);
+    // Decide on the column shift before the line number is changed below.
+    let was_first_line = location.line == 1;
+    location.position += offset;
+    location.line += line_shift;
+    if was_first_line {
+        location.column += column_shift;
+    }
+}
+/// Returns the end of the whitespace span that begins at byte `start`.
+///
+/// A `\n` at `start` forms a one-byte span of its own. Otherwise the span
+/// covers consecutive whitespace characters and stops before the next `\n`
+/// or the first non-whitespace character.
+///
+/// Panics if `start` is not a valid index into `content`.
+fn scan_whitespace(content: &str, start: usize) -> usize {
+    if content.as_bytes()[start] == b'\n' {
+        return start + 1;
+    }
+    let run_len: usize = content[start..]
+        .chars()
+        .take_while(|ch| *ch != '\n' && ch.is_whitespace())
+        .map(char::len_utf8)
+        .sum();
+    start + run_len
+}

package/src/tokenizer/generic.rs ADDED Viewed

@@ -0,0 +1,511 @@
+use crate::cli::{Mode, Options};
+use super::scan::scan_block_comment;
+use super::{
+    ByteSpan, DetectionToken, LineIndex, TokenContext, TokenKind, push_strict_whitespace_tokens,
+    push_token,
+};
+/// Tokenizes `content` for `format` with the generic scanner.
+///
+/// Whitespace runs are forwarded to `push_strict_whitespace_tokens` when
+/// `options.mode` is `Mode::Strict`; otherwise they are dropped, except for
+/// Twig spans accepted by `twig_keeps_mild_whitespace`. Every other span is
+/// classified, in order, as a multi-line span, a comment, a quoted YAML
+/// string, a punctuation-split token, or a plain whitespace-delimited token,
+/// and pushed through `push_token`.
+pub(super) fn tokenize_generic(
+    content: &str,
+    format: &str,
+    options: &Options,
+    ignore_regions: &[[usize; 2]],
+) -> Vec<DetectionToken> {
+    let line_index = LineIndex::new(content);
+    let context = TokenContext {
+        content,
+        options,
+        ignore_regions,
+    };
+    let total = content.len();
+    let mut tokens = Vec::new();
+    let mut cursor = 0usize;
+    while cursor < total {
+        let first = content[cursor..].chars().next().unwrap_or('\0');
+        // Lower bound for the next cursor so the loop always makes progress.
+        let min_next = cursor + first.len_utf8();
+        if first.is_whitespace() {
+            let run_end = scan_whitespace(content, cursor);
+            let span = ByteSpan {
+                start: cursor,
+                end: run_end,
+            };
+            if options.mode == Mode::Strict {
+                push_strict_whitespace_tokens(&mut tokens, &context, span, &line_index);
+            } else if format == "twig" && twig_keeps_mild_whitespace(content, cursor, run_end) {
+                // Prism's Twig grammar labels these spans as `default`, so
+                // upstream mild mode keeps them while filtering empty/new_line.
+                push_token(
+                    &mut tokens,
+                    &context,
+                    TokenKind::Default,
+                    span,
+                    line_index.location(cursor),
+                    line_index.location(run_end),
+                );
+            }
+            cursor = run_end.max(min_next);
+            continue;
+        }
+        let (stop, kind) = match generic_multiline_span_end(content, format, cursor, total) {
+            Some(special) => special,
+            None => match generic_comment_span_end(content, format, cursor, total) {
+                Some(comment_end) => (comment_end, TokenKind::Comment),
+                None if format == "yaml" && matches!(first, '"' | '\'') => {
+                    (scan_quoted_string(content, cursor), TokenKind::String)
+                }
+                None if punctuation_split_format(format) => {
+                    scan_punctuation_split_token(content, format, cursor)
+                }
+                None => (scan_generic_token(content, cursor), TokenKind::Default),
+            },
+        };
+        push_token(
+            &mut tokens,
+            &context,
+            kind,
+            ByteSpan {
+                start: cursor,
+                end: stop,
+            },
+            line_index.location(cursor),
+            line_index.location(stop),
+        );
+        cursor = stop.max(min_next);
+    }
+    tokens
+}
+/// Returns the byte offset of the first whitespace character at or after
+/// `start`, or `content.len()` when none follows.
+///
+/// A `start` at or beyond the end of `content` is returned unchanged.
+pub(super) fn scan_generic_token(content: &str, start: usize) -> usize {
+    if start >= content.len() {
+        return start;
+    }
+    content[start..]
+        .char_indices()
+        .find(|(_, ch)| ch.is_whitespace())
+        .map_or(content.len(), |(offset, _)| start + offset)
+}
+/// Scans one token at `start`, splitting on punctuation and operators.
+///
+/// Returns the token's end offset and kind:
+/// - a single split-punctuation character yields `TokenKind::Punctuation`;
+/// - in code-like formats, a run of operator characters yields
+///   `TokenKind::Operator`;
+/// - anything else runs until whitespace, split punctuation, or (in
+///   code-like formats) an operator character, and yields
+///   `TokenKind::Default`.
+pub(super) fn scan_punctuation_split_token(
+    content: &str,
+    format: &str,
+    start: usize,
+) -> (usize, TokenKind) {
+    let tail = &content[start..];
+    let code_like = code_like_format(format);
+    let first = tail.chars().next().unwrap_or('\0');
+    if is_split_punctuation(format, first) {
+        return (start + first.len_utf8(), TokenKind::Punctuation);
+    }
+    if code_like && is_operator_start(first) {
+        return (scan_operator_token(content, start), TokenKind::Operator);
+    }
+    let ends_word = |ch: char| {
+        ch.is_whitespace()
+            || is_split_punctuation(format, ch)
+            || (code_like && is_operator_start(ch))
+    };
+    let word_len: usize = tail
+        .chars()
+        .take_while(|ch| !ends_word(*ch))
+        .map(char::len_utf8)
+        .sum();
+    (start + word_len, TokenKind::Default)
+}
+/// Returns the end offset of the run of operator characters (as defined by
+/// `is_operator_start`) that begins at `start`.
+///
+/// A `start` at or beyond the end of `content` is returned unchanged.
+fn scan_operator_token(content: &str, start: usize) -> usize {
+    if start >= content.len() {
+        return start;
+    }
+    let run_len: usize = content[start..]
+        .chars()
+        .take_while(|ch| is_operator_start(*ch))
+        .map(char::len_utf8)
+        .sum();
+    start + run_len
+}
+/// Returns the end offset of the quoted string whose opening quote is the
+/// character at `start`.
+///
+/// The scan ends just after the first unescaped matching quote, or just after
+/// the first unescaped `\n` or `\r`, whichever comes first. A backslash
+/// escapes the character that follows it. If neither is found the result is
+/// `content.len()`.
+fn scan_quoted_string(content: &str, start: usize) -> usize {
+    let quote = content[start..].chars().next().unwrap_or('\0');
+    let body_start = start + quote.len_utf8();
+    if body_start >= content.len() {
+        return body_start;
+    }
+    let mut skip_next = false;
+    for (offset, ch) in content[body_start..].char_indices() {
+        if skip_next {
+            skip_next = false;
+        } else if ch == '\\' {
+            skip_next = true;
+        } else if ch == quote || ch == '\n' || ch == '\r' {
+            return body_start + offset + ch.len_utf8();
+        }
+    }
+    content.len()
+}
+/// Detects a format-specific multi-line span starting at `start`.
+///
+/// For `"haml"` a multi-line comment is reported as `TokenKind::Comment`; for
+/// `"pug"` a dot block is reported as `TokenKind::Default`. All other formats,
+/// and positions where no such span starts, yield `None`.
+fn generic_multiline_span_end(
+    content: &str,
+    format: &str,
+    start: usize,
+    limit: usize,
+) -> Option<(usize, TokenKind)> {
+    let (span_end, kind) = match format {
+        "haml" => (
+            haml_multiline_comment_span_end(content, start, limit),
+            TokenKind::Comment,
+        ),
+        "pug" => (
+            pug_dot_block_span_end(content, start, limit),
+            TokenKind::Default,
+        ),
+        _ => return None,
+    };
+    span_end.map(|end| (end, kind))
+}
+/// Returns the end of a HAML comment block that starts at `start`.
+///
+/// A block is recognised only when `start` is preceded on its line by nothing
+/// but spaces and tabs and the text at `start` begins with `-#` or `/`. The
+/// block then extends over following lines that are indented deeper than the
+/// opener; blank lines are not included.
+fn haml_multiline_comment_span_end(content: &str, start: usize, limit: usize) -> Option<usize> {
+    let bytes = content.as_bytes();
+    let line_begin = line_start(bytes, start);
+    let opens_comment = line_prefix_is_indent(bytes, line_begin, start) && {
+        let rest = &bytes[start..limit];
+        rest.starts_with(b"-#") || rest.starts_with(b"/")
+    };
+    opens_comment.then(|| scan_indented_block_end(bytes, line_begin, start, limit, false))
+}
+/// Returns the end of a Pug dot block that starts at `start`.
+///
+/// The opener must be the first non-indent text on its line and must satisfy
+/// `is_pug_dot_block_opener`. `None` is returned when the block would not
+/// extend past the opener line. Blank lines are treated as part of the block.
+fn pug_dot_block_span_end(content: &str, start: usize, limit: usize) -> Option<usize> {
+    let bytes = content.as_bytes();
+    let line_begin = line_start(bytes, start);
+    if !line_prefix_is_indent(bytes, line_begin, start) {
+        return None;
+    }
+    let opener_end = line_content_end(bytes, start, limit);
+    is_pug_dot_block_opener(&content[start..opener_end])
+        .then(|| scan_indented_block_end(bytes, line_begin, start, limit, true))
+        .filter(|block_end| *block_end > opener_end)
+}
+/// Returns the end of the indented block opened on the line containing
+/// `start`.
+///
+/// The opener's indent is `start - line_start`. Following lines are absorbed
+/// while their indent (spaces and tabs) is strictly greater than that, or,
+/// when `include_blank_lines` is set, while they consist of indent only. The
+/// result is the content end (before the line terminator) of the last
+/// absorbed line, or of the opener line when nothing was absorbed.
+fn scan_indented_block_end(
+    bytes: &[u8],
+    line_start: usize,
+    start: usize,
+    limit: usize,
+    include_blank_lines: bool,
+) -> usize {
+    let base_indent = start.saturating_sub(line_start);
+    let mut block_end = line_content_end(bytes, start, limit);
+    let mut cursor = next_line_start(bytes, block_end, limit);
+    loop {
+        if cursor >= limit {
+            return block_end;
+        }
+        let line_end = line_content_end(bytes, cursor, limit);
+        let indent_end = scan_indent(bytes, cursor, line_end);
+        let deeper = indent_end.saturating_sub(cursor) > base_indent;
+        let blank = indent_end == line_end;
+        if !(deeper || (include_blank_lines && blank)) {
+            return block_end;
+        }
+        block_end = line_end;
+        cursor = next_line_start(bytes, line_end, limit);
+    }
+}
+/// Reports whether `line` opens a Pug dot block.
+///
+/// After trailing spaces and tabs are removed, the line must end in `.`, and
+/// the text before that dot must be non-empty, must not be `script` (ASCII
+/// case-insensitive), and may contain only ASCII alphanumerics, `_`, `-`, `#`
+/// and `.`.
+fn is_pug_dot_block_opener(line: &str) -> bool {
+    match line.trim_end_matches([' ', '\t']).strip_suffix('.') {
+        None => false,
+        Some(head) if head.is_empty() || head.eq_ignore_ascii_case("script") => false,
+        Some(head) => head
+            .bytes()
+            .all(|byte| matches!(byte, b'_' | b'-' | b'#' | b'.') || byte.is_ascii_alphanumeric()),
+    }
+}
+/// Reports whether `ch` is a punctuation character that always forms its own
+/// token: brackets, braces, parentheses, `:`, `;`, `,`, and additionally `.`
+/// when `format` is code-like.
+fn is_split_punctuation(format: &str, ch: char) -> bool {
+    match ch {
+        '{' | '}' | '(' | ')' | '[' | ']' | ':' | ';' | ',' => true,
+        '.' => code_like_format(format),
+        _ => false,
+    }
+}
+/// Reports whether `ch` is one of the characters treated as part of an
+/// operator token.
+fn is_operator_start(ch: char) -> bool {
+    const OPERATOR_CHARS: &str = "+-*/%=!<>&|^~?";
+    OPERATOR_CHARS.contains(ch)
+}
+/// Returns the end offset of the run of whitespace characters (newlines
+/// included) that begins at `start`.
+///
+/// A `start` at or beyond the end of `content` is returned unchanged.
+pub(super) fn scan_whitespace(content: &str, start: usize) -> usize {
+    if start >= content.len() {
+        return start;
+    }
+    let run_len: usize = content[start..]
+        .chars()
+        .take_while(|ch| ch.is_whitespace())
+        .map(char::len_utf8)
+        .sum();
+    start + run_len
+}
+/// Decides whether the whitespace span `start..end` is kept for Twig in
+/// non-strict mode.
+///
+/// An empty span is never kept. A span without `\n` is kept when there is
+/// non-whitespace content both before and after it. A span containing `\n`
+/// is kept only when the nearest non-whitespace bytes around it are `>`
+/// before and `<` after.
+fn twig_keeps_mild_whitespace(content: &str, start: usize, end: usize) -> bool {
+    if start >= end {
+        return false;
+    }
+    let spans_lines = content[start..end].contains('\n');
+    let before = previous_non_whitespace(content, start);
+    let after = next_non_whitespace(content, end);
+    if spans_lines {
+        before == Some(b'>') && after == Some(b'<')
+    } else {
+        before.is_some() && after.is_some()
+    }
+}
+/// Returns the last byte of `content[..end]` that is not ASCII whitespace,
+/// or `None` when that prefix is empty or all ASCII whitespace.
+fn previous_non_whitespace(content: &str, end: usize) -> Option<u8> {
+    content[..end]
+        .trim_end_matches(|ch: char| ch.is_ascii_whitespace())
+        .as_bytes()
+        .last()
+        .copied()
+}
+/// Returns the first byte of `content[start..]` that is not ASCII whitespace,
+/// or `None` when that suffix is empty or all ASCII whitespace.
+fn next_non_whitespace(content: &str, start: usize) -> Option<u8> {
+    content[start..]
+        .trim_start_matches(|ch: char| ch.is_ascii_whitespace())
+        .as_bytes()
+        .first()
+        .copied()
+}
+/// Returns the end offset of the comment that starts at `start`, if any.
+///
+/// Only `content[start..limit]` is inspected. Recognised openers, in order:
+/// - `<!--` (HTML comment) and `/*` (block comment), for every format;
+/// - `//`, for every format, running to the end of the line;
+/// - `--`, `#` and `;`, each only for the formats accepted by the matching
+///   `generic_*_comment_format` predicate, running to the end of the line.
+///
+/// Returns `None` when no comment starts at `start`, including when
+/// `start == limit`. The `#` and `;` checks read through the bounded slice,
+/// so they never look at a byte at or beyond `limit`.
+pub(super) fn generic_comment_span_end(
+    content: &str,
+    format: &str,
+    start: usize,
+    limit: usize,
+) -> Option<usize> {
+    let bytes = content.as_bytes();
+    let rest = &bytes[start..limit];
+    if rest.starts_with(b"<!--") {
+        return Some(scan_html_comment(bytes, start, limit));
+    }
+    if rest.starts_with(b"/*") {
+        return Some(scan_block_comment(bytes, start, limit));
+    }
+    if rest.starts_with(b"//") {
+        return Some(scan_to_line_end(bytes, start, limit));
+    }
+    if rest.starts_with(b"--") && generic_double_dash_comment_format(format) {
+        return Some(scan_to_line_end(bytes, start, limit));
+    }
+    // `first()` is `None` for an empty slice, where `bytes[start]` would have
+    // read past `limit` or panicked at the end of the content.
+    match rest.first() {
+        Some(b'#') if generic_hash_comment_format(format) => {
+            Some(scan_to_line_end(bytes, start, limit))
+        }
+        Some(b';') if generic_semicolon_comment_format(format) => {
+            Some(scan_to_line_end(bytes, start, limit))
+        }
+        _ => None,
+    }
+}
+/// Reports whether `format` is one of the formats in which `#` starts a
+/// line comment. The comparison is exact and case-sensitive.
+fn generic_hash_comment_format(format: &str) -> bool {
+    const HASH_COMMENT_FORMATS: &[&str] = &[
+        "apacheconf",
+        "applescript",
+        "bash",
+        "cmake",
+        "docker",
+        "editorconfig",
+        "git",
+        "ignore",
+        "ini",
+        "julia",
+        "makefile",
+        "nginx",
+        "nix",
+        "perl",
+        "powershell",
+        "properties",
+        "python",
+        "r",
+        "ruby",
+        "shell-session",
+        "tcl",
+        "toml",
+        "vim",
+        "yaml",
+    ];
+    HASH_COMMENT_FORMATS.contains(&format)
+}
+/// Reports whether `format` is one of the formats in which `--` starts a
+/// line comment. The comparison is exact and case-sensitive.
+fn generic_double_dash_comment_format(format: &str) -> bool {
+    const DOUBLE_DASH_COMMENT_FORMATS: &[&str] = &[
+        "ada",
+        "applescript",
+        "elm",
+        "haskell",
+        "lua",
+        "plsql",
+        "sql",
+    ];
+    DOUBLE_DASH_COMMENT_FORMATS.contains(&format)
+}
+/// Reports whether `format` is one of the formats in which `;` starts a
+/// line comment. The comparison is exact and case-sensitive.
+fn generic_semicolon_comment_format(format: &str) -> bool {
+    const SEMICOLON_COMMENT_FORMATS: &[&str] = &[
+        "asm6502",
+        "autoit",
+        "autohotkey",
+        "clojure",
+        "ini",
+        "lisp",
+        "llvm",
+        "nasm",
+        "racket",
+        "scheme",
+    ];
+    SEMICOLON_COMMENT_FORMATS.contains(&format)
+}
+/// Reports whether tokens of `format` are split on punctuation, which holds
+/// for every code-like and every CSS-like format.
+fn punctuation_split_format(format: &str) -> bool {
+    code_like_format(format) || css_like_format(format)
+}
+/// Reports whether `format` is one of the stylesheet formats `css`, `less`,
+/// `sass`, `scss` or `stylus`. The comparison is exact and case-sensitive.
+fn css_like_format(format: &str) -> bool {
+    const CSS_LIKE_FORMATS: &[&str] = &["css", "less", "sass", "scss", "stylus"];
+    CSS_LIKE_FORMATS.contains(&format)
+}
+/// Reports whether `format` is in the list of formats tokenized as code,
+/// i.e. with `.` and operator characters split into their own tokens. The
+/// comparison is exact and case-sensitive.
+fn code_like_format(format: &str) -> bool {
+    const CODE_LIKE_FORMATS: &[&str] = &[
+        "ada",
+        "apex",
+        "aspnet",
+        "c",
+        "c-header",
+        "clike",
+        "clojure",
+        "cmake",
+        "coffeescript",
+        "cpp",
+        "cpp-header",
+        "csharp",
+        "csv",
+        "cfml",
+        "cfscript",
+        "dart",
+        "dot",
+        "eiffel",
+        "go",
+        "haml",
+        "ini",
+        "java",
+        "kotlin",
+        "haxe",
+        "markup",
+        "objectivec",
+        "ocaml",
+        "perl",
+        "php",
+        "plsql",
+        "properties",
+        "purescript",
+        "python",
+        "qsharp",
+        "r",
+        "rescript",
+        "robotframework",
+        "rust",
+        "scala",
+        "solidity",
+        "sparql",
+        "swift",
+        "tcl",
+        "tt2",
+        "turtle",
+        "twig",
+        "verilog",
+        "wgsl",
+        "yaml",
+        "zig",
+    ];
+    CODE_LIKE_FORMATS.contains(&format)
+}
+/// Returns the index of the first `\n` in `bytes[start..limit]`, or `limit`
+/// when there is none. The newline itself is not included. When `start` is
+/// not below `limit`, `start` is returned.
+fn scan_to_line_end(bytes: &[u8], start: usize, limit: usize) -> usize {
+    (start..limit)
+        .find(|&idx| bytes[idx] == b'\n')
+        .unwrap_or_else(|| start.max(limit))
+}
+/// Returns the index of the first byte of the line containing `start`: the
+/// position just after the nearest `\n` or `\r` before `start`, or 0 when
+/// there is none.
+fn line_start(bytes: &[u8], start: usize) -> usize {
+    (0..start)
+        .rev()
+        .find(|&idx| matches!(bytes[idx], b'\n' | b'\r'))
+        .map_or(0, |terminator| terminator + 1)
+}
+/// Reports whether `bytes[line_start..start]` consists only of spaces and
+/// tabs. An empty prefix counts as indent.
+fn line_prefix_is_indent(bytes: &[u8], line_start: usize, start: usize) -> bool {
+    let prefix = &bytes[line_start..start];
+    !prefix.iter().any(|&byte| byte != b' ' && byte != b'\t')
+}
+/// Returns the index of the first `\n` or `\r` in `bytes[start..limit]`, or
+/// `limit` when the line has no terminator in that range. When `start` is not
+/// below `limit`, `start` is returned.
+fn line_content_end(bytes: &[u8], start: usize, limit: usize) -> usize {
+    (start..limit)
+        .find(|&idx| matches!(bytes[idx], b'\n' | b'\r'))
+        .unwrap_or_else(|| start.max(limit))
+}
+/// Returns the index of the line that follows the terminator at `line_end`.
+///
+/// A `\r\n` pair lying inside `limit` is skipped as two bytes; any other byte
+/// at `line_end` is skipped as one. When `line_end` is not below `limit`,
+/// `limit` is returned.
+fn next_line_start(bytes: &[u8], line_end: usize, limit: usize) -> usize {
+    if line_end >= limit {
+        return limit;
+    }
+    let terminator_len = match bytes[line_end] {
+        b'\r' if line_end + 1 < limit && bytes[line_end + 1] == b'\n' => 2,
+        _ => 1,
+    };
+    line_end + terminator_len
+}
+/// Returns the index of the first byte in `bytes[start..limit]` that is
+/// neither a space nor a tab, or `limit` when every byte in that range is
+/// indent. When `start` is not below `limit`, `start` is returned.
+fn scan_indent(bytes: &[u8], start: usize, limit: usize) -> usize {
+    (start..limit)
+        .find(|&idx| !matches!(bytes[idx], b' ' | b'\t'))
+        .unwrap_or_else(|| start.max(limit))
+}
+/// Returns the end offset of the HTML comment whose `<!--` opener is at
+/// `start`.
+///
+/// The search for the closing `-->` begins after the four-byte opener and
+/// stays inside `limit`. The result is the index just past `-->`, or `limit`
+/// when the comment is not closed.
+fn scan_html_comment(bytes: &[u8], start: usize, limit: usize) -> usize {
+    let body_start = start + 4;
+    // The last index at which a three-byte `-->` still fits below `limit`.
+    let search_end = limit.saturating_sub(2);
+    (body_start..search_end)
+        .find(|&idx| bytes[idx] == b'-' && bytes[idx + 1] == b'-' && bytes[idx + 2] == b'>')
+        .map_or(limit, |close_at| close_at + 3)
+}

package/src/tokenizer/hash.rs ADDED Viewed

@@ -0,0 +1,27 @@
+use xxhash_rust::xxh3::xxh3_64;
+use super::TokenKind;
+/// Hashes a token from its kind and text.
+///
+/// The text hash from `hash_value` is XOR-ed with a small per-kind tag, so
+/// the same text hashes differently under different token kinds.
+/// `ignore_case` is passed through to `hash_value`.
+pub(super) fn hash_token(kind: TokenKind, value: &str, ignore_case: bool) -> u64 {
+    let kind_tag: u64 = match kind {
+        TokenKind::Comment => 0x01,
+        TokenKind::Keyword => 0x02,
+        TokenKind::Number => 0x03,
+        TokenKind::Operator => 0x04,
+        TokenKind::Punctuation => 0x05,
+        TokenKind::String => 0x06,
+        TokenKind::Default => 0x07,
+        TokenKind::Constant => 0x08,
+        TokenKind::Empty => 0x09,
+        TokenKind::NewLine => 0x0a,
+    };
+    kind_tag ^ hash_value(value, ignore_case)
+}
+/// Returns the `xxh3_64` hash of `value`'s bytes, lowercasing the text first
+/// when `ignore_case` is set.
+fn hash_value(value: &str, ignore_case: bool) -> u64 {
+    if !ignore_case {
+        return xxh3_64(value.as_bytes());
+    }
+    let folded = value.to_lowercase();
+    xxh3_64(folded.as_bytes())
+}

package/src/tokenizer/ignore.rs ADDED Viewed

@@ -0,0 +1,33 @@
+use crate::cli::Options;
+/// Collects the byte ranges of `content` that are to be ignored.
+///
+/// Marker regions come first: each `jscpd:ignore-start` is paired with the
+/// next `jscpd:ignore-end`, and the region runs from the start of the line
+/// holding the start marker to the end of the line holding the end marker
+/// (the newline excluded). The search resumes at the end of each region and
+/// stops at the first start marker that has no end marker after it.
+///
+/// Every match of every pattern in `options.ignore_pattern` is then appended
+/// as a further `[start, end]` range.
+pub(super) fn find_ignore_regions(content: &str, options: &Options) -> Vec<[usize; 2]> {
+    const START_MARKER: &str = "jscpd:ignore-start";
+    const END_MARKER: &str = "jscpd:ignore-end";
+    let mut regions = Vec::new();
+    let mut cursor = 0;
+    loop {
+        let Some(start_rel) = content[cursor..].find(START_MARKER) else {
+            break;
+        };
+        let start_at = cursor + start_rel;
+        let region_start = content[..start_at].rfind('\n').map_or(0, |nl| nl + 1);
+        let end_search_from = start_at + START_MARKER.len();
+        let Some(end_rel) = content[end_search_from..].find(END_MARKER) else {
+            break;
+        };
+        let end_at = end_search_from + end_rel;
+        let region_end = content[end_at..]
+            .find('\n')
+            .map_or(content.len(), |nl| end_at + nl);
+        regions.push([region_start, region_end]);
+        cursor = region_end;
+    }
+    for pattern in &options.ignore_pattern {
+        for found in pattern.find_iter(content) {
+            regions.push([found.start(), found.end()]);
+        }
+    }
+    regions
+}

package/src/tokenizer/line_index.rs ADDED Viewed

@@ -0,0 +1,33 @@
+use super::Location;
+/// Lookup table that maps byte offsets of a text to line/column locations.
+pub(super) struct LineIndex {
+    /// Byte offsets of every `\n` in the indexed text, in ascending order.
+    newlines: Vec<usize>,
+}
+impl LineIndex {
+    /// Builds the index by recording the byte offset of each `\n` in
+    /// `content`.
+    pub(super) fn new(content: &str) -> Self {
+        let newlines = content
+            .match_indices('\n')
+            .map(|(offset, _)| offset)
+            .collect();
+        Self { newlines }
+    }
+    /// Converts a byte `offset` into a `Location`.
+    ///
+    /// `line` is 1-based and counts the newlines strictly before `offset`, so
+    /// an offset that points at a `\n` still belongs to the line it ends.
+    /// `column` is the 1-based byte distance from the start of that line, and
+    /// `position` is `offset` itself.
+    pub(super) fn location(&self, offset: usize) -> Location {
+        let newlines_before = self.newlines.partition_point(|&newline| newline < offset);
+        let line_start = newlines_before
+            .checked_sub(1)
+            .map_or(0, |previous| self.newlines[previous] + 1);
+        Location {
+            line: newlines_before + 1,
+            column: offset - line_start + 1,
+            position: offset,
+        }
+    }
+}