@henryavila/mdprobe 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,262 @@
1
+ import DiffMatchPatch from 'diff-match-patch'
2
+
3
/**
 * Translate a 1-based (line, column) pair into a 0-based character offset.
 *
 * Lines are delimited by '\n'. Nothing is clamped: a column past the end of its
 * line, or a line past the end of the source, produces an offset that may land
 * in a later line or beyond `source.length`.
 *
 * @param {string} source - Full text the position refers to.
 * @param {number} line - 1-based line number.
 * @param {number} column - 1-based column number.
 * @returns {number} 0-based offset into `source`.
 */
function lineColumnToOffset(source, line, column) {
  let lineStart = 0
  for (let skipped = 0; skipped < line - 1; skipped++) {
    const newlineAt = source.indexOf('\n', lineStart)
    if (newlineAt === -1) {
      // Ran out of lines: count the final line as if it were newline-terminated,
      // which places the line start one past the end of the source.
      lineStart = source.length + 1
      break
    }
    lineStart = newlineAt + 1
  }
  return lineStart + (column - 1)
}
15
+
16
/**
 * Translate a 0-based character offset into a 1-based { line, column } pair.
 *
 * Only the text before the offset is inspected; offsets past the end of the
 * source are treated as the end of the source, and offsets at or below zero
 * resolve to line 1, column 1.
 *
 * @param {string} source - Full text the offset refers to.
 * @param {number} offset - 0-based character offset.
 * @returns {{line: number, column: number}} 1-based line and column.
 */
function offsetToLineColumn(source, offset) {
  // Number of characters that precede the offset, limited to the source itself.
  const reach = Math.max(0, Math.min(Math.ceil(offset), source.length))
  const consumed = source.slice(0, reach)
  const lastBreak = consumed.lastIndexOf('\n')

  return {
    // One line per newline consumed, plus the first line.
    line: consumed.split('\n').length,
    // Characters after the last consumed newline, plus one (lastBreak is -1
    // when no newline was consumed).
    column: consumed.length - lastBreak,
  }
}
32
+
33
/**
 * Read `length` characters of `source` starting at a 1-based line/column.
 *
 * The returned text may span several lines and may be shorter than `length`
 * when the source ends first.
 *
 * @param {string} source - Full text to read from.
 * @param {number} startLine - 1-based line of the first character.
 * @param {number} startColumn - 1-based column of the first character.
 * @param {number} length - Number of characters to read.
 * @returns {string|null} The extracted text, or null when the start line or
 *   column is out of range or the start offset is at/after the end of source.
 */
function extractTextByLength(source, startLine, startColumn, length) {
  const lineCount = source.split('\n').length
  if (startLine < 1 || startLine > lineCount) return null

  // Columns are 1-based, like lines. A column below 1 would yield an offset that
  // is either negative (String.prototype.slice then counts from the END of the
  // source) or that points at the previous line's newline, so reject it here.
  if (startColumn < 1) return null

  const startOffset = lineColumnToOffset(source, startLine, startColumn)
  if (startOffset >= source.length) return null
  return source.slice(startOffset, startOffset + length)
}
44
+
45
/**
 * Find every occurrence of `text` in `source`, including overlapping ones.
 *
 * @param {string} source - Text to search in.
 * @param {string} text - Substring to look for.
 * @returns {number[]} Ascending 0-based offsets of each occurrence; empty when
 *   there is no match or when `text` is the empty string.
 */
function findAllOccurrences(source, text) {
  // An empty needle matches at every index, and indexOf('', from) clamps `from`
  // to source.length instead of returning -1, so the scan below would never
  // terminate. There is no meaningful occurrence to report, so return nothing.
  if (text === '') return []

  const offsets = []
  let at = source.indexOf(text)
  while (at !== -1) {
    offsets.push(at)
    // Advance by one (not by text.length) so overlapping matches are reported.
    at = source.indexOf(text, at + 1)
  }
  return offsets
}
57
+
58
/**
 * Rate how closely the text around a candidate match agrees with the expected
 * context.
 *
 * The prefix is compared character-by-character walking backwards from the
 * match start; the suffix is compared walking forwards from the match end.
 * Each position where the characters agree adds one point. A missing or empty
 * prefix/suffix contributes nothing.
 *
 * @param {string} source - Text containing the candidate match.
 * @param {number} matchOffset - 0-based offset where the candidate starts.
 * @param {number} textLength - Length of the matched text.
 * @param {string} [expectedPrefix] - Text expected right before the match.
 * @param {string} [expectedSuffix] - Text expected right after the match.
 * @returns {number} Count of agreeing character positions.
 */
function scorePrefixSuffix(source, matchOffset, textLength, expectedPrefix, expectedSuffix) {
  let matches = 0

  if (expectedPrefix) {
    const windowStart = Math.max(0, matchOffset - expectedPrefix.length)
    const before = source.slice(windowStart, matchOffset)
    const overlap = Math.min(before.length, expectedPrefix.length)
    // Align both strings at their last character and step away from the match.
    for (let back = 1; back <= overlap; back++) {
      if (before[before.length - back] === expectedPrefix[expectedPrefix.length - back]) {
        matches += 1
      }
    }
  }

  if (expectedSuffix) {
    const matchEnd = matchOffset + textLength
    const after = source.slice(matchEnd, matchEnd + expectedSuffix.length)
    const overlap = Math.min(after.length, expectedSuffix.length)
    for (let ahead = 0; ahead < overlap; ahead++) {
      if (after[ahead] === expectedSuffix[ahead]) {
        matches += 1
      }
    }
  }

  return matches
}
85
+
86
/**
 * Describe the span [offset, offset + textLength) as 1-based line/column
 * coordinates.
 *
 * @param {string} source - Text the offsets refer to.
 * @param {number} offset - 0-based offset of the first character of the span.
 * @param {number} textLength - Length of the span in characters.
 * @returns {{startLine: number, startColumn: number, endLine: number, endColumn: number}}
 *   Start coordinates of the span and the coordinates just past its end.
 */
function offsetToPosition(source, offset, textLength) {
  const { line: startLine, column: startColumn } = offsetToLineColumn(source, offset)
  const { line: endLine, column: endColumn } = offsetToLineColumn(source, offset + textLength)
  return { startLine, startColumn, endLine, endColumn }
}
99
+
100
/**
 * Locate an approximate occurrence of `exactText` near `hintOffset` using
 * diff-match-patch.
 *
 * match_main is only used directly for patterns of up to 32 characters (the
 * bitap pattern limit noted for diff-match-patch). Longer patterns are located
 * by their first 32 characters, and the full-length candidate at that location
 * is then accepted only if its Levenshtein distance to `exactText` is within
 * 40% of the pattern length.
 *
 * @param {string} currentSource - Text to search in.
 * @param {string} exactText - Text to look for.
 * @param {number} hintOffset - 0-based offset near which the match is expected.
 * @returns {number} 0-based offset of the match, or -1 when none is acceptable.
 */
function fuzzyMatch(currentSource, exactText, hintOffset) {
  const BITAP_LIMIT = 32
  const TOLERANCE = 0.4

  const matcher = new DiffMatchPatch()
  matcher.Match_Threshold = TOLERANCE
  matcher.Match_Distance = 1000

  // Short patterns can be handed to match_main as they are.
  if (exactText.length <= BITAP_LIMIT) {
    return matcher.match_main(currentSource, exactText, hintOffset)
  }

  // Long patterns: locate the leading chunk first.
  const leadingChunk = exactText.slice(0, BITAP_LIMIT)
  const approxStart = matcher.match_main(currentSource, leadingChunk, hintOffset)
  if (approxStart === -1) return -1

  const candidateText = currentSource.slice(approxStart, approxStart + exactText.length)
  if (candidateText.length === 0) return -1

  // Then make sure the whole candidate is close enough to the full pattern.
  const editDistance = matcher.diff_levenshtein(matcher.diff_main(exactText, candidateText))
  const maxEdits = Math.floor(exactText.length * TOLERANCE)
  if (editDistance <= maxEdits) {
    return approxStart
  }
  return -1
}
132
+
133
/**
 * Build the selectors stored with an annotation from a text selection.
 *
 * The result pairs the original line/column position with a quote made of the
 * selected text plus up to 30 characters of context on each side, taken from
 * `source`.
 *
 * @param {object} selection
 * @param {string} selection.exact - The selected text.
 * @param {number} selection.startLine - 1-based line where the selection starts.
 * @param {number} selection.startColumn - 1-based column where it starts.
 * @param {number} selection.endLine - 1-based line where the selection ends.
 * @param {number} selection.endColumn - 1-based column where it ends.
 * @param {string} selection.source - Full text the selection was made in.
 * @returns {{position: object, quote: {exact: string, prefix: string, suffix: string}}}
 */
export function createSelector({ exact, startLine, startColumn, endLine, endColumn, source }) {
  const CONTEXT_CHARS = 30

  const selectionStart = lineColumnToOffset(source, startLine, startColumn)
  const selectionEnd = lineColumnToOffset(source, endLine, endColumn)

  const position = { startLine, startColumn, endLine, endColumn }
  const quote = {
    exact,
    prefix: source.slice(Math.max(0, selectionStart - CONTEXT_CHARS), selectionStart),
    suffix: source.slice(selectionEnd, selectionEnd + CONTEXT_CHARS),
  }

  return { position, quote }
}
148
+
149
/**
 * Re-locate an annotation in `currentSource`, trying progressively looser
 * strategies and reporting which one succeeded:
 *
 *   'position' - the quoted text still sits at the stored line/column
 *   'exact'    - the quoted text was found verbatim elsewhere; when it occurs
 *                several times the stored prefix/suffix decide, with distance
 *                from the stored position as the tie-breaker
 *   'fuzzy'    - an approximate match was found via diff-match-patch
 *   'orphan'   - nothing matched (or the annotation has no quoted text)
 *
 * @param {{selectors: {position?: object, quote?: {exact: string, prefix?: string, suffix?: string}}}} annotation
 *   Annotation whose `selectors` hold the stored position and quote.
 * @param {string} currentSource - Current text of the document.
 * @returns {{status: 'position'|'exact'|'fuzzy'|'orphan', position: object|null}}
 *   Matching strategy and the resolved 1-based position (null for orphans).
 */
export function anchor(annotation, currentSource) {
  const { position, quote } = annotation.selectors
  const exact = quote && quote.exact

  // Every strategy below searches for the quoted text; without it, give up.
  if (!exact) {
    return { status: 'orphan', position: null }
  }

  // 1. Fast path: is the quoted text still at the stored position?
  if (position) {
    const { startLine, startColumn, endLine, endColumn } = position
    const textAtPosition = extractTextByLength(currentSource, startLine, startColumn, exact.length)

    if (textAtPosition === exact) {
      // Multi-line selections are fully verified by the extraction above.
      if (startLine !== endLine) {
        return { status: 'position', position: { ...position } }
      }

      // Single-line selections must also agree with the stored end column,
      // or run exactly to the end of the line.
      const lineStr = currentSource.split('\n')[startLine - 1]
      if (lineStr != null) {
        const restOfLine = lineStr.slice(startColumn - 1)
        const withinColumns = lineStr.slice(startColumn - 1, endColumn - 1)
        if (restOfLine === exact || withinColumns === exact) {
          return { status: 'position', position: { ...position } }
        }
      }
    }
  }

  // 2. Verbatim search anywhere in the document.
  const occurrences = findAllOccurrences(currentSource, exact)

  if (occurrences.length === 1) {
    return {
      status: 'exact',
      position: offsetToPosition(currentSource, occurrences[0], exact.length),
    }
  }

  if (occurrences.length > 1) {
    const origOffset = position
      ? lineColumnToOffset(currentSource, position.startLine, position.startColumn)
      : 0

    // Highest context score wins; equal scores prefer the occurrence closest
    // to where the annotation used to be.
    let chosenOffset = -1
    let topScore = -1
    let topDistance = Infinity

    for (const candidateOffset of occurrences) {
      const score = scorePrefixSuffix(
        currentSource,
        candidateOffset,
        exact.length,
        quote.prefix,
        quote.suffix,
      )
      const distance = Math.abs(candidateOffset - origOffset)
      const isBetter = score > topScore || (score === topScore && distance < topDistance)

      if (isBetter) {
        topScore = score
        topDistance = distance
        chosenOffset = candidateOffset
      }
    }

    if (chosenOffset !== -1) {
      return {
        status: 'exact',
        position: offsetToPosition(currentSource, chosenOffset, exact.length),
      }
    }
  }

  // 3. Approximate search, seeded with the stored position when available.
  if (currentSource.length > 0) {
    let hintOffset = 0
    if (position) {
      const rawHint = lineColumnToOffset(currentSource, position.startLine, position.startColumn)
      // Keep the hint inside the document.
      hintOffset = Math.max(0, Math.min(rawHint, currentSource.length))
    }

    const matchIndex = fuzzyMatch(currentSource, exact, hintOffset)
    if (matchIndex !== -1) {
      return {
        status: 'fuzzy',
        position: offsetToPosition(currentSource, matchIndex, exact.length),
      }
    }
  }

  // 4. Nothing matched.
  return { status: 'orphan', position: null }
}
252
+
253
/**
 * Run `anchor` over every annotation against the same source.
 *
 * @param {Iterable<{id: *, selectors: object}>} annotations - Annotations to
 *   re-anchor; each one's `id` becomes its key in the result.
 * @param {string} currentSource - Current text of the document.
 * @returns {Map<*, {status: string, position: object|null}>} Anchoring result
 *   per annotation id, in iteration order.
 */
export function reanchorAll(annotations, currentSource) {
  const entries = [...annotations].map((annotation) => [
    annotation.id,
    anchor(annotation, currentSource),
  ])
  return new Map(entries)
}