clanka 0.2.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ChunkRepo.d.ts +0 -1
- package/dist/ChunkRepo.d.ts.map +1 -1
- package/dist/ChunkRepo.js +0 -10
- package/dist/ChunkRepo.js.map +1 -1
- package/dist/CodeChunker.d.ts +8 -1
- package/dist/CodeChunker.d.ts.map +1 -1
- package/dist/CodeChunker.js +331 -55
- package/dist/CodeChunker.js.map +1 -1
- package/dist/CodeChunker.test.js +231 -28
- package/dist/CodeChunker.test.js.map +1 -1
- package/dist/SemanticSearch.d.ts +2 -3
- package/dist/SemanticSearch.d.ts.map +1 -1
- package/dist/SemanticSearch.js +36 -15
- package/dist/SemanticSearch.js.map +1 -1
- package/package.json +13 -10
- package/src/ChunkRepo.ts +1 -12
- package/src/CodeChunker.test.ts +253 -32
- package/src/CodeChunker.ts +499 -70
- package/src/SemanticSearch.ts +45 -17
- package/src/fixtures/fiber.txt +255 -0
package/src/CodeChunker.ts
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* @since 1.0.0
|
|
3
3
|
*/
|
|
4
|
-
import { createHash } from "node:crypto"
|
|
5
4
|
import * as Array from "effect/Array"
|
|
6
5
|
import * as Effect from "effect/Effect"
|
|
7
6
|
import * as FileSystem from "effect/FileSystem"
|
|
@@ -12,6 +11,9 @@ import * as ServiceMap from "effect/ServiceMap"
|
|
|
12
11
|
import * as Stream from "effect/Stream"
|
|
13
12
|
import * as ChildProcess from "effect/unstable/process/ChildProcess"
|
|
14
13
|
import * as ChildProcessSpawner from "effect/unstable/process/ChildProcessSpawner"
|
|
14
|
+
import TreeSitter, { type SyntaxNode } from "tree-sitter"
|
|
15
|
+
import TreeSitterJavaScript from "tree-sitter-javascript"
|
|
16
|
+
import TreeSitterTypeScript from "tree-sitter-typescript"
|
|
15
17
|
|
|
16
18
|
/**
|
|
17
19
|
* @since 1.0.0
|
|
@@ -21,10 +23,25 @@ export interface CodeChunk {
|
|
|
21
23
|
readonly path: string
|
|
22
24
|
readonly startLine: number
|
|
23
25
|
readonly endLine: number
|
|
24
|
-
readonly
|
|
26
|
+
readonly name: string | undefined
|
|
27
|
+
readonly type: ChunkType | undefined
|
|
28
|
+
readonly parent: string | undefined
|
|
25
29
|
readonly content: string
|
|
26
30
|
}
|
|
27
31
|
|
|
32
|
+
/**
 * The kind of declaration a chunk was extracted from.
 *
 * @since 1.0.0
 * @category Models
 */
export type ChunkType =
  | "function"
  | "method"
  | "class"
  | "interface"
  | "type-alias"
  | "enum"
  | "variable"
|
|
44
|
+
|
|
28
45
|
/**
|
|
29
46
|
* @since 1.0.0
|
|
30
47
|
* @category Services
|
|
@@ -74,8 +91,6 @@ const sourceExtensions = new Set([
|
|
|
74
91
|
"ini",
|
|
75
92
|
"java",
|
|
76
93
|
"js",
|
|
77
|
-
"json",
|
|
78
|
-
"jsonc",
|
|
79
94
|
"jsx",
|
|
80
95
|
"kt",
|
|
81
96
|
"kts",
|
|
@@ -84,7 +99,6 @@ const sourceExtensions = new Set([
|
|
|
84
99
|
"mjs",
|
|
85
100
|
"mts",
|
|
86
101
|
"php",
|
|
87
|
-
"properties",
|
|
88
102
|
"py",
|
|
89
103
|
"rb",
|
|
90
104
|
"rs",
|
|
@@ -95,13 +109,10 @@ const sourceExtensions = new Set([
|
|
|
95
109
|
"sql",
|
|
96
110
|
"svelte",
|
|
97
111
|
"swift",
|
|
98
|
-
"toml",
|
|
99
112
|
"ts",
|
|
100
113
|
"tsx",
|
|
101
114
|
"vue",
|
|
102
115
|
"xml",
|
|
103
|
-
"yaml",
|
|
104
|
-
"yml",
|
|
105
116
|
"zsh",
|
|
106
117
|
])
|
|
107
118
|
|
|
@@ -114,29 +125,6 @@ const documentationExtensions = new Set([
|
|
|
114
125
|
"txt",
|
|
115
126
|
])
|
|
116
127
|
|
|
117
|
-
const allowedBareFileNames = new Set([
|
|
118
|
-
".editorconfig",
|
|
119
|
-
".gitignore",
|
|
120
|
-
".npmrc",
|
|
121
|
-
".nvmrc",
|
|
122
|
-
"dockerfile",
|
|
123
|
-
"justfile",
|
|
124
|
-
"license",
|
|
125
|
-
"makefile",
|
|
126
|
-
"readme",
|
|
127
|
-
])
|
|
128
|
-
|
|
129
|
-
const ignoredFileNames = new Set([
|
|
130
|
-
"bun.lock",
|
|
131
|
-
"bun.lockb",
|
|
132
|
-
"cargo.lock",
|
|
133
|
-
"composer.lock",
|
|
134
|
-
"package-lock.json",
|
|
135
|
-
"pnpm-lock.yaml",
|
|
136
|
-
"poetry.lock",
|
|
137
|
-
"yarn.lock",
|
|
138
|
-
])
|
|
139
|
-
|
|
140
128
|
const ignoredDirectories = new Set([
|
|
141
129
|
".git",
|
|
142
130
|
".next",
|
|
@@ -155,14 +143,36 @@ const normalizePath = (path: string): string => path.replace(/\\/g, "/")
|
|
|
155
143
|
const normalizeText = (content: string): string =>
|
|
156
144
|
content.replace(/\r\n/g, "\n").replace(/\r/g, "\n")
|
|
157
145
|
|
|
158
|
-
const hashContent = (content: string): string =>
|
|
159
|
-
createHash("sha256").update(content).digest("hex")
|
|
160
|
-
|
|
161
146
|
const meaningfulLinePattern = /[^\s\p{P}]/u
|
|
162
147
|
|
|
163
148
|
const isMeaningfulLine = (line: string): boolean =>
|
|
164
149
|
meaningfulLinePattern.test(line)
|
|
165
150
|
|
|
151
|
+
/** Inclusive span of 1-based line numbers within a file. */
interface LineRange {
  /** First line of the span (1-based). */
  readonly startLine: number
  /** Last line of the span (1-based, inclusive). */
  readonly endLine: number
}
|
|
155
|
+
|
|
156
|
+
/** Line-window parameters used when a range has to be split into chunks. */
interface ChunkSettings {
  /** Maximum number of lines in one chunk. */
  readonly chunkSize: number
  /** Number of lines shared between two consecutive chunks. */
  readonly chunkOverlap: number
}
|
|
160
|
+
|
|
161
|
+
/** A line range annotated with the declaration metadata copied onto chunks. */
interface ChunkRange extends LineRange {
  /** Declared name, when one could be read from the syntax node. */
  readonly name: string | undefined
  /** Declaration kind, when the syntax node type is recognised. */
  readonly type: ChunkType | undefined
  /** Label of the enclosing declaration (e.g. the owning class), if any. */
  readonly parent: string | undefined
}
|
|
166
|
+
|
|
167
|
+
/** Tree-sitter grammar to use for each supported (lower-case) file extension. */
const languageByExtension = new Map<string, unknown>()
  .set("js", TreeSitterJavaScript)
  .set("jsx", TreeSitterJavaScript)
  .set("ts", TreeSitterTypeScript.typescript)
  .set("tsx", TreeSitterTypeScript.tsx)
|
|
173
|
+
|
|
174
|
+
// const ignoredTopLevelNodeTypes = new Set(["comment", "import_statement"])
|
|
175
|
+
|
|
166
176
|
/**
|
|
167
177
|
* @since 1.0.0
|
|
168
178
|
* @category Predicates
|
|
@@ -205,22 +215,10 @@ export const isMeaningfulFile = (path: string): boolean => {
|
|
|
205
215
|
return false
|
|
206
216
|
}
|
|
207
217
|
|
|
208
|
-
if (ignoredFileNames.has(fileName)) {
|
|
209
|
-
return false
|
|
210
|
-
}
|
|
211
|
-
|
|
212
218
|
if (/\.min\.(?:css|js)$/i.test(fileName)) {
|
|
213
219
|
return false
|
|
214
220
|
}
|
|
215
221
|
|
|
216
|
-
if (fileName.endsWith(".map")) {
|
|
217
|
-
return false
|
|
218
|
-
}
|
|
219
|
-
|
|
220
|
-
if (allowedBareFileNames.has(fileName)) {
|
|
221
|
-
return true
|
|
222
|
-
}
|
|
223
|
-
|
|
224
222
|
const extensionIndex = fileName.lastIndexOf(".")
|
|
225
223
|
if (extensionIndex === -1) {
|
|
226
224
|
return false
|
|
@@ -235,7 +233,7 @@ export const isMeaningfulFile = (path: string): boolean => {
|
|
|
235
233
|
const resolveChunkSettings = (options: {
|
|
236
234
|
readonly chunkSize: number
|
|
237
235
|
readonly chunkOverlap: number
|
|
238
|
-
}) => {
|
|
236
|
+
}): ChunkSettings => {
|
|
239
237
|
const chunkSize = Math.max(1, options.chunkSize)
|
|
240
238
|
const chunkOverlap = Math.max(
|
|
241
239
|
0,
|
|
@@ -248,33 +246,420 @@ const resolveChunkSettings = (options: {
|
|
|
248
246
|
}
|
|
249
247
|
}
|
|
250
248
|
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
249
|
+
/**
 * Returns the lower-cased text after the last "." of the final "/"-separated
 * path segment, or `undefined` when that segment contains no ".".
 */
const getPathExtension = (path: string): string | undefined => {
  // lastIndexOf yields -1 when there is no "/", so the slice starts at 0.
  const baseName = path.slice(path.lastIndexOf("/") + 1)
  const dotAt = baseName.lastIndexOf(".")
  return dotAt < 0 ? undefined : baseName.slice(dotAt + 1).toLowerCase()
}
|
|
262
|
+
|
|
263
|
+
/**
 * Looks up the tree-sitter grammar registered for the path's extension.
 * Returns `undefined` when the path has no extension or none is registered.
 */
const resolveAstLanguage = (path: string): unknown => {
  const extension = getPathExtension(path)
  return extension === undefined
    ? undefined
    : languageByExtension.get(extension)
}
|
|
271
|
+
|
|
272
|
+
/**
 * Converts a node's zero-based start/end rows into an inclusive 1-based line
 * range. The end line is never allowed to fall before the start line.
 */
const lineRangeFromNode = (node: SyntaxNode): LineRange => {
  const firstLine = node.startPosition.row + 1
  const lastLine = node.endPosition.row + 1
  return { startLine: firstLine, endLine: Math.max(firstLine, lastLine) }
}
|
|
280
|
+
|
|
281
|
+
/**
 * Reports whether every line in the inclusive 1-based range
 * `startLine..endLine` is empty or whitespace-only. An empty range
 * (`startLine > endLine`) counts as whitespace-only, and line numbers outside
 * `lines` are treated as empty.
 */
const hasOnlyWhitespaceLines = (
  lines: ReadonlyArray<string>,
  startLine: number,
  endLine: number,
): boolean => {
  let lineNumber = startLine
  while (lineNumber <= endLine) {
    const text = lines[lineNumber - 1]
    if (text !== undefined && text.trim() !== "") {
      return false
    }
    lineNumber += 1
  }
  return true
}
|
|
298
|
+
|
|
299
|
+
/**
 * Computes the line range of `node`, extended upwards over the comments that
 * directly precede it among `siblings`.
 *
 * Walking backwards from `nodeIndex`, a comment is absorbed when only blank
 * lines separate it from the range collected so far. The walk stops at the
 * first non-comment sibling, at a comment separated by non-blank lines, or at
 * a comment that trails code on its own line (e.g. `const a = 1 // note`),
 * because such a comment belongs to the preceding statement.
 *
 * @param node the declaration whose range is wanted
 * @param siblings all named children of the node's parent
 * @param nodeIndex position of `node` within `siblings`
 * @param lines the file's lines, used to inspect the gaps between nodes
 */
const lineRangeWithLeadingComments = (
  node: SyntaxNode,
  siblings: ReadonlyArray<SyntaxNode>,
  nodeIndex: number,
  lines: ReadonlyArray<string>,
): LineRange => {
  // True when the comment at `commentIndex` starts on a line that already
  // holds the end of a non-comment sibling (possibly behind other comments).
  const trailsCodeOnSameLine = (commentIndex: number): boolean => {
    let row = siblings[commentIndex]?.startPosition.row
    for (let index = commentIndex - 1; index >= 0; index--) {
      const previous = siblings[index]
      if (previous === undefined || previous.endPosition.row !== row) {
        return false
      }
      if (previous.type !== "comment") {
        return true
      }
      row = previous.startPosition.row
    }
    return false
  }

  const baseRange = lineRangeFromNode(node)
  let startLine = baseRange.startLine

  for (let index = nodeIndex - 1; index >= 0; index--) {
    const sibling = siblings[index]
    if (sibling === undefined || sibling.type !== "comment") {
      break
    }

    if (trailsCodeOnSameLine(index)) {
      break
    }

    const commentRange = lineRangeFromNode(sibling)
    // Only blank lines may sit between the comment and what follows it.
    if (
      !hasOnlyWhitespaceLines(lines, commentRange.endLine + 1, startLine - 1)
    ) {
      break
    }

    startLine = commentRange.startLine
  }

  return {
    startLine,
    endLine: baseRange.endLine,
  }
}
|
|
329
|
+
|
|
330
|
+
/**
 * Clamps both ends of `range` into `1..lineCount`. Returns `undefined` when
 * the clamped end falls before the clamped start.
 */
const normalizeLineRange = (
  range: LineRange,
  lineCount: number,
): LineRange | undefined => {
  const clamp = (line: number): number =>
    Math.max(1, Math.min(lineCount, line))

  const startLine = clamp(range.startLine)
  const endLine = clamp(range.endLine)
  return endLine < startLine ? undefined : { startLine, endLine }
}
|
|
346
|
+
|
|
347
|
+
/**
 * Splits `range` into consecutive windows of at most `settings.chunkSize`
 * lines, each starting `chunkSize - chunkOverlap` lines after the previous
 * one. A range that already fits in one window is returned unchanged.
 *
 * The window size and the step are both forced to be at least 1, so settings
 * whose overlap is greater than or equal to the chunk size still make
 * progress instead of looping forever.
 *
 * @param range inclusive 1-based line range to split
 * @param settings window size and overlap, in lines
 * @returns the windows in ascending order; the last one ends at `range.endLine`
 */
const splitRange = (
  range: LineRange,
  settings: ChunkSettings,
): ReadonlyArray<LineRange> => {
  const chunkSize = Math.max(1, settings.chunkSize)
  const lineCount = range.endLine - range.startLine + 1
  if (lineCount <= chunkSize) {
    return [range]
  }

  // A non-positive step would never advance the window start.
  const step = Math.max(1, chunkSize - settings.chunkOverlap)
  const out: Array<LineRange> = []

  for (
    let startLine = range.startLine;
    startLine <= range.endLine;
    startLine += step
  ) {
    const endLine = Math.min(range.endLine, startLine + chunkSize - 1)
    out.push({ startLine, endLine })

    // The final window reached the end of the range; later starts would only
    // produce windows fully contained in this one.
    if (endLine >= range.endLine) {
      break
    }
  }

  return out
}
|
|
377
|
+
|
|
378
|
+
/**
 * Returns the node's source text with surrounding whitespace removed and
 * every internal whitespace run collapsed to one space. Yields `undefined`
 * for a `null` node or when nothing but whitespace remains.
 */
const nodeText = (node: SyntaxNode | null): string | undefined => {
  if (node === null) {
    return undefined
  }

  const collapsed = node.text
    .split(/\s+/)
    .filter((piece) => piece.length > 0)
    .join(" ")
  return collapsed === "" ? undefined : collapsed
}
|
|
386
|
+
|
|
387
|
+
/**
 * Reads the normalised text of the child stored under `fieldName`, or
 * `undefined` when the node has no such child or its text is blank.
 */
const nodeFieldText = (
  node: SyntaxNode,
  fieldName: string,
): string | undefined => {
  const fieldNode = node.childForFieldName(fieldName)
  return nodeText(fieldNode)
}
|
|
391
|
+
|
|
392
|
+
/**
 * For an `export_statement`, returns the node in its `declaration` field;
 * any other node, or an export without that field, is returned unchanged.
 */
const unwrapExportNode = (node: SyntaxNode): SyntaxNode => {
  const declaration =
    node.type === "export_statement"
      ? node.childForFieldName("declaration")
      : null
  return declaration ?? node
}
|
|
399
|
+
|
|
400
|
+
/** Collects the `variable_declarator` named children of a declaration node. */
const variableDeclarators = (node: SyntaxNode): ReadonlyArray<SyntaxNode> => {
  const declarators: Array<SyntaxNode> = []
  for (const child of node.namedChildren) {
    if (child.type === "variable_declarator") {
      declarators.push(child)
    }
  }
  return declarators
}
|
|
402
|
+
|
|
403
|
+
/**
 * Classifies one declarator: "function" when the node in its `value` field
 * has a type containing "function", otherwise "variable".
 */
const variableTypeFromDeclarator = (node: SyntaxNode): ChunkType => {
  const initializer = node.childForFieldName("value")
  const isFunctionValue =
    initializer !== null && initializer.type.includes("function")
  return isFunctionValue ? "function" : "variable"
}
|
|
410
|
+
|
|
411
|
+
/**
 * Classifies a whole declaration: "function" as soon as any of its
 * declarators is classified as a function, otherwise "variable".
 */
const variableTypeFromDeclaration = (node: SyntaxNode): ChunkType => {
  for (const declarator of variableDeclarators(node)) {
    if (variableTypeFromDeclarator(declarator) === "function") {
      return "function"
    }
  }
  return "variable"
}
|
|
422
|
+
|
|
423
|
+
/**
 * Maps a syntax node's type to the chunk kind stored on the chunk.
 *
 * Variable declarations and declarators are classified by their initialiser
 * (function vs. plain variable). Node types that are not listed yield
 * `undefined`.
 */
const chunkTypeFromNode = (node: SyntaxNode): ChunkType | undefined => {
  switch (node.type) {
    // tree-sitter-typescript reports `abstract class X {}` under its own
    // node type rather than as `class_declaration`.
    case "abstract_class_declaration":
    case "class_declaration":
      return "class"
    case "enum_declaration":
      return "enum"
    case "function_declaration":
    case "generator_function_declaration":
      return "function"
    case "interface_declaration":
      return "interface"
    case "generator_method_definition":
    case "method_definition":
      return "method"
    case "type_alias_declaration":
      return "type-alias"
    case "lexical_declaration":
    case "variable_declaration":
      return variableTypeFromDeclaration(node)
    case "variable_declarator":
      return variableTypeFromDeclarator(node)
    default:
      return undefined
  }
}
|
|
448
|
+
|
|
449
|
+
/**
 * Joins the names of all declarators in a declaration with ", ".
 * Returns `undefined` when no declarator yields a name.
 */
const variableNamesFromDeclaration = (node: SyntaxNode): string | undefined => {
  const names: Array<string> = []
  for (const declarator of variableDeclarators(node)) {
    const declaredName = nodeFieldText(declarator, "name")
    if (declaredName !== undefined) {
      names.push(declaredName)
    }
  }

  return names.length > 0 ? names.join(", ") : undefined
}
|
|
460
|
+
|
|
461
|
+
/**
 * Extracts the declared name for a syntax node.
 *
 * Most declarations expose it through their `name` field; variable
 * declarations report the comma-separated names of their declarators. Node
 * types that are not listed yield `undefined`.
 */
const nameFromNode = (node: SyntaxNode): string | undefined => {
  switch (node.type) {
    // tree-sitter-typescript uses a dedicated node type for `abstract class`;
    // it carries a `name` field like a regular class declaration.
    case "abstract_class_declaration":
    case "class_declaration":
    case "enum_declaration":
    case "function_declaration":
    case "generator_function_declaration":
    case "interface_declaration":
    case "generator_method_definition":
    case "method_definition":
    case "type_alias_declaration":
    case "variable_declarator":
      return nodeFieldText(node, "name")
    case "lexical_declaration":
    case "variable_declaration":
      return variableNamesFromDeclaration(node)
    default:
      return undefined
  }
}
|
|
480
|
+
|
|
481
|
+
/**
 * Builds the parent label "<type> <name>". When only one of the two parts is
 * defined that part is returned alone; when neither is, `undefined`.
 */
const formatParent = (
  type: ChunkType | undefined,
  name: string | undefined,
): string | undefined => {
  const parts = [type, name].filter(
    (part): part is string => part !== undefined,
  )
  return parts.length === 0 ? undefined : parts.join(" ")
}
|
|
496
|
+
|
|
497
|
+
/**
 * Collects one range per member of the class body whose node type contains
 * "method", each extended over its leading comments and tagged with `parent`.
 *
 * @param classNode the class declaration node; its `body` field is scanned
 * @param parent label stored on every returned range
 * @param lines the file's lines, used when attaching leading comments
 * @returns the member ranges in source order, or `[]` when there is no body
 */
const collectClassMethodRanges = (
  classNode: SyntaxNode,
  parent: string | undefined,
  lines: ReadonlyArray<string>,
): ReadonlyArray<ChunkRange> => {
  const body = classNode.childForFieldName("body")
  if (body === null) {
    return []
  }

  // Read the child list once and reuse it; the tree-sitter binding may
  // rebuild the array on every property access.
  const members = body.namedChildren
  const out: Array<ChunkRange> = []

  members.forEach((member, index) => {
    if (!member.type.includes("method")) {
      return
    }

    out.push({
      ...lineRangeWithLeadingComments(member, members, index, lines),
      name: nameFromNode(member),
      type: chunkTypeFromNode(member),
      parent,
    })
  })

  return out
}
|
|
524
|
+
|
|
525
|
+
/**
 * Parses `content` with the tree-sitter grammar registered for `path` and
 * returns one range per top-level declaration, plus one range per method of
 * every top-level class (regular or abstract).
 *
 * Top-level comments and import nodes are skipped; comments directly above a
 * declaration are folded into that declaration's range. Exported declarations
 * are described by the declaration they wrap, while the range still covers
 * the whole export statement.
 *
 * @param path normalised file path, used only to pick the grammar
 * @param content normalised file text handed to the parser
 * @param lines `content` split into lines
 * @returns the collected ranges, or `[]` when no grammar is registered for the
 *   path or when parsing throws
 */
const collectAstRanges = (
  path: string,
  content: string,
  lines: ReadonlyArray<string>,
): ReadonlyArray<ChunkRange> => {
  const language = resolveAstLanguage(path)
  if (language === undefined) {
    return []
  }

  try {
    const parser = new TreeSitter()
    parser.setLanguage(language)
    const tree = parser.parse(content, undefined, {
      bufferSize: 1024 * 1024,
    })
    const out = [] as Array<ChunkRange>

    const topLevelNodes = tree.rootNode.namedChildren
    for (let index = 0; index < topLevelNodes.length; index++) {
      const topLevelNode = topLevelNodes[index]!
      if (
        topLevelNode.type === "comment" ||
        topLevelNode.type.includes("import")
      ) {
        continue
      }

      const declarationNode = unwrapExportNode(topLevelNode)
      const type = chunkTypeFromNode(declarationNode)
      const name = nameFromNode(declarationNode)

      out.push({
        ...lineRangeWithLeadingComments(
          topLevelNode,
          topLevelNodes,
          index,
          lines,
        ),
        name,
        type,
        parent: undefined,
      })

      // tree-sitter-typescript parses `abstract class` into its own node
      // type; its methods must be collected just like a regular class's.
      if (
        declarationNode.type === "class_declaration" ||
        declarationNode.type === "abstract_class_declaration"
      ) {
        const parent = formatParent(type, name)
        out.push(...collectClassMethodRanges(declarationNode, parent, lines))
      }
    }

    return out
  } catch {
    // Best effort: a parser failure yields no ranges rather than an error.
    return []
  }
}
|
|
580
|
+
|
|
581
|
+
/**
 * Turns annotated line ranges into chunks.
 *
 * Each range is clamped to the file, split into windows according to
 * `settings`, de-duplicated on (start, end, name, type, parent), and dropped
 * when none of its lines is meaningful. A class range that needs more than
 * one window and has method ranges inside it keeps only its first window.
 * The result is ordered by start line, then end line, then name.
 */
const chunksFromRanges = (
  path: string,
  lines: ReadonlyArray<string>,
  ranges: ReadonlyArray<ChunkRange>,
  settings: ChunkSettings,
): ReadonlyArray<CodeChunk> => {
  // True when some method range names this class as parent and lies inside it.
  const classHasMethodRanges = (
    classSpan: LineRange,
    className: string | undefined,
  ): boolean => {
    const expectedParent = formatParent("class", className)
    for (const candidate of ranges) {
      if (
        candidate.type === "method" &&
        candidate.parent === expectedParent &&
        candidate.startLine >= classSpan.startLine &&
        candidate.endLine <= classSpan.endLine
      ) {
        return true
      }
    }
    return false
  }

  const chunks: Array<CodeChunk> = []
  const emittedKeys = new Set<string>()

  for (const range of ranges) {
    const bounded = normalizeLineRange(range, lines.length)
    if (bounded === undefined) {
      continue
    }

    let segments = splitRange(bounded, settings)
    const [firstSegment] = segments
    if (
      range.type === "class" &&
      segments.length > 1 &&
      firstSegment !== undefined &&
      classHasMethodRanges(bounded, range.name)
    ) {
      segments = [firstSegment]
    }

    for (const segment of segments) {
      const key = [
        segment.startLine,
        segment.endLine,
        range.name ?? "",
        range.type ?? "",
        range.parent ?? "",
      ].join(":")
      if (emittedKeys.has(key)) {
        continue
      }
      emittedKeys.add(key)

      const chunkLines = lines.slice(segment.startLine - 1, segment.endLine)
      if (!chunkLines.some(isMeaningfulLine)) {
        continue
      }

      chunks.push({
        path,
        startLine: segment.startLine,
        endLine: segment.endLine,
        name: range.name,
        type: range.type,
        parent: range.parent,
        content: chunkLines.join("\n"),
      })
    }
  }

  // Sort a copy so the accumulated array itself is never reordered.
  return [...chunks].sort(
    (left, right) =>
      left.startLine - right.startLine ||
      left.endLine - right.endLine ||
      (left.name ?? "").localeCompare(right.name ?? ""),
  )
}
|
|
657
|
+
|
|
658
|
+
const chunkWithLineWindows = (
|
|
659
|
+
path: string,
|
|
660
|
+
lines: ReadonlyArray<string>,
|
|
661
|
+
settings: ChunkSettings,
|
|
662
|
+
): ReadonlyArray<CodeChunk> => {
|
|
278
663
|
const step = settings.chunkSize - settings.chunkOverlap
|
|
279
664
|
const out = [] as Array<CodeChunk>
|
|
280
665
|
|
|
@@ -287,14 +672,15 @@ export const chunkFileContent = (
|
|
|
287
672
|
const start = index
|
|
288
673
|
const end = Math.min(lines.length, start + settings.chunkSize)
|
|
289
674
|
const chunkLines = lines.slice(start, end)
|
|
290
|
-
const chunkContent = chunkLines.join("\n")
|
|
291
675
|
|
|
292
676
|
out.push({
|
|
293
|
-
path
|
|
677
|
+
path,
|
|
294
678
|
startLine: start + 1,
|
|
295
679
|
endLine: end,
|
|
296
|
-
|
|
297
|
-
|
|
680
|
+
name: undefined,
|
|
681
|
+
type: undefined,
|
|
682
|
+
parent: undefined,
|
|
683
|
+
content: chunkLines.join("\n"),
|
|
298
684
|
})
|
|
299
685
|
|
|
300
686
|
index += step
|
|
@@ -307,6 +693,49 @@ export const chunkFileContent = (
|
|
|
307
693
|
return out
|
|
308
694
|
}
|
|
309
695
|
|
|
696
|
+
/**
 * Splits a file's content into chunks.
 *
 * Blank or probably-minified content yields no chunks. Otherwise the path and
 * text are normalised and chunks are built from the syntax tree when that
 * produces any; if not, the file is cut into plain line windows.
 *
 * @since 1.0.0
 * @category Constructors
 */
export const chunkFileContent = (
  path: string,
  content: string,
  options: {
    readonly chunkSize: number
    readonly chunkOverlap: number
  },
): ReadonlyArray<CodeChunk> => {
  const isBlank = content.trim().length === 0
  if (isBlank || isProbablyMinified(content)) {
    return []
  }

  const normalizedPath = normalizePath(path)
  const normalizedContent = normalizeText(content)

  const lines = normalizedContent.split("\n")
  // A trailing newline leaves one empty last entry; drop it.
  if (lines[lines.length - 1] === "") {
    lines.pop()
  }
  if (lines.length === 0) {
    return []
  }

  const settings = resolveChunkSettings(options)
  const astRanges = collectAstRanges(normalizedPath, normalizedContent, lines)
  const astChunks =
    astRanges.length > 0
      ? chunksFromRanges(normalizedPath, lines, astRanges, settings)
      : []

  return astChunks.length > 0
    ? astChunks
    : chunkWithLineWindows(normalizedPath, lines, settings)
}
|
|
738
|
+
|
|
310
739
|
/**
|
|
311
740
|
* @since 1.0.0
|
|
312
741
|
* @category Layers
|