clanka 0.2.5 → 0.2.6

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,6 @@
1
1
  /**
2
2
  * @since 1.0.0
3
3
  */
4
- import { createHash } from "node:crypto"
5
4
  import * as Array from "effect/Array"
6
5
  import * as Effect from "effect/Effect"
7
6
  import * as FileSystem from "effect/FileSystem"
@@ -12,6 +11,9 @@ import * as ServiceMap from "effect/ServiceMap"
12
11
  import * as Stream from "effect/Stream"
13
12
  import * as ChildProcess from "effect/unstable/process/ChildProcess"
14
13
  import * as ChildProcessSpawner from "effect/unstable/process/ChildProcessSpawner"
14
+ import TreeSitter, { type SyntaxNode } from "tree-sitter"
15
+ import TreeSitterJavaScript from "tree-sitter-javascript"
16
+ import TreeSitterTypeScript from "tree-sitter-typescript"
15
17
 
16
18
  /**
17
19
  * @since 1.0.0
@@ -21,10 +23,25 @@ export interface CodeChunk {
21
23
  readonly path: string
22
24
  readonly startLine: number
23
25
  readonly endLine: number
24
- readonly contentHash: string
26
+ readonly name: string | undefined
27
+ readonly type: ChunkType | undefined
28
+ readonly parent: string | undefined
25
29
  readonly content: string
26
30
  }
27
31
 
32
+ /**
33
+ * @since 1.0.0
34
+ * @category Models
35
+ */
36
+ export type ChunkType =
37
+ | "function"
38
+ | "method"
39
+ | "class"
40
+ | "interface"
41
+ | "type-alias"
42
+ | "enum"
43
+ | "variable"
44
+
28
45
  /**
29
46
  * @since 1.0.0
30
47
  * @category Services
@@ -74,8 +91,6 @@ const sourceExtensions = new Set([
74
91
  "ini",
75
92
  "java",
76
93
  "js",
77
- "json",
78
- "jsonc",
79
94
  "jsx",
80
95
  "kt",
81
96
  "kts",
@@ -84,7 +99,6 @@ const sourceExtensions = new Set([
84
99
  "mjs",
85
100
  "mts",
86
101
  "php",
87
- "properties",
88
102
  "py",
89
103
  "rb",
90
104
  "rs",
@@ -95,13 +109,10 @@ const sourceExtensions = new Set([
95
109
  "sql",
96
110
  "svelte",
97
111
  "swift",
98
- "toml",
99
112
  "ts",
100
113
  "tsx",
101
114
  "vue",
102
115
  "xml",
103
- "yaml",
104
- "yml",
105
116
  "zsh",
106
117
  ])
107
118
 
@@ -114,29 +125,6 @@ const documentationExtensions = new Set([
114
125
  "txt",
115
126
  ])
116
127
 
117
- const allowedBareFileNames = new Set([
118
- ".editorconfig",
119
- ".gitignore",
120
- ".npmrc",
121
- ".nvmrc",
122
- "dockerfile",
123
- "justfile",
124
- "license",
125
- "makefile",
126
- "readme",
127
- ])
128
-
129
- const ignoredFileNames = new Set([
130
- "bun.lock",
131
- "bun.lockb",
132
- "cargo.lock",
133
- "composer.lock",
134
- "package-lock.json",
135
- "pnpm-lock.yaml",
136
- "poetry.lock",
137
- "yarn.lock",
138
- ])
139
-
140
128
  const ignoredDirectories = new Set([
141
129
  ".git",
142
130
  ".next",
@@ -155,14 +143,36 @@ const normalizePath = (path: string): string => path.replace(/\\/g, "/")
155
143
  const normalizeText = (content: string): string =>
156
144
  content.replace(/\r\n/g, "\n").replace(/\r/g, "\n")
157
145
 
158
- const hashContent = (content: string): string =>
159
- createHash("sha256").update(content).digest("hex")
160
-
161
146
  const meaningfulLinePattern = /[^\s\p{P}]/u
162
147
 
163
148
  const isMeaningfulLine = (line: string): boolean =>
164
149
  meaningfulLinePattern.test(line)
165
150
 
151
+ interface LineRange {
152
+ readonly startLine: number
153
+ readonly endLine: number
154
+ }
155
+
156
+ interface ChunkSettings {
157
+ readonly chunkSize: number
158
+ readonly chunkOverlap: number
159
+ }
160
+
161
+ interface ChunkRange extends LineRange {
162
+ readonly name: string | undefined
163
+ readonly type: ChunkType | undefined
164
+ readonly parent: string | undefined
165
+ }
166
+
167
+ const languageByExtension = new Map<string, unknown>([
168
+ ["js", TreeSitterJavaScript],
169
+ ["jsx", TreeSitterJavaScript],
170
+ ["ts", TreeSitterTypeScript.typescript],
171
+ ["tsx", TreeSitterTypeScript.tsx],
172
+ ])
173
+
174
+ // const ignoredTopLevelNodeTypes = new Set(["comment", "import_statement"])
175
+
166
176
  /**
167
177
  * @since 1.0.0
168
178
  * @category Predicates
@@ -205,22 +215,10 @@ export const isMeaningfulFile = (path: string): boolean => {
205
215
  return false
206
216
  }
207
217
 
208
- if (ignoredFileNames.has(fileName)) {
209
- return false
210
- }
211
-
212
218
  if (/\.min\.(?:css|js)$/i.test(fileName)) {
213
219
  return false
214
220
  }
215
221
 
216
- if (fileName.endsWith(".map")) {
217
- return false
218
- }
219
-
220
- if (allowedBareFileNames.has(fileName)) {
221
- return true
222
- }
223
-
224
222
  const extensionIndex = fileName.lastIndexOf(".")
225
223
  if (extensionIndex === -1) {
226
224
  return false
@@ -235,7 +233,7 @@ export const isMeaningfulFile = (path: string): boolean => {
235
233
  const resolveChunkSettings = (options: {
236
234
  readonly chunkSize: number
237
235
  readonly chunkOverlap: number
238
- }) => {
236
+ }): ChunkSettings => {
239
237
  const chunkSize = Math.max(1, options.chunkSize)
240
238
  const chunkOverlap = Math.max(
241
239
  0,
@@ -248,33 +246,420 @@ const resolveChunkSettings = (options: {
248
246
  }
249
247
  }
250
248
 
251
- /**
252
- * @since 1.0.0
253
- * @category Constructors
254
- */
255
- export const chunkFileContent = (
249
+ const getPathExtension = (path: string): string | undefined => {
250
+ const fileName = path.split("/").at(-1)
251
+ if (fileName === undefined) {
252
+ return undefined
253
+ }
254
+
255
+ const extensionIndex = fileName.lastIndexOf(".")
256
+ if (extensionIndex === -1) {
257
+ return undefined
258
+ }
259
+
260
+ return fileName.slice(extensionIndex + 1).toLowerCase()
261
+ }
262
+
263
+ const resolveAstLanguage = (path: string): unknown => {
264
+ const extension = getPathExtension(path)
265
+ if (extension === undefined) {
266
+ return undefined
267
+ }
268
+
269
+ return languageByExtension.get(extension)
270
+ }
271
+
272
+ const lineRangeFromNode = (node: SyntaxNode): LineRange => {
273
+ const startLine = node.startPosition.row + 1
274
+ const endLine = Math.max(startLine, node.endPosition.row + 1)
275
+ return {
276
+ startLine,
277
+ endLine,
278
+ }
279
+ }
280
+
281
+ const hasOnlyWhitespaceLines = (
282
+ lines: ReadonlyArray<string>,
283
+ startLine: number,
284
+ endLine: number,
285
+ ): boolean => {
286
+ if (startLine > endLine) {
287
+ return true
288
+ }
289
+
290
+ for (let lineIndex = startLine; lineIndex <= endLine; lineIndex++) {
291
+ if ((lines[lineIndex - 1] ?? "").trim().length > 0) {
292
+ return false
293
+ }
294
+ }
295
+
296
+ return true
297
+ }
298
+
299
+ const lineRangeWithLeadingComments = (
300
+ node: SyntaxNode,
301
+ siblings: ReadonlyArray<SyntaxNode>,
302
+ nodeIndex: number,
303
+ lines: ReadonlyArray<string>,
304
+ ): LineRange => {
305
+ const baseRange = lineRangeFromNode(node)
306
+ let startLine = baseRange.startLine
307
+
308
+ for (let index = nodeIndex - 1; index >= 0; index--) {
309
+ const sibling = siblings[index]!
310
+ if (sibling.type !== "comment") {
311
+ break
312
+ }
313
+
314
+ const commentRange = lineRangeFromNode(sibling)
315
+ if (
316
+ !hasOnlyWhitespaceLines(lines, commentRange.endLine + 1, startLine - 1)
317
+ ) {
318
+ break
319
+ }
320
+
321
+ startLine = commentRange.startLine
322
+ }
323
+
324
+ return {
325
+ startLine,
326
+ endLine: baseRange.endLine,
327
+ }
328
+ }
329
+
330
+ const normalizeLineRange = (
331
+ range: LineRange,
332
+ lineCount: number,
333
+ ): LineRange | undefined => {
334
+ const startLine = Math.max(1, Math.min(lineCount, range.startLine))
335
+ const endLine = Math.max(1, Math.min(lineCount, range.endLine))
336
+
337
+ if (endLine < startLine) {
338
+ return undefined
339
+ }
340
+
341
+ return {
342
+ startLine,
343
+ endLine,
344
+ }
345
+ }
346
+
347
+ const splitRange = (
348
+ range: LineRange,
349
+ settings: ChunkSettings,
350
+ ): ReadonlyArray<LineRange> => {
351
+ const lineCount = range.endLine - range.startLine + 1
352
+ if (lineCount <= settings.chunkSize) {
353
+ return [range]
354
+ }
355
+
356
+ const step = settings.chunkSize - settings.chunkOverlap
357
+ const out = [] as Array<LineRange>
358
+
359
+ for (
360
+ let startLine = range.startLine;
361
+ startLine <= range.endLine;
362
+ startLine += step
363
+ ) {
364
+ const endLine = Math.min(range.endLine, startLine + settings.chunkSize - 1)
365
+ out.push({
366
+ startLine,
367
+ endLine,
368
+ })
369
+
370
+ if (endLine >= range.endLine) {
371
+ break
372
+ }
373
+ }
374
+
375
+ return out
376
+ }
377
+
378
+ const nodeText = (node: SyntaxNode | null): string | undefined => {
379
+ if (node === null) {
380
+ return undefined
381
+ }
382
+
383
+ const value = node.text.trim().replace(/\s+/g, " ")
384
+ return value.length === 0 ? undefined : value
385
+ }
386
+
387
+ const nodeFieldText = (
388
+ node: SyntaxNode,
389
+ fieldName: string,
390
+ ): string | undefined => nodeText(node.childForFieldName(fieldName))
391
+
392
+ const unwrapExportNode = (node: SyntaxNode): SyntaxNode => {
393
+ if (node.type !== "export_statement") {
394
+ return node
395
+ }
396
+
397
+ return node.childForFieldName("declaration") ?? node
398
+ }
399
+
400
+ const variableDeclarators = (node: SyntaxNode): ReadonlyArray<SyntaxNode> =>
401
+ node.namedChildren.filter((child) => child.type === "variable_declarator")
402
+
403
+ const variableTypeFromDeclarator = (node: SyntaxNode): ChunkType => {
404
+ const value = node.childForFieldName("value")
405
+ if (value !== null && value.type.includes("function")) {
406
+ return "function"
407
+ }
408
+ return "variable"
409
+ }
410
+
411
+ const variableTypeFromDeclaration = (node: SyntaxNode): ChunkType => {
412
+ const declarators = variableDeclarators(node)
413
+ if (
414
+ declarators.some(
415
+ (declarator) => variableTypeFromDeclarator(declarator) === "function",
416
+ )
417
+ ) {
418
+ return "function"
419
+ }
420
+ return "variable"
421
+ }
422
+
423
+ const chunkTypeFromNode = (node: SyntaxNode): ChunkType | undefined => {
424
+ switch (node.type) {
425
+ case "class_declaration":
426
+ return "class"
427
+ case "enum_declaration":
428
+ return "enum"
429
+ case "function_declaration":
430
+ case "generator_function_declaration":
431
+ return "function"
432
+ case "interface_declaration":
433
+ return "interface"
434
+ case "generator_method_definition":
435
+ case "method_definition":
436
+ return "method"
437
+ case "type_alias_declaration":
438
+ return "type-alias"
439
+ case "lexical_declaration":
440
+ case "variable_declaration":
441
+ return variableTypeFromDeclaration(node)
442
+ case "variable_declarator":
443
+ return variableTypeFromDeclarator(node)
444
+ default:
445
+ return undefined
446
+ }
447
+ }
448
+
449
+ const variableNamesFromDeclaration = (node: SyntaxNode): string | undefined => {
450
+ const names = variableDeclarators(node)
451
+ .map((declarator) => nodeFieldText(declarator, "name"))
452
+ .filter((name): name is string => name !== undefined)
453
+
454
+ if (names.length === 0) {
455
+ return undefined
456
+ }
457
+
458
+ return names.join(", ")
459
+ }
460
+
461
+ const nameFromNode = (node: SyntaxNode): string | undefined => {
462
+ switch (node.type) {
463
+ case "class_declaration":
464
+ case "enum_declaration":
465
+ case "function_declaration":
466
+ case "generator_function_declaration":
467
+ case "interface_declaration":
468
+ case "generator_method_definition":
469
+ case "method_definition":
470
+ case "type_alias_declaration":
471
+ case "variable_declarator":
472
+ return nodeFieldText(node, "name")
473
+ case "lexical_declaration":
474
+ case "variable_declaration":
475
+ return variableNamesFromDeclaration(node)
476
+ default:
477
+ return undefined
478
+ }
479
+ }
480
+
481
+ const formatParent = (
482
+ type: ChunkType | undefined,
483
+ name: string | undefined,
484
+ ): string | undefined => {
485
+ if (type === undefined && name === undefined) {
486
+ return undefined
487
+ }
488
+ if (type === undefined) {
489
+ return name
490
+ }
491
+ if (name === undefined) {
492
+ return type
493
+ }
494
+ return type + " " + name
495
+ }
496
+
497
+ const collectClassMethodRanges = (
498
+ classNode: SyntaxNode,
499
+ parent: string | undefined,
500
+ lines: ReadonlyArray<string>,
501
+ ): ReadonlyArray<ChunkRange> => {
502
+ const body = classNode.childForFieldName("body")
503
+ if (body === null) {
504
+ return []
505
+ }
506
+
507
+ const out = [] as Array<ChunkRange>
508
+ for (let index = 0; index < body.namedChildren.length; index++) {
509
+ const child = body.namedChildren[index]!
510
+ if (!child.type.includes("method")) {
511
+ continue
512
+ }
513
+
514
+ out.push({
515
+ ...lineRangeWithLeadingComments(child, body.namedChildren, index, lines),
516
+ name: nameFromNode(child),
517
+ type: chunkTypeFromNode(child),
518
+ parent,
519
+ })
520
+ }
521
+
522
+ return out
523
+ }
524
+
525
+ const collectAstRanges = (
256
526
  path: string,
257
527
  content: string,
258
- options: {
259
- readonly chunkSize: number
260
- readonly chunkOverlap: number
261
- },
262
- ): ReadonlyArray<CodeChunk> => {
263
- if (content.trim().length === 0 || isProbablyMinified(content)) {
528
+ lines: ReadonlyArray<string>,
529
+ ): ReadonlyArray<ChunkRange> => {
530
+ const language = resolveAstLanguage(path)
531
+ if (language === undefined) {
264
532
  return []
265
533
  }
266
534
 
267
- const normalizedPath = normalizePath(path)
268
- const normalizedContent = normalizeText(content)
269
- const lines = normalizedContent.split("\n")
270
- if (lines.at(-1) === "") {
271
- lines.pop()
272
- }
273
- if (lines.length === 0) {
535
+ try {
536
+ const parser = new TreeSitter()
537
+ parser.setLanguage(language)
538
+ const tree = parser.parse(content, undefined, {
539
+ bufferSize: 1024 * 1024,
540
+ })
541
+ const out = [] as Array<ChunkRange>
542
+
543
+ const topLevelNodes = tree.rootNode.namedChildren
544
+ for (let index = 0; index < topLevelNodes.length; index++) {
545
+ const topLevelNode = topLevelNodes[index]!
546
+ if (
547
+ topLevelNode.type === "comment" ||
548
+ topLevelNode.type.includes("import")
549
+ ) {
550
+ continue
551
+ }
552
+
553
+ const declarationNode = unwrapExportNode(topLevelNode)
554
+ const type = chunkTypeFromNode(declarationNode)
555
+ const name = nameFromNode(declarationNode)
556
+
557
+ out.push({
558
+ ...lineRangeWithLeadingComments(
559
+ topLevelNode,
560
+ topLevelNodes,
561
+ index,
562
+ lines,
563
+ ),
564
+ name,
565
+ type,
566
+ parent: undefined,
567
+ })
568
+
569
+ if (declarationNode.type === "class_declaration") {
570
+ const parent = formatParent(type, name)
571
+ out.push(...collectClassMethodRanges(declarationNode, parent, lines))
572
+ }
573
+ }
574
+
575
+ return out
576
+ } catch {
274
577
  return []
275
578
  }
579
+ }
580
+
581
+ const chunksFromRanges = (
582
+ path: string,
583
+ lines: ReadonlyArray<string>,
584
+ ranges: ReadonlyArray<ChunkRange>,
585
+ settings: ChunkSettings,
586
+ ): ReadonlyArray<CodeChunk> => {
587
+ const hasMethodChildRange = (
588
+ classRange: LineRange & { readonly name: string | undefined },
589
+ ) => {
590
+ const parent = formatParent("class", classRange.name)
591
+ return ranges.some(
592
+ (range) =>
593
+ range.type === "method" &&
594
+ range.parent === parent &&
595
+ range.startLine >= classRange.startLine &&
596
+ range.endLine <= classRange.endLine,
597
+ )
598
+ }
276
599
 
277
- const settings = resolveChunkSettings(options)
600
+ const out = [] as Array<CodeChunk>
601
+ const seen = new Set<string>()
602
+
603
+ for (const range of ranges) {
604
+ const normalizedRange = normalizeLineRange(range, lines.length)
605
+ if (normalizedRange === undefined) {
606
+ continue
607
+ }
608
+
609
+ const allSegments = splitRange(normalizedRange, settings)
610
+ const segments =
611
+ range.type === "class" &&
612
+ allSegments.length > 1 &&
613
+ hasMethodChildRange({ ...normalizedRange, name: range.name })
614
+ ? [allSegments[0]!]
615
+ : allSegments
616
+
617
+ for (const segment of segments) {
618
+ const key =
619
+ String(segment.startLine) +
620
+ ":" +
621
+ String(segment.endLine) +
622
+ ":" +
623
+ (range.name ?? "") +
624
+ ":" +
625
+ (range.type ?? "") +
626
+ ":" +
627
+ (range.parent ?? "")
628
+ if (seen.has(key)) {
629
+ continue
630
+ }
631
+ seen.add(key)
632
+
633
+ const chunkLines = lines.slice(segment.startLine - 1, segment.endLine)
634
+ if (!chunkLines.some(isMeaningfulLine)) {
635
+ continue
636
+ }
637
+
638
+ out.push({
639
+ path,
640
+ startLine: segment.startLine,
641
+ endLine: segment.endLine,
642
+ name: range.name,
643
+ type: range.type,
644
+ parent: range.parent,
645
+ content: chunkLines.join("\n"),
646
+ })
647
+ }
648
+ }
649
+
650
+ return out.toSorted(
651
+ (left, right) =>
652
+ left.startLine - right.startLine ||
653
+ left.endLine - right.endLine ||
654
+ (left.name ?? "").localeCompare(right.name ?? ""),
655
+ )
656
+ }
657
+
658
+ const chunkWithLineWindows = (
659
+ path: string,
660
+ lines: ReadonlyArray<string>,
661
+ settings: ChunkSettings,
662
+ ): ReadonlyArray<CodeChunk> => {
278
663
  const step = settings.chunkSize - settings.chunkOverlap
279
664
  const out = [] as Array<CodeChunk>
280
665
 
@@ -287,14 +672,15 @@ export const chunkFileContent = (
287
672
  const start = index
288
673
  const end = Math.min(lines.length, start + settings.chunkSize)
289
674
  const chunkLines = lines.slice(start, end)
290
- const chunkContent = chunkLines.join("\n")
291
675
 
292
676
  out.push({
293
- path: normalizedPath,
677
+ path,
294
678
  startLine: start + 1,
295
679
  endLine: end,
296
- contentHash: hashContent(chunkContent),
297
- content: chunkContent,
680
+ name: undefined,
681
+ type: undefined,
682
+ parent: undefined,
683
+ content: chunkLines.join("\n"),
298
684
  })
299
685
 
300
686
  index += step
@@ -307,6 +693,49 @@ export const chunkFileContent = (
307
693
  return out
308
694
  }
309
695
 
696
+ /**
697
+ * @since 1.0.0
698
+ * @category Constructors
699
+ */
700
+ export const chunkFileContent = (
701
+ path: string,
702
+ content: string,
703
+ options: {
704
+ readonly chunkSize: number
705
+ readonly chunkOverlap: number
706
+ },
707
+ ): ReadonlyArray<CodeChunk> => {
708
+ if (content.trim().length === 0 || isProbablyMinified(content)) {
709
+ return []
710
+ }
711
+
712
+ const normalizedPath = normalizePath(path)
713
+ const normalizedContent = normalizeText(content)
714
+ const lines = normalizedContent.split("\n")
715
+ if (lines.at(-1) === "") {
716
+ lines.pop()
717
+ }
718
+ if (lines.length === 0) {
719
+ return []
720
+ }
721
+
722
+ const settings = resolveChunkSettings(options)
723
+ const astRanges = collectAstRanges(normalizedPath, normalizedContent, lines)
724
+ if (astRanges.length > 0) {
725
+ const astChunks = chunksFromRanges(
726
+ normalizedPath,
727
+ lines,
728
+ astRanges,
729
+ settings,
730
+ )
731
+ if (astChunks.length > 0) {
732
+ return astChunks
733
+ }
734
+ }
735
+
736
+ return chunkWithLineWindows(normalizedPath, lines, settings)
737
+ }
738
+
310
739
  /**
311
740
  * @since 1.0.0
312
741
  * @category Layers