@matperez/coderag 0.1.26 → 0.1.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/code-tokenizer.d.ts
CHANGED
|
@@ -30,7 +30,9 @@ export declare class CodeTokenizer {
|
|
|
30
30
|
initialize(): Promise<void>;
|
|
31
31
|
private doInitialize;
|
|
32
32
|
/**
|
|
33
|
-
* Tokenize code into terms for TF-IDF indexing
|
|
33
|
+
* Tokenize code into terms for TF-IDF indexing.
|
|
34
|
+
* Uses StarCoder2 when available; falls back to simple word-split when it returns no tokens or fails before producing any
|
|
35
|
+
* (e.g. for Go, proto, YAML) so search still works.
|
|
34
36
|
*/
|
|
35
37
|
tokenize(code: string): Promise<string[]>;
|
|
36
38
|
/**
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"code-tokenizer.d.ts","sourceRoot":"","sources":["../src/code-tokenizer.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;
|
|
1
|
+
{"version":3,"file":"code-tokenizer.d.ts","sourceRoot":"","sources":["../src/code-tokenizer.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAeH,MAAM,WAAW,SAAS;IACzB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAA;IACrB,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAA;CACnB;AAED,MAAM,WAAW,gBAAgB;IAChC,QAAQ,CAAC,SAAS,CAAC,EAAE,MAAM,CAAA;IAC3B,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,CAAA;CAC1B;AAED;;;;;GAKG;AACH,qBAAa,aAAa;IACzB,OAAO,CAAC,SAAS,CAAK;IACtB,OAAO,CAAC,WAAW,CAAQ;IAC3B,OAAO,CAAC,WAAW,CAA6B;IAChD,OAAO,CAAC,SAAS,CAAQ;gBAEb,OAAO,GAAE,gBAAqB;IAK1C;;OAEG;IACG,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;YAcnB,YAAY;IAiB1B;;;;OAIG;IACG,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;IAiC/C;;OAEG;IACG,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAW9D;;OAEG;IACH,OAAO,IAAI,OAAO;CAGlB;AAKD;;GAEG;AACH,wBAAgB,YAAY,IAAI,aAAa,CAK5C;AAED;;;GAGG;AACH,wBAAsB,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,CAG9D;AAED;;GAEG;AACH,wBAAsB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAG7E;AAED;;GAEG;AACH,wBAAsB,mBAAmB,IAAI,OAAO,CAAC,IAAI,CAAC,CAGzD"}
|
package/dist/code-tokenizer.js
CHANGED
|
@@ -5,6 +5,16 @@
|
|
|
5
5
|
* world-class code tokenization quality without requiring the full model.
|
|
6
6
|
*/
|
|
7
7
|
import { AutoTokenizer } from '@huggingface/transformers';
|
|
8
|
+
/**
|
|
9
|
+
* Simple word/identifier tokenizer fallback when StarCoder2 returns no tokens.
|
|
10
|
+
* Splits on runs of characters other than ASCII letters, digits, and underscore (underscores stay inside tokens), keeps tokens with length > 1, then lowercases them.
|
|
11
|
+
*/
|
|
12
|
+
function simpleWordTokenize(text) {
|
|
13
|
+
return text
|
|
14
|
+
.split(/[^a-zA-Z0-9_]+/)
|
|
15
|
+
.filter((w) => w.length > 1)
|
|
16
|
+
.map((w) => w.toLowerCase());
|
|
17
|
+
}
|
|
8
18
|
/**
|
|
9
19
|
* StarCoder2 Code Tokenizer
|
|
10
20
|
*
|
|
@@ -49,30 +59,40 @@ export class CodeTokenizer {
|
|
|
49
59
|
}
|
|
50
60
|
}
|
|
51
61
|
/**
|
|
52
|
-
* Tokenize code into terms for TF-IDF indexing
|
|
62
|
+
* Tokenize code into terms for TF-IDF indexing.
|
|
63
|
+
* Uses StarCoder2 when available; falls back to simple word-split when it returns no tokens or fails before producing any
|
|
64
|
+
* (e.g. for Go, proto, YAML) so search still works.
|
|
53
65
|
*/
|
|
54
66
|
async tokenize(code) {
|
|
55
|
-
if (!this.initialized) {
|
|
56
|
-
await this.initialize();
|
|
57
|
-
}
|
|
58
67
|
if (!code || code.trim().length === 0) {
|
|
59
68
|
return [];
|
|
60
69
|
}
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
const
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
70
|
+
let tokens = [];
|
|
71
|
+
try {
|
|
72
|
+
if (!this.initialized) {
|
|
73
|
+
await this.initialize();
|
|
74
|
+
}
|
|
75
|
+
// Encode with StarCoder2
|
|
76
|
+
const encoded = await this.tokenizer(code);
|
|
77
|
+
const inputIds = encoded.input_ids.tolist()[0];
|
|
78
|
+
for (const id of inputIds) {
|
|
79
|
+
const token = await this.tokenizer.decode([id], {
|
|
80
|
+
skip_special_tokens: true,
|
|
81
|
+
});
|
|
82
|
+
const cleaned = token.trim().toLowerCase();
|
|
83
|
+
if (cleaned.length > 1) {
|
|
84
|
+
tokens.push(cleaned);
|
|
85
|
+
}
|
|
74
86
|
}
|
|
75
87
|
}
|
|
88
|
+
catch (e) {
|
|
89
|
+
// StarCoder2 failed (e.g. OOM, unsupported input)
|
|
90
|
+
console.error('[tokenize] StarCoder2 failed, using fallback:', e.message);
|
|
91
|
+
}
|
|
92
|
+
// Fallback: simple word/identifier split when StarCoder2 returns no tokens (e.g. large chunk, env issue)
|
|
93
|
+
if (tokens.length === 0) {
|
|
94
|
+
tokens = simpleWordTokenize(code);
|
|
95
|
+
}
|
|
76
96
|
return tokens;
|
|
77
97
|
}
|
|
78
98
|
/**
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"code-tokenizer.js","sourceRoot":"","sources":["../src/code-tokenizer.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAA;
|
|
1
|
+
{"version":3,"file":"code-tokenizer.js","sourceRoot":"","sources":["../src/code-tokenizer.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAA;AAEzD;;;GAGG;AACH,SAAS,kBAAkB,CAAC,IAAY;IACvC,OAAO,IAAI;SACT,KAAK,CAAC,gBAAgB,CAAC;SACvB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;SAC3B,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAA;AAC9B,CAAC;AAYD;;;;;GAKG;AACH,MAAM,OAAO,aAAa;IACjB,SAAS,CAAK;IACd,WAAW,GAAG,KAAK,CAAA;IACnB,WAAW,GAAyB,IAAI,CAAA;IACxC,SAAS,CAAQ;IAEzB,YAAY,UAA4B,EAAE;QACzC,8EAA8E;QAC9E,IAAI,CAAC,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,wBAAwB,CAAA;IAC/D,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,UAAU;QACf,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YACtB,OAAM;QACP,CAAC;QAED,8CAA8C;QAC9C,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YACtB,OAAO,IAAI,CAAC,WAAW,CAAA;QACxB,CAAC;QAED,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC,YAAY,EAAE,CAAA;QACtC,OAAO,IAAI,CAAC,WAAW,CAAA;IACxB,CAAC;IAEO,KAAK,CAAC,YAAY;QACzB,IAAI,CAAC;YACJ,OAAO,CAAC,KAAK,CAAC,mEAAmE,CAAC,CAAA;YAClF,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAA;YAE5B,IAAI,CAAC,SAAS,GAAG,MAAM,aAAa,CAAC,eAAe,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;YAEpE,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAA;YACvC,OAAO,CAAC,KAAK,CAAC,iCAAiC,QAAQ,IAAI,CAAC,CAAA;YAE5D,IAAI,CAAC,WAAW,GAAG,IAAI,CAAA;QACxB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YAChB,IAAI,CAAC,WAAW,GAAG,IAAI,CAAA;YACvB,MAAM,IAAI,KAAK,CAAC,6BAA6B,KAAK,CAAC,OAAO,EAAE,CAAC,CAAA;QAC9D,CAAC;IACF,CAAC;IAED;;;;OAIG;IACH,KAAK,CAAC,QAAQ,CAAC,IAAY;QAC1B,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvC,OAAO,EAAE,CAAA;QACV,CAAC;QAED,IAAI,MAAM,GAAa,EAAE,CAAA;QACzB,IAAI,CAAC;YACJ,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC;gBACvB,MAAM,IAAI,CAAC,UAAU,EAAE,CAAA;YACxB,CAAC;YACD,yBAAyB;YACzB,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAA;YAC1C,MAAM,QAAQ,GAAG,OAAO,CAAC,SAAS,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,CAAA;YAC9C,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;gBAC3B,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,EAAE;oBAC/C,mBAAmB,EAAE,IAAI;iBACzB,CAAC,CAAA;gBACF,MAAM,
OAAO,GAAG,KAAK,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAA;gBAC1C,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACxB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;gBACrB,CAAC;YACF,CAAC;QACF,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACZ,kDAAkD;YAClD,OAAO,CAAC,KAAK,CAAC,+CAA+C,EAAG,CAAW,CAAC,OAAO,CAAC,CAAA;QACrF,CAAC;QACD,yGAAyG;QACzG,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACzB,MAAM,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;QAClC,CAAC;QACD,OAAO,MAAM,CAAA;IACd,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,YAAY,CAAC,IAAY;QAC9B,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAA;QACxC,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAkB,CAAA;QAE1C,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC5B,QAAQ,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;QACpD,CAAC;QAED,OAAO,QAAQ,CAAA;IAChB,CAAC;IAED;;OAEG;IACH,OAAO;QACN,OAAO,IAAI,CAAC,WAAW,CAAA;IACxB,CAAC;CACD;AAED,oCAAoC;AACpC,IAAI,eAAe,GAAyB,IAAI,CAAA;AAEhD;;GAEG;AACH,MAAM,UAAU,YAAY;IAC3B,IAAI,CAAC,eAAe,EAAE,CAAC;QACtB,eAAe,GAAG,IAAI,aAAa,EAAE,CAAA;IACtC,CAAC;IACD,OAAO,eAAe,CAAA;AACvB,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,QAAQ,CAAC,IAAY;IAC1C,MAAM,SAAS,GAAG,YAAY,EAAE,CAAA;IAChC,OAAO,SAAS,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAA;AAChC,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAAC,IAAY;IAC9C,MAAM,SAAS,GAAG,YAAY,EAAE,CAAA;IAChC,OAAO,SAAS,CAAC,YAAY,CAAC,IAAI,CAAC,CAAA;AACpC,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,mBAAmB;IACxC,MAAM,SAAS,GAAG,YAAY,EAAE,CAAA;IAChC,MAAM,SAAS,CAAC,UAAU,EAAE,CAAA;AAC7B,CAAC"}
|