@craftpipe/contextpack 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.contextpackrc.example.json +167 -0
- package/.env.example +5 -0
- package/.github/ISSUE_TEMPLATE/bug_report.md +26 -0
- package/.github/ISSUE_TEMPLATE/feature_request.md +15 -0
- package/.github/pull_request_template.md +9 -0
- package/CODE_OF_CONDUCT.md +40 -0
- package/CONTRIBUTING.md +59 -0
- package/LICENSE +21 -0
- package/README.md +100 -0
- package/SECURITY.md +21 -0
- package/index.js +428 -0
- package/lib/analyzer.js +547 -0
- package/lib/bundler.js +477 -0
- package/lib/config.js +269 -0
- package/lib/license.js +180 -0
- package/lib/premium/config-file.js +917 -0
- package/lib/premium/gate.js +13 -0
- package/lib/premium/html-report.js +1094 -0
- package/lib/premium/index.js +57 -0
- package/lib/premium/watch-mode.js +627 -0
- package/lib/scanner.js +480 -0
- package/lib/tokenizer.js +291 -0
- package/lib/validator.js +561 -0
- package/package.json +12 -0
- package/tests/analyzer.test.mjs +128 -0
- package/tests/bundler.test.mjs +126 -0
- package/tests/config.test.mjs +103 -0
- package/tests/gate.test.mjs +118 -0
- package/tests/index.test.mjs +103 -0
- package/tests/license.test.mjs +97 -0
- package/tests/scanner.test.mjs +110 -0
- package/tests/tokenizer.test.mjs +103 -0
- package/tests/validator.test.mjs +111 -0
- package/vitest.config.mjs +13 -0
package/lib/tokenizer.js
ADDED
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* lib/tokenizer.js
|
|
5
|
+
* Estimates token count using tiktoken (when an optional tiktoken package is installed) or a character-based approximation;
|
|
6
|
+
* calculates total bundle token size; estimates token savings compared to raw file inclusion
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
 * Divisor used by the character-based approximation: roughly four
 * characters of text are counted as one token.
 * @type {number}
 */
const CHARS_PER_TOKEN = 4;

/**
 * Multiplier applied to token counts of JSON-formatted bundles to account
 * for structural overhead (keys, brackets, etc.).
 * @type {number}
 */
const JSON_OVERHEAD_RATIO = 1.15;

/**
 * Multiplier applied to token counts of Markdown-formatted bundles to
 * account for structural overhead.
 * @type {number}
 */
const MARKDOWN_OVERHEAD_RATIO = 1.08;
|
|
19
|
+
|
|
20
|
+
/**
 * Optional tiktoken integration.
 *
 * Each entry lazily requires one supported tiktoken package; entries are
 * tried in order. The module names are kept as literal `require` calls
 * rather than a computed name.
 */
const TIKTOKEN_LOADERS = [
  () => require('tiktoken'),
  () => require('@dqbd/tiktoken'),
];

/**
 * Try each loader in turn and return the first usable `cl100k_base` encoder.
 *
 * A candidate is skipped (and the next one tried) when loading it throws,
 * when it does not expose a `get_encoding` function, when `get_encoding`
 * throws, or when it returns a falsy value. Previously the second package
 * was only attempted if requiring the first one threw, so a first package
 * that loaded without `get_encoding` left the encoder unset.
 *
 * @param {Array<function(): *>} [loaders=TIKTOKEN_LOADERS] - Thunks that return a tiktoken-like module
 * @returns {object|null} The encoder, or null when no candidate is usable
 */
function loadTiktokenEncoder(loaders = TIKTOKEN_LOADERS) {
  for (const load of loaders) {
    try {
      const candidate = load();
      if (candidate && typeof candidate.get_encoding === 'function') {
        const encoder = candidate.get_encoding('cl100k_base');
        if (encoder) {
          return encoder;
        }
      }
    } catch (_err) {
      // This candidate is missing or unusable; move on to the next one.
    }
  }

  // No tiktoken package is available; callers use the character-based approximation.
  return null;
}

/**
 * Shared encoder instance, or null when tiktoken could not be loaded.
 * Resolved once at module load time.
 */
let tiktokenEncoder = loadTiktokenEncoder();
|
|
45
|
+
|
|
46
|
+
/**
 * Estimate the number of tokens in a piece of text.
 *
 * Uses the module-level tiktoken encoder when one was loaded and
 * `forceCharApprox` is not set; otherwise (or when the encoder throws or
 * returns something without a usable length) falls back to
 * `Math.ceil(length / CHARS_PER_TOKEN)`.
 *
 * @param {string} text - The text to estimate tokens for; non-string values are converted with String()
 * @param {object} [options] - Optional configuration
 * @param {boolean} [options.forceCharApprox=false] - Force character-based approximation even if tiktoken is available
 * @returns {number} Estimated token count (integer >= 0); 0 for null, undefined, or empty text
 */
function estimateTokens(text, options) {
  const { forceCharApprox = false } = options || {};

  if (text === null || text === undefined) {
    return 0;
  }

  const str = typeof text === 'string' ? text : String(text);
  if (str.length === 0) {
    return 0;
  }

  if (!forceCharApprox && tiktokenEncoder) {
    try {
      // Prefer encode_ordinary when the encoder offers it: it treats
      // special-token literals (e.g. "<|endoftext|>") as plain text, whereas
      // plain encode may reject them and force the approximation below.
      const encoded = typeof tiktokenEncoder.encode_ordinary === 'function'
        ? tiktokenEncoder.encode_ordinary(str)
        : tiktokenEncoder.encode(str);
      const count = encoded ? encoded.length : undefined;

      // Free the encoded buffer if it has a free method (some tiktoken versions)
      if (encoded && typeof encoded.free === 'function') {
        encoded.free();
      }

      // Only trust a real non-negative integer; anything else would break
      // the documented return contract, so use the approximation instead.
      if (Number.isInteger(count) && count >= 0) {
        return count;
      }
    } catch (_err) {
      // Encoder failed on this input; fall through to the approximation.
    }
  }

  // Character-based approximation: ~4 characters per token
  return Math.ceil(str.length / CHARS_PER_TOKEN);
}
|
|
87
|
+
|
|
88
|
+
/**
 * Serialize a value to a string for token counting.
 *
 * Strings are returned as-is and null/undefined become ''. Other values go
 * through JSON.stringify. If that throws (for example on a circular
 * reference or a BigInt), a second, lossy pass is attempted that renders
 * BigInts as decimal strings and replaces any object already visited with
 * "[Circular]" (so repeated, non-circular references are also collapsed in
 * that pass). Only if that fails too is String(val) used, which for plain
 * objects is just "[object Object]" and badly undercounts.
 *
 * @param {*} val
 * @returns {string}
 */
function serializeForTokenCount(val) {
  if (val === null || val === undefined) return '';
  if (typeof val === 'string') return val;

  try {
    return JSON.stringify(val) || '';
  } catch (_strictErr) {
    try {
      const visited = new WeakSet();
      const lossy = JSON.stringify(val, (_key, item) => {
        if (typeof item === 'bigint') return item.toString();
        if (item !== null && typeof item === 'object') {
          if (visited.has(item)) return '[Circular]';
          visited.add(item);
        }
        return item;
      });
      return lossy || '';
    } catch (_lossyErr) {
      return String(val);
    }
  }
}

/**
 * Calculate the total token size of a bundle object.
 * Accepts a bundle with any of the sections `metadata`, `fileSummaries`,
 * `symbolIndex`, `dependencyMap`, or any other serializable value.
 *
 * - null/undefined: all counts are 0.
 * - Non-object values: converted with String() and counted without the
 *   format overhead; the count is also reported as `breakdown.raw`.
 * - Objects with at least one known section: each section is serialized,
 *   counted, multiplied by the format overhead ratio and rounded up;
 *   `totalTokens` is the sum of the four sections (other keys are ignored).
 * - Objects with none of the known sections: the whole object is serialized
 *   and counted; the count is also reported as `breakdown.total`.
 *
 * @param {object} bundle - The bundle object to measure
 * @param {object} [options] - Optional configuration
 * @param {string} [options.format='json'] - Output format: 'markdown' selects the Markdown overhead ratio, anything else the JSON ratio
 * @param {boolean} [options.forceCharApprox=false] - Force character-based approximation
 * @returns {object} Result with { totalTokens, breakdown, format, method }, where method is 'tiktoken' or 'char-approx'
 */
function calculateBundleSize(bundle, options) {
  const { format = 'json', forceCharApprox = false } = options || {};
  const method = tiktokenEncoder && !forceCharApprox ? 'tiktoken' : 'char-approx';
  const sectionKeys = ['metadata', 'fileSummaries', 'symbolIndex', 'dependencyMap'];

  // Fresh object per call so callers can never share/mutate a breakdown.
  const zeroBreakdown = () => ({
    metadata: 0,
    fileSummaries: 0,
    symbolIndex: 0,
    dependencyMap: 0,
  });

  if (bundle === null || bundle === undefined) {
    return { totalTokens: 0, breakdown: zeroBreakdown(), format, method };
  }

  if (typeof bundle !== 'object') {
    // Treat as raw string
    const rawTokens = estimateTokens(String(bundle), { forceCharApprox });
    return {
      totalTokens: rawTokens,
      breakdown: Object.assign(zeroBreakdown(), { raw: rawTokens }),
      format,
      method,
    };
  }

  const overhead = format === 'markdown' ? MARKDOWN_OVERHEAD_RATIO : JSON_OVERHEAD_RATIO;
  const measure = (val) =>
    Math.ceil(estimateTokens(serializeForTokenCount(val), { forceCharApprox }) * overhead);

  // If the bundle has sections we recognize, report a per-section breakdown.
  const hasKnownSections = sectionKeys.some((key) => bundle[key] !== undefined);

  if (hasKnownSections) {
    const breakdown = {};
    let totalTokens = 0;
    for (const key of sectionKeys) {
      breakdown[key] = measure(bundle[key]);
      totalTokens += breakdown[key];
    }
    return { totalTokens, breakdown, format, method };
  }

  // Unknown bundle shape — serialize the whole thing
  const totalTokens = measure(bundle);
  return {
    totalTokens,
    breakdown: Object.assign(zeroBreakdown(), { total: totalTokens }),
    format,
    method,
  };
}
|
|
203
|
+
|
|
204
|
+
/**
 * Convert a size (counted like characters) to an approximate token count.
 * Anything that is not a finite, positive number contributes 0, so NaN,
 * Infinity and negative sizes cannot leak into the totals.
 *
 * @param {*} size
 * @returns {number} Non-negative integer token estimate
 */
function approximateTokensFromSize(size) {
  return Number.isFinite(size) && size > 0 ? Math.ceil(size / CHARS_PER_TOKEN) : 0;
}

/**
 * Estimate token savings achieved by using a bundle compared to including
 * raw file contents directly.
 *
 * The raw token count comes from the first usable source, in this order:
 *   1. `options.rawTokenCount`, when it is a finite number >= 0;
 *   2. `options.rawFiles`, when it is a non-empty array (string `content` is
 *      tokenized, otherwise a finite positive `size` is approximated);
 *   3. `bundle.metadata.totalSize`, when it is a finite number >= 0;
 *   4. the `size` fields of `bundle.fileSummaries`.
 * If none applies the raw count is 0 and no savings are reported.
 *
 * @param {object} bundle - The context bundle to measure; null/undefined yields an all-zero report
 * @param {object} [options] - Optional configuration
 * @param {Array<object>} [options.rawFiles] - Array of raw file objects with { content } or { size }
 * @param {number} [options.rawTokenCount] - Pre-computed raw token count (overrides rawFiles calculation)
 * @param {string} [options.format='json'] - Bundle format for size calculation: 'json' or 'markdown'
 * @param {boolean} [options.forceCharApprox=false] - Force character-based approximation
 * @returns {object} Savings report with { bundleTokens, rawTokens, savedTokens, savingsPercent, method };
 *   savedTokens is never negative and savingsPercent is rounded to two decimals
 */
function estimateSavings(bundle, options) {
  const {
    rawFiles,
    rawTokenCount,
    format = 'json',
    forceCharApprox = false,
  } = options || {};
  const method = tiktokenEncoder && !forceCharApprox ? 'tiktoken' : 'char-approx';

  if (bundle === null || bundle === undefined) {
    return { bundleTokens: 0, rawTokens: 0, savedTokens: 0, savingsPercent: 0, method };
  }

  // Calculate bundle token size
  const bundleTokens = calculateBundleSize(bundle, { format, forceCharApprox }).totalTokens || 0;

  // Calculate raw token count
  let rawTokens = 0;
  const totalSize = bundle.metadata ? bundle.metadata.totalSize : undefined;

  if (Number.isFinite(rawTokenCount) && rawTokenCount >= 0) {
    // Use pre-computed value if provided
    rawTokens = rawTokenCount;
  } else if (Array.isArray(rawFiles) && rawFiles.length > 0) {
    // Sum tokens from each raw file
    for (const file of rawFiles) {
      if (!file || typeof file !== 'object') continue;

      rawTokens += typeof file.content === 'string'
        ? estimateTokens(file.content, { forceCharApprox }) // actual content wins
        : approximateTokensFromSize(file.size); // size-based approximation
    }
  } else if (Number.isFinite(totalSize) && totalSize >= 0) {
    // Last resort: use bundle metadata totalSize to estimate raw tokens.
    // A NaN/negative/infinite totalSize is unusable, so it falls through to
    // the file summaries instead of producing a NaN or negative report.
    rawTokens = approximateTokensFromSize(totalSize);
  } else if (Array.isArray(bundle.fileSummaries) && bundle.fileSummaries.length > 0) {
    // Estimate from file summaries if we have size info
    for (const summary of bundle.fileSummaries) {
      if (!summary || typeof summary !== 'object') continue;
      rawTokens += approximateTokensFromSize(summary.size);
    }
  }

  const savedTokens = Math.max(0, rawTokens - bundleTokens);
  const savingsPercent = rawTokens > 0
    ? Math.round((savedTokens / rawTokens) * 10000) / 100
    : 0;

  return { bundleTokens, rawTokens, savedTokens, savingsPercent, method };
}
|
|
286
|
+
|
|
287
|
+
module.exports = {
|
|
288
|
+
estimateTokens,
|
|
289
|
+
calculateBundleSize,
|
|
290
|
+
estimateSavings,
|
|
291
|
+
};
|