@craftpipe/contextpack 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,291 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * lib/tokenizer.js
5
+ * Estimates token count using character-based approximation or tiktoken-js;
6
+ * calculates total bundle token size; estimates token savings compared to raw file inclusion
7
+ */
8
+
/**
 * Average characters per token for GPT-style models (empirically ~4 chars/token).
 * Used by the character-based approximation.
 */
const CHARS_PER_TOKEN = 4;

/**
 * Overhead ratio applied to token counts of JSON-formatted bundles
 * (structured output adds keys, brackets, etc.).
 */
const JSON_OVERHEAD_RATIO = 1.15;

/**
 * Overhead ratio applied to token counts of Markdown-formatted bundles.
 */
const MARKDOWN_OVERHEAD_RATIO = 1.08;

/**
 * Loader thunks for the optional tiktoken packages, in order of preference.
 * Each `require` keeps a literal specifier and is only evaluated when the
 * thunk is invoked, so a missing package surfaces as an exception at call time.
 */
const TIKTOKEN_LOADERS = [
  () => require('tiktoken'),
  () => require('@dqbd/tiktoken'),
];

/**
 * Return the first usable `cl100k_base` encoder produced by the given loaders.
 *
 * A loader is skipped when it throws, when the module it returns has no
 * `get_encoding` function, or when `get_encoding` throws or returns a falsy
 * value. In each of those cases the next loader is tried.
 *
 * @param {Array<function(): *>} loaders - Thunks that each return a tiktoken-like module
 * @returns {object|null} The encoder, or null when no loader yields one
 */
function loadTiktokenEncoder(loaders) {
  for (const load of loaders) {
    try {
      const tiktoken = load();
      if (tiktoken && typeof tiktoken.get_encoding === 'function') {
        const encoder = tiktoken.get_encoding('cl100k_base');
        if (encoder) {
          return encoder;
        }
      }
    } catch (_err) {
      // Deliberate best-effort: tiktoken is optional, so an unavailable or
      // broken package just means we move on to the next candidate.
    }
  }
  return null;
}

/**
 * Encoder used for accurate token estimation, or null when neither tiktoken
 * package could be loaded (callers then use the character-based approximation).
 */
const tiktokenEncoder = loadTiktokenEncoder(TIKTOKEN_LOADERS);
45
+
/**
 * Count tokens with a tiktoken-style encoder.
 *
 * @param {object} encoder - Object exposing `encode(string)`
 * @param {string} str - Non-empty text to encode
 * @returns {number|null} Token count, or null when the encoder throws or
 *   returns something without a numeric `length`
 */
function countWithEncoder(encoder, str) {
  let encoded;
  try {
    encoded = encoder.encode(str);
  } catch (_err) {
    // Best-effort: the caller falls back to the character approximation.
    return null;
  }

  if (!encoded || typeof encoded.length !== 'number') {
    return null;
  }

  const count = encoded.length;

  // Free the encoded buffer if it has a free method (some tiktoken versions).
  if (typeof encoded.free === 'function') {
    try {
      encoded.free();
    } catch (_err) {
      // The count is already captured; a failed release must not discard it.
    }
  }

  return count;
}

/**
 * Estimate the number of tokens in a string using tiktoken if available,
 * otherwise fall back to character-based approximation.
 *
 * `null`, `undefined` and the empty string yield 0. Non-string input is
 * converted with `String()` before counting. If the encoder throws or returns
 * an unusable result, the character-based approximation is used instead.
 *
 * @param {string} text - The text to estimate tokens for
 * @param {object} [options] - Optional configuration
 * @param {boolean} [options.forceCharApprox=false] - Force character-based approximation even if tiktoken is available
 * @param {number} [options.charsPerToken] - Characters per token for the
 *   approximation; must be a finite number > 0, otherwise CHARS_PER_TOKEN is used
 * @returns {number} Estimated token count (integer >= 0)
 */
function estimateTokens(text, options) {
  const opts = options || {};
  const { forceCharApprox = false, charsPerToken } = opts;

  if (text === null || text === undefined) {
    return 0;
  }

  const str = typeof text === 'string' ? text : String(text);

  if (str.length === 0) {
    return 0;
  }

  // Use tiktoken if available and not forced to use char approximation
  if (!forceCharApprox && tiktokenEncoder !== null) {
    const exact = countWithEncoder(tiktokenEncoder, str);
    if (exact !== null) {
      return exact;
    }
  }

  // Character-based approximation (str.length counts UTF-16 code units)
  const ratio = Number.isFinite(charsPerToken) && charsPerToken > 0
    ? charsPerToken
    : CHARS_PER_TOKEN;
  return Math.ceil(str.length / ratio);
}
87
+
/**
 * Bundle sections that get their own entry in the size breakdown.
 */
const BUNDLE_SECTION_KEYS = ['metadata', 'fileSummaries', 'symbolIndex', 'dependencyMap'];

/**
 * Safely serialize a value to a string for token counting.
 * null/undefined become '', strings pass through unchanged, everything else
 * goes through JSON.stringify; if that throws (e.g. circular structure) the
 * value is converted with String() instead.
 *
 * @param {*} val
 * @returns {string}
 */
function serializeForTokenCount(val) {
  if (val === null || val === undefined) return '';
  if (typeof val === 'string') return val;
  try {
    return JSON.stringify(val) || '';
  } catch (_e) {
    return String(val);
  }
}

/**
 * Calculate the total token size of a bundle object.
 * Accepts a bundle as produced by bundler.js (with fileSummaries, symbolIndex,
 * dependencyMap, metadata) or any serializable object.
 *
 * Behaviour by input:
 * - null/undefined: all-zero result.
 * - non-object (string, number, ...): counted as raw text, without overhead;
 *   the breakdown gains a `raw` entry.
 * - object with at least one known section: each known section is counted
 *   separately and scaled by the format overhead. Top-level keys other than
 *   the four known sections are NOT counted in this case.
 * - any other object: the whole value is serialized and counted; the
 *   breakdown gains a `total` entry.
 *
 * @param {object} bundle - The bundle object to measure
 * @param {object} [options] - Optional configuration
 * @param {string} [options.format='json'] - Output format: 'json' or 'markdown'
 *   (any value other than 'markdown' uses the JSON overhead ratio)
 * @param {boolean} [options.forceCharApprox=false] - Force character-based approximation
 * @returns {object} Result with { totalTokens, breakdown, format, method }
 */
function calculateBundleSize(bundle, options) {
  const opts = options || {};
  const { format = 'json', forceCharApprox = false } = opts;

  // Label only; estimateTokens may still fall back internally if encoding fails.
  const method = tiktokenEncoder && !forceCharApprox ? 'tiktoken' : 'char-approx';
  const tokenOptions = { forceCharApprox };

  const breakdown = {};
  for (const key of BUNDLE_SECTION_KEYS) {
    breakdown[key] = 0;
  }

  if (bundle === null || bundle === undefined) {
    return { totalTokens: 0, breakdown, format, method };
  }

  if (typeof bundle !== 'object') {
    // Treat as raw string
    const rawTokens = estimateTokens(String(bundle), tokenOptions);
    breakdown.raw = rawTokens;
    return { totalTokens: rawTokens, breakdown, format, method };
  }

  const overhead = format === 'markdown' ? MARKDOWN_OVERHEAD_RATIO : JSON_OVERHEAD_RATIO;
  const countWithOverhead = (value) =>
    Math.ceil(estimateTokens(serializeForTokenCount(value), tokenOptions) * overhead);

  // If the bundle has sections we recognize, use the breakdown
  const hasKnownSections = BUNDLE_SECTION_KEYS.some((key) => bundle[key] !== undefined);

  let totalTokens = 0;

  if (hasKnownSections) {
    for (const key of BUNDLE_SECTION_KEYS) {
      const sectionTokens = countWithOverhead(bundle[key]);
      breakdown[key] = sectionTokens;
      totalTokens += sectionTokens;
    }
  } else {
    // Unknown bundle shape — serialize the whole thing (and only that; the
    // per-section work is skipped because every section is absent).
    totalTokens = countWithOverhead(bundle);
    breakdown.total = totalTokens;
  }

  return { totalTokens, breakdown, format, method };
}
203
+
/**
 * Approximate the token count of `size` characters/bytes of raw content.
 *
 * @param {*} size - Candidate size value
 * @returns {number} ceil(size / CHARS_PER_TOKEN), or 0 unless size is a finite number > 0
 */
function tokensFromSize(size) {
  return Number.isFinite(size) && size > 0 ? Math.ceil(size / CHARS_PER_TOKEN) : 0;
}

/**
 * Estimate token savings achieved by using a bundle compared to including
 * raw file contents directly.
 *
 * The raw token count comes from the first applicable source:
 *   1. `options.rawTokenCount`, when it is a finite number >= 0;
 *   2. `options.rawFiles`, when it is a non-empty array (string `content` is
 *      tokenized; otherwise a finite positive `size` is divided by CHARS_PER_TOKEN);
 *   3. `bundle.metadata.totalSize`, when it is a finite number > 0;
 *   4. the `size` of each entry in `bundle.fileSummaries`.
 * Non-finite or non-positive sizes are ignored so the report never contains
 * NaN, Infinity or negative raw counts derived from them.
 *
 * `savedTokens` is clamped at 0 (a bundle larger than the raw input reports no
 * savings rather than a negative number); `savingsPercent` is rounded to two
 * decimal places.
 *
 * @param {object} bundle - The context bundle produced by bundler.js
 * @param {object} [options] - Optional configuration
 * @param {Array<object>} [options.rawFiles] - Array of raw file objects with { content, size } or { path, size }
 * @param {number} [options.rawTokenCount] - Pre-computed raw token count (overrides rawFiles calculation)
 * @param {string} [options.format='json'] - Bundle format for size calculation: 'json' or 'markdown'
 * @param {boolean} [options.forceCharApprox=false] - Force character-based approximation
 * @returns {object} Savings report with { bundleTokens, rawTokens, savedTokens, savingsPercent, method }
 */
function estimateSavings(bundle, options) {
  const opts = options || {};
  const {
    rawFiles,
    rawTokenCount,
    format = 'json',
    forceCharApprox = false,
  } = opts;

  const method = tiktokenEncoder && !forceCharApprox ? 'tiktoken' : 'char-approx';

  if (bundle === null || bundle === undefined) {
    return {
      bundleTokens: 0,
      rawTokens: 0,
      savedTokens: 0,
      savingsPercent: 0,
      method,
    };
  }

  // Calculate bundle token size
  const bundleSizeResult = calculateBundleSize(bundle, { format, forceCharApprox });
  const bundleTokens = bundleSizeResult.totalTokens || 0;

  // Calculate raw token count
  let rawTokens = 0;

  if (Number.isFinite(rawTokenCount) && rawTokenCount >= 0) {
    // Use pre-computed value if provided. Infinity/NaN are rejected: Infinity
    // would otherwise turn savingsPercent into NaN (Infinity / Infinity).
    rawTokens = rawTokenCount;
  } else if (Array.isArray(rawFiles) && rawFiles.length > 0) {
    // Sum tokens from each raw file
    for (const file of rawFiles) {
      if (!file || typeof file !== 'object') continue;

      if (typeof file.content === 'string') {
        // Use actual content if available
        rawTokens += estimateTokens(file.content, { forceCharApprox });
      } else {
        // Fall back to size-based approximation
        rawTokens += tokensFromSize(file.size);
      }
    }
  } else if (bundle.metadata && tokensFromSize(bundle.metadata.totalSize) > 0) {
    // Last resort: use bundle metadata totalSize to estimate raw tokens
    rawTokens = tokensFromSize(bundle.metadata.totalSize);
  } else if (Array.isArray(bundle.fileSummaries) && bundle.fileSummaries.length > 0) {
    // Estimate from file summaries if we have size info
    for (const summary of bundle.fileSummaries) {
      if (!summary || typeof summary !== 'object') continue;
      rawTokens += tokensFromSize(summary.size);
    }
  }

  const savedTokens = Math.max(0, rawTokens - bundleTokens);
  const savingsPercent = rawTokens > 0
    ? Math.round((savedTokens / rawTokens) * 10000) / 100
    : 0;

  return {
    bundleTokens,
    rawTokens,
    savedTokens,
    savingsPercent,
    method,
  };
}
286
+
287
+ module.exports = {
288
+ estimateTokens,
289
+ calculateBundleSize,
290
+ estimateSavings,
291
+ };