edgar-cli 0.1.4 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -133
- package/bin/edgar-lib.cjs +118 -0
- package/bin/edgar.cjs +6 -0
- package/package.json +20 -34
- package/LICENSE +0 -21
- package/dist/cli.d.ts +0 -10
- package/dist/cli.js +0 -332
- package/dist/commands/facts.d.ts +0 -8
- package/dist/commands/facts.js +0 -125
- package/dist/commands/filings.d.ts +0 -14
- package/dist/commands/filings.js +0 -198
- package/dist/commands/research.d.ts +0 -33
- package/dist/commands/research.js +0 -698
- package/dist/commands/resolve.d.ts +0 -2
- package/dist/commands/resolve.js +0 -7
- package/dist/core/config.d.ts +0 -23
- package/dist/core/config.js +0 -51
- package/dist/core/envelope.d.ts +0 -28
- package/dist/core/envelope.js +0 -37
- package/dist/core/errors.d.ts +0 -18
- package/dist/core/errors.js +0 -37
- package/dist/core/output-shape.d.ts +0 -10
- package/dist/core/output-shape.js +0 -61
- package/dist/core/runtime.d.ts +0 -10
- package/dist/core/runtime.js +0 -1
- package/dist/sec/client.d.ts +0 -17
- package/dist/sec/client.js +0 -154
- package/dist/sec/endpoints.d.ts +0 -10
- package/dist/sec/endpoints.js +0 -19
- package/dist/sec/normalizers.d.ts +0 -5
- package/dist/sec/normalizers.js +0 -44
- package/dist/sec/ticker-map.d.ts +0 -16
- package/dist/sec/ticker-map.js +0 -57
|
@@ -1,698 +0,0 @@
|
|
|
1
|
-
import { mkdir, readFile, stat, writeFile } from 'node:fs/promises';
|
|
2
|
-
import os from 'node:os';
|
|
3
|
-
import path from 'node:path';
|
|
4
|
-
import { runFilingsGet, runFilingsList } from './filings.js';
|
|
5
|
-
import { CLIError, ErrorCode } from '../core/errors.js';
|
|
6
|
-
import { resolveEntity } from '../sec/ticker-map.js';
|
|
7
|
-
/**
 * Filing-selection rules keyed by research profile name.
 *
 * runResearchSync iterates the rule list of the requested profile and forwards
 * each rule to runFilingsList: `form` and `queryLimit` are passed through
 * unchanged, and `recentDays` (when present) is turned into a `from` date via
 * dateDaysAgo.
 */
const PROFILE_RULES = {
    core: [
        { form: '10-K', queryLimit: 1 },
        { form: '10-Q', queryLimit: 3 },
        { form: '8-K', queryLimit: 12, recentDays: 180 }
    ],
    events: [
        { form: '8-K', queryLimit: 24, recentDays: 365 }
    ],
    financials: [
        { form: '10-K', queryLimit: 2 },
        { form: '10-Q', queryLimit: 6 }
    ]
};
|
|
19
|
-
/**
 * Current time as an ISO-8601 UTC string with the millisecond part removed
 * (for example `2024-01-02T03:04:05Z`).
 *
 * @returns {string}
 */
function nowIso() {
    const stamp = new Date().toISOString();
    return stamp.replace(/\.\d{3}Z$/, 'Z');
}
|
|
22
|
-
/**
 * Returns the first ten characters of the date's ISO string, i.e. the
 * `YYYY-MM-DD` calendar day in UTC for ordinary four-digit years.
 *
 * @param {Date} date
 * @returns {string}
 */
function formatDateUtc(date) {
    const iso = date.toISOString();
    return iso.substring(0, 10);
}
|
|
25
|
-
/**
 * UTC calendar day (`YYYY-MM-DD`) that lies `days` days before now.
 *
 * @param {number} days Number of days to subtract from the current UTC date.
 * @returns {string}
 */
function dateDaysAgo(days) {
    const shifted = new Date();
    // setUTCDate accepts out-of-range values and rolls the month/year over.
    const dayOfMonth = shifted.getUTCDate() - days;
    shifted.setUTCDate(dayOfMonth);
    return formatDateUtc(shifted);
}
|
|
30
|
-
/**
 * Picks the cache root when no explicit directory was supplied.
 *
 * Precedence: a non-blank `EDGAR_CACHE_DIR`, then `<XDG_CACHE_HOME>/edgar-cli`
 * when `XDG_CACHE_HOME` is non-blank, otherwise `<home>/.cache/edgar-cli`.
 *
 * @returns {string} Absolute path.
 */
function defaultCacheRoot() {
    const isNonBlank = (candidate) => Boolean(candidate) && candidate.trim().length > 0;
    const { EDGAR_CACHE_DIR: explicitDir, XDG_CACHE_HOME: xdgHome } = process.env;
    if (isNonBlank(explicitDir)) {
        return path.resolve(explicitDir);
    }
    if (isNonBlank(xdgHome)) {
        return path.resolve(xdgHome, 'edgar-cli');
    }
    return path.resolve(os.homedir(), '.cache', 'edgar-cli');
}
|
|
39
|
-
/**
 * Resolves the cache root: a non-blank `cacheDir` wins, otherwise the
 * environment/home based default from defaultCacheRoot is used.
 *
 * @param {string | undefined} cacheDir
 * @returns {string} Absolute path.
 */
function resolveCacheRoot(cacheDir) {
    const hasOverride = Boolean(cacheDir) && cacheDir.trim().length > 0;
    return hasOverride ? path.resolve(cacheDir) : defaultCacheRoot();
}
|
|
45
|
-
/**
 * Directory under the cache root that holds everything cached for one CIK:
 * `<cacheRoot>/research/companies/<cik>`.
 *
 * @param {string} cacheRoot
 * @param {string} cik
 * @returns {string}
 */
function companyCacheDir(cacheRoot, cik) {
    const segments = ['research', 'companies', cik];
    return path.join(cacheRoot, ...segments);
}
|
|
48
|
-
/**
 * Location of the manifest for one company/profile pair:
 * `<company dir>/profiles/<profile>.json`.
 *
 * @param {string} cacheRoot
 * @param {string} cik
 * @param {string} profile
 * @returns {string}
 */
function profileManifestPath(cacheRoot, cik, profile) {
    const fileName = `${profile}.json`;
    const companyDir = companyCacheDir(cacheRoot, cik);
    return path.join(companyDir, 'profiles', fileName);
}
|
|
51
|
-
/**
 * Location of the cached markdown for one filing:
 * `<company dir>/filings/<accession>.md`.
 *
 * @param {string} cacheRoot
 * @param {string} cik
 * @param {string} accession
 * @returns {string}
 */
function filingDocPath(cacheRoot, cik, accession) {
    const fileName = `${accession}.md`;
    const companyDir = companyCacheDir(cacheRoot, cik);
    return path.join(companyDir, 'filings', fileName);
}
|
|
54
|
-
/**
 * Validates the shape of a parsed cached manifest and returns it unchanged.
 *
 * A manifest is accepted when it is an object with `version === 1`, a string
 * `cik`, and a `docs` array whose entries each carry string `path` and
 * `accession` fields.
 *
 * @param {unknown} value Result of JSON.parse on the manifest file.
 * @returns {object} The same object that was passed in.
 * @throws {CLIError} PARSE_ERROR when the shape does not match.
 */
function parseCachedManifest(value) {
    const malformed = () => new CLIError(ErrorCode.PARSE_ERROR, 'Cached manifest is malformed');
    if (!value || typeof value !== 'object') {
        throw malformed();
    }
    const isDocEntry = (doc) => doc && typeof doc.path === 'string' && typeof doc.accession === 'string';
    const wellFormed = value.version === 1
        && typeof value.cik === 'string'
        && Array.isArray(value.docs)
        && value.docs.every(isDocEntry);
    if (!wellFormed) {
        throw malformed();
    }
    return value;
}
|
|
67
|
-
/**
 * Trims and upper-cases a form name; blank or missing input yields undefined.
 *
 * @param {string | undefined | null} value
 * @returns {string | undefined}
 */
function normalizeForm(value) {
    const upper = value?.trim().toUpperCase();
    if (!upper) {
        return undefined;
    }
    return upper;
}
|
|
71
|
-
/**
 * Loads and validates the cached manifest for a company/profile pair.
 *
 * @param {string} cacheRoot
 * @param {string} cik
 * @param {string} profile
 * @returns {Promise<object | null>} The validated manifest, or null when the
 *   manifest file does not exist.
 * @throws {CLIError} VALIDATION_ERROR when the file cannot be read for a reason
 *   other than ENOENT; PARSE_ERROR when it is not valid JSON or fails the shape
 *   check in parseCachedManifest.
 */
async function readCachedManifest(cacheRoot, cik, profile) {
    const manifestPath = profileManifestPath(cacheRoot, cik, profile);
    let fileText;
    try {
        fileText = await readFile(manifestPath, 'utf8');
    }
    catch (readError) {
        if (readError.code !== 'ENOENT') {
            throw new CLIError(ErrorCode.VALIDATION_ERROR, `Unable to read cached manifest ${manifestPath}: ${readError.message}`);
        }
        // A missing manifest is an expected state, not an error.
        return null;
    }
    let decoded;
    try {
        decoded = JSON.parse(fileText);
    }
    catch {
        throw new CLIError(ErrorCode.PARSE_ERROR, `Cached manifest is not valid JSON: ${manifestPath}`);
    }
    return parseCachedManifest(decoded);
}
|
|
93
|
-
/**
 * Writes the manifest as pretty-printed JSON (2-space indent, trailing newline)
 * to its profile path, creating parent directories as needed.
 *
 * @param {string} cacheRoot
 * @param {object} manifest Must carry `cik` and `profile`, which determine the path.
 * @returns {Promise<{ manifestPath: string }>}
 */
async function writeCachedManifest(cacheRoot, manifest) {
    const { cik, profile } = manifest;
    const manifestPath = profileManifestPath(cacheRoot, cik, profile);
    const parentDir = path.dirname(manifestPath);
    await mkdir(parentDir, { recursive: true });
    const serialized = `${JSON.stringify(manifest, null, 2)}\n`;
    await writeFile(manifestPath, serialized, 'utf8');
    return { manifestPath };
}
|
|
99
|
-
/**
 * Reports whether `filePath` exists and is a regular file.
 *
 * @param {string} filePath
 * @returns {Promise<boolean>} false when the path is missing (ENOENT) or is not
 *   a regular file.
 * @throws {CLIError} VALIDATION_ERROR for any other stat failure.
 */
async function fileExists(filePath) {
    try {
        const info = await stat(filePath);
        return info.isFile();
    }
    catch (statError) {
        if (statError.code !== 'ENOENT') {
            throw new CLIError(ErrorCode.VALIDATION_ERROR, `Unable to stat ${filePath}: ${statError.message}`);
        }
        return false;
    }
}
|
|
112
|
-
/**
 * Ensures each selected filing has a markdown copy in the cache and describes it.
 *
 * Rows are processed one at a time. A filing is fetched through runFilingsGet
 * (format `markdown`) when `refresh` is set or no cached file exists; otherwise
 * the cached file is reused. Filings whose fetch fails with a NOT_FOUND CLIError
 * are recorded in `skipped` and left out of `docs`; any other error propagates.
 *
 * @param {object} params
 * @param {string} params.cacheRoot
 * @param {string} params.cik
 * @param {Array<object>} params.rows Filing rows (accession, form, filingDate, reportDate, filingUrl).
 * @param {boolean} params.refresh
 * @param {object} params.context Passed through to runFilingsGet.
 * @returns {Promise<{ docs: object[], fetchedCount: number, reusedCount: number, skipped: object[] }>}
 */
async function materializeCachedDocs(params) {
    const { cacheRoot, cik, rows, refresh, context } = params;
    const outcome = { docs: [], fetchedCount: 0, reusedCount: 0, skipped: [] };
    for (const row of rows) {
        const { accession } = row;
        const docPath = filingDocPath(cacheRoot, cik, accession);
        const cached = !refresh && (await fileExists(docPath));
        if (cached) {
            outcome.reusedCount += 1;
        }
        else {
            try {
                const { data } = await runFilingsGet({ id: cik, accession, format: 'markdown' }, context);
                if (typeof data.content !== 'string') {
                    throw new CLIError(ErrorCode.PARSE_ERROR, `Unable to parse markdown content for accession ${accession}`);
                }
                await mkdir(path.dirname(docPath), { recursive: true });
                // Cached files always end with a newline.
                const body = data.content.endsWith('\n') ? data.content : `${data.content}\n`;
                await writeFile(docPath, body, 'utf8');
                outcome.fetchedCount += 1;
            }
            catch (error) {
                const isMissingFiling = error instanceof CLIError && error.code === ErrorCode.NOT_FOUND;
                if (!isMissingFiling) {
                    throw error;
                }
                outcome.skipped.push({ accession, reason: error.message });
                continue;
            }
        }
        outcome.docs.push({
            accession,
            form: row.form,
            filing_date: row.filingDate,
            report_date: row.reportDate,
            filing_url: row.filingUrl,
            path: docPath
        });
    }
    return outcome;
}
|
|
163
|
-
/**
 * Normalizes a `--profile` value (trimmed, lower-cased) and checks that it is
 * one of the supported profiles.
 *
 * @param {string} value
 * @returns {'core' | 'events' | 'financials'}
 * @throws {CLIError} VALIDATION_ERROR for any other value.
 */
export function parseResearchProfile(value) {
    const candidate = value.trim().toLowerCase();
    switch (candidate) {
        case 'core':
        case 'events':
        case 'financials':
            return candidate;
        default:
            throw new CLIError(ErrorCode.VALIDATION_ERROR, '--profile must be one of core|events|financials');
    }
}
|
|
170
|
-
/**
 * Splits text into lower-cased runs of ASCII letters/digits and keeps only
 * tokens of at least two characters.
 *
 * @param {string} value
 * @returns {string[]}
 */
function tokenize(value) {
    const words = value.toLowerCase().match(/[a-z0-9]+/g);
    if (words === null) {
        return [];
    }
    return words.filter((word) => word.length >= 2);
}
|
|
173
|
-
/**
 * Words that buildQueryTerms drops from a query before scoring (unless doing
 * so would leave the query with no terms at all).
 */
const QUERY_STOPWORDS = new Set([
    'a', 'an', 'and', 'are', 'as', 'at',
    'be', 'by',
    'for', 'from',
    'how',
    'in', 'into', 'is', 'it', 'its',
    'of', 'on', 'or',
    'that', 'the', 'their', 'there', 'these', 'they', 'this', 'to',
    'was', 'were', 'what', 'when', 'where', 'which', 'who', 'why', 'with'
]);
|
|
211
|
-
/**
 * Case-insensitive phrases that looksLikeCoverBoilerplate tests chunk text
 * against; a match on a chunk near the top of a document leads
 * adjustedChunkScore to down-weight it.
 */
const COVER_BOILERPLATE_PATTERNS = [
    // Securities registration table heading.
    /securities registered pursuant to section 12\(b\)/i,
    // Check-box attestations.
    /indicate by check mark/i,
    /commission file number/i,
    // Reporting-period lines.
    /for the quarterly period ended/i,
    /for the fiscal year ended/i
];
|
|
218
|
-
/**
 * Removes duplicates while keeping first-occurrence order.
 *
 * @param {Iterable<string>} tokens
 * @returns {string[]}
 */
function uniqueTokens(tokens) {
    const seen = new Set(tokens);
    return Array.from(seen);
}
|
|
221
|
-
/**
 * Turns a query string into its distinct search terms.
 *
 * Stopwords are removed, but when every token is a stopword the unfiltered
 * tokens are kept so the query is never emptied by filtering alone.
 *
 * @param {string} query
 * @returns {string[]}
 */
function buildQueryTerms(query) {
    const tokens = tokenize(query);
    const meaningful = tokens.filter((token) => !QUERY_STOPWORDS.has(token));
    return uniqueTokens(meaningful.length === 0 ? tokens : meaningful);
}
|
|
227
|
-
/**
 * Builds the distinct space-joined pairs of neighbouring query terms.
 *
 * @param {string[]} queryTerms
 * @returns {string[]} Empty when fewer than two terms are given.
 */
function buildQueryBigrams(queryTerms) {
    const pairs = queryTerms
        .slice(0, -1)
        .map((term, idx) => `${term} ${queryTerms[idx + 1]}`);
    return uniqueTokens(pairs);
}
|
|
234
|
-
/**
 * Counts how many query terms occur at least once according to the frequency map.
 *
 * @param {string[]} queryTerms
 * @param {Map<string, number>} termFrequency
 * @returns {number}
 */
function countTermHits(queryTerms, termFrequency) {
    let hits = 0;
    for (const term of queryTerms) {
        const occurrences = termFrequency.get(term) ?? 0;
        if (occurrences > 0) {
            hits += 1;
        }
    }
    return hits;
}
|
|
237
|
-
/**
 * Counts how many query bigrams appear verbatim in the lower-cased chunk text.
 *
 * @param {string} chunkText
 * @param {string[]} queryBigrams
 * @returns {number}
 */
function countBigramHits(chunkText, queryBigrams) {
    if (queryBigrams.length === 0) {
        return 0;
    }
    const haystack = chunkText.toLowerCase();
    return queryBigrams.filter((bigram) => haystack.includes(bigram)).length;
}
|
|
244
|
-
/**
 * Flags chunks that start within the first 140 lines of a document and match
 * one of COVER_BOILERPLATE_PATTERNS.
 *
 * @param {{ lineStart: number, text: string }} chunk
 * @returns {boolean}
 */
function looksLikeCoverBoilerplate(chunk) {
    const pastCoverRegion = chunk.lineStart > 140;
    return !pastCoverRegion && COVER_BOILERPLATE_PATTERNS.some((pattern) => pattern.test(chunk.text));
}
|
|
250
|
-
/**
 * Tallies how often each token occurs.
 *
 * @param {Iterable<string>} tokens
 * @returns {Map<string, number>}
 */
function buildTermFrequency(tokens) {
    const counts = new Map();
    for (const token of tokens) {
        const seenSoFar = counts.has(token) ? counts.get(token) : 0;
        counts.set(token, seenSoFar + 1);
    }
    return counts;
}
|
|
257
|
-
/**
 * Finds the first `##########-##-######` (10-2-6 digit) sequence in a path.
 *
 * @param {string} docPath
 * @returns {string | null} The matched text, or null when there is none.
 */
function extractAccession(docPath) {
    const found = docPath.match(/\d{10}-\d{2}-\d{6}/);
    if (found === null) {
        return null;
    }
    return found[0];
}
|
|
261
|
-
/**
 * Extracts document paths from a parsed `--manifest` file.
 *
 * Accepted shapes:
 *  - a JSON array of entries, or
 *  - an object whose `docs` property is an array of entries,
 * where each entry is either a path string or an object with a string `path`.
 * The object-entry form is what writeCachedManifest stores, so a manifest
 * produced by `research sync` can be passed straight back in.
 *
 * @param {unknown} value Result of JSON.parse on the manifest file.
 * @returns {{ docs: string[] }}
 * @throws {CLIError} VALIDATION_ERROR when the value has neither shape or any
 *   entry lacks a usable path.
 */
function parseManifest(value) {
    let entries = null;
    if (Array.isArray(value)) {
        entries = value;
    }
    else if (value && typeof value === 'object' && Array.isArray(value.docs)) {
        entries = value.docs;
    }
    const toPath = (entry) => {
        if (typeof entry === 'string') {
            return entry;
        }
        if (entry && typeof entry === 'object' && typeof entry.path === 'string') {
            return entry.path;
        }
        return null;
    };
    if (entries !== null) {
        const docs = entries.map(toPath);
        if (docs.every((docPath) => docPath !== null)) {
            return { docs };
        }
    }
    throw new CLIError(ErrorCode.VALIDATION_ERROR, 'Manifest must be a JSON array of strings or object with a docs string array');
}
|
|
273
|
-
/**
 * Collects the document paths for a search from `--doc` values and, when
 * given, a manifest file.
 *
 * Entries are trimmed, blanks dropped, resolved to absolute paths, and
 * de-duplicated, with option paths ordered before manifest paths.
 *
 * @param {object} params
 * @param {string[]} params.docs
 * @param {string | undefined} params.manifestPath
 * @returns {Promise<string[]>}
 * @throws {CLIError} NOT_FOUND when the manifest file is missing;
 *   VALIDATION_ERROR when it cannot be read, is not JSON, or has the wrong shape.
 */
async function loadDocPaths(params) {
    const cleanPaths = (list) => list
        .map((entry) => entry.trim())
        .filter((entry) => entry.length > 0);
    const collected = cleanPaths(params.docs);
    if (params.manifestPath) {
        const resolvedManifestPath = path.resolve(params.manifestPath);
        let manifestText;
        try {
            manifestText = await readFile(resolvedManifestPath, 'utf8');
        }
        catch (readError) {
            if (readError.code === 'ENOENT') {
                throw new CLIError(ErrorCode.NOT_FOUND, `Manifest not found: ${resolvedManifestPath}`);
            }
            throw new CLIError(ErrorCode.VALIDATION_ERROR, `Unable to read manifest ${resolvedManifestPath}: ${readError.message}`);
        }
        let manifestJson;
        try {
            manifestJson = JSON.parse(manifestText);
        }
        catch {
            throw new CLIError(ErrorCode.VALIDATION_ERROR, `Manifest is not valid JSON: ${resolvedManifestPath}`);
        }
        const { docs: manifestDocs } = parseManifest(manifestJson);
        collected.push(...cleanPaths(manifestDocs));
    }
    const absolute = collected.map((entry) => path.resolve(entry));
    return Array.from(new Set(absolute));
}
|
|
302
|
-
/**
 * Reads a document as UTF-8 text after checking that it is a regular file.
 *
 * @param {string} filePath
 * @returns {Promise<string>} The file content.
 * @throws {CLIError} NOT_FOUND when the path does not exist; VALIDATION_ERROR
 *   when it cannot be stat'ed or read, is not a regular file, or its content
 *   contains a NUL character (treated as binary).
 */
async function ensureReadableTextFile(filePath) {
    let info;
    try {
        info = await stat(filePath);
    }
    catch (statError) {
        if (statError.code === 'ENOENT') {
            throw new CLIError(ErrorCode.NOT_FOUND, `Document not found: ${filePath}`);
        }
        throw new CLIError(ErrorCode.VALIDATION_ERROR, `Unable to stat document ${filePath}: ${statError.message}`);
    }
    if (!info.isFile()) {
        throw new CLIError(ErrorCode.VALIDATION_ERROR, `Path is not a file: ${filePath}`);
    }
    let text;
    try {
        text = await readFile(filePath, 'utf8');
    }
    catch (readError) {
        throw new CLIError(ErrorCode.VALIDATION_ERROR, `Unable to read document ${filePath}: ${readError.message}`);
    }
    const hasNul = text.includes('\u0000');
    if (hasNul) {
        throw new CLIError(ErrorCode.VALIDATION_ERROR, `File appears to be binary: ${filePath}`);
    }
    return text;
}
|
|
330
|
-
/**
 * Cuts a document into overlapping windows of lines and pre-computes the
 * token statistics used for scoring.
 *
 * Windows are `chunkLines` long and start every
 * `max(1, chunkLines - chunkOverlap)` lines. Windows whose trimmed text is
 * empty are dropped. `lineStart`/`lineEnd` are 1-based and inclusive.
 *
 * @param {object} params
 * @param {string} params.docPath
 * @param {string} params.content
 * @param {number} params.chunkLines
 * @param {number} params.chunkOverlap
 * @returns {Array<object>} Chunks with docPath, accession, lineStart, lineEnd,
 *   text, tokenCount and termFrequency.
 */
function chunkDocument(params) {
    const { content, docPath, chunkLines: windowSize, chunkOverlap } = params;
    const lines = content.split(/\r?\n/);
    const stride = Math.max(1, windowSize - chunkOverlap);
    const accession = extractAccession(docPath);
    const chunks = [];
    let start = 0;
    while (start < lines.length) {
        const endExclusive = Math.min(lines.length, start + windowSize);
        const text = lines.slice(start, endExclusive).join('\n').trim();
        if (text.length > 0) {
            const tokens = tokenize(text);
            chunks.push({
                docPath,
                accession,
                lineStart: start + 1,
                lineEnd: endExclusive,
                text,
                tokenCount: tokens.length,
                termFrequency: buildTermFrequency(tokens)
            });
        }
        // Once a window reaches the last line there is nothing further to cover.
        if (endExclusive >= lines.length) {
            break;
        }
        start += stride;
    }
    return chunks;
}
|
|
362
|
-
/**
 * BM25-style relevance of one chunk for the query terms (k1 = 1.2, b = 0.75).
 *
 * For every query term present in the chunk, adds
 * `idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * len / avgLen))`, with
 * `idf = ln(1 + (N - df + 0.5) / (df + 0.5))`.
 *
 * @param {object} params
 * @param {string[]} params.queryTerms
 * @param {{ tokenCount: number, termFrequency: Map<string, number> }} params.chunk
 * @param {Map<string, number>} params.docFrequencyByTerm Number of chunks containing each term.
 * @param {number} params.totalChunkCount
 * @param {number} params.averageChunkLength Mean token count; when not positive the length ratio is taken as 1.
 * @returns {number}
 */
function bm25Score(params) {
    const K1 = 1.2;
    const B = 0.75;
    const { queryTerms, chunk, docFrequencyByTerm, totalChunkCount, averageChunkLength } = params;
    let total = 0;
    for (const term of queryTerms) {
        const tf = chunk.termFrequency.get(term) ?? 0;
        if (tf === 0) {
            continue;
        }
        const df = docFrequencyByTerm.get(term) ?? 0;
        const idf = Math.log(1 + (totalChunkCount - df + 0.5) / (df + 0.5));
        const lengthRatio = averageChunkLength > 0 ? chunk.tokenCount / averageChunkLength : 1;
        const saturation = (tf * (K1 + 1)) / (tf + K1 * (1 - B + B * lengthRatio));
        total += idf * saturation;
    }
    return total;
}
|
|
378
|
-
/**
 * Applies query-coverage, phrase and boilerplate adjustments to a base score.
 *
 * - Non-positive base scores yield 0.
 * - For queries of three or more terms, chunks matching fewer than two terms yield 0.
 * - Coverage (matched terms / query terms) boosts the score by 1.25 (full),
 *   1.15 (>= 0.7) or 1.08 (>= 0.5); for queries of three or more terms,
 *   coverage <= 0.25 scales it by 0.8.
 * - Each matched bigram adds 8%, capped at 24%.
 * - Chunks that look like cover-page boilerplate are scaled by 0.45.
 *
 * @param {object} params
 * @param {number} params.baseScore
 * @param {{ text: string, lineStart: number, termFrequency: Map<string, number> }} params.chunk
 * @param {string[]} params.queryTerms
 * @param {string[]} params.queryBigrams
 * @returns {number}
 */
function adjustedChunkScore(params) {
    const { baseScore, chunk, queryTerms, queryBigrams } = params;
    if (baseScore <= 0) {
        return 0;
    }
    const isLongQuery = queryTerms.length >= 3;
    const matchedTerms = countTermHits(queryTerms, chunk.termFrequency);
    if (isLongQuery && matchedTerms < 2) {
        return 0;
    }
    const coverage = matchedTerms / Math.max(1, queryTerms.length);
    const matchedBigrams = countBigramHits(chunk.text, queryBigrams);
    const coverageFactor = () => {
        if (coverage >= 1) {
            return 1.25;
        }
        if (coverage >= 0.7) {
            return 1.15;
        }
        if (coverage >= 0.5) {
            return 1.08;
        }
        if (isLongQuery && coverage <= 0.25) {
            return 0.8;
        }
        return 1;
    };
    let multiplier = coverageFactor();
    if (matchedBigrams > 0) {
        multiplier *= 1 + Math.min(0.24, matchedBigrams * 0.08);
    }
    if (looksLikeCoverBoilerplate(chunk)) {
        multiplier *= 0.45;
    }
    return baseScore * multiplier;
}
|
|
409
|
-
/**
 * Collapses runs of spaces/tabs to a single space, limits consecutive
 * newlines to two, and trims the result.
 *
 * @param {string} value
 * @returns {string}
 */
function compactWhitespace(value) {
    const singleSpaced = value.replace(/[ \t]+/g, ' ');
    const fewerBlankLines = singleSpaced.replace(/\n{3,}/g, '\n\n');
    return fewerBlankLines.trim();
}
|
|
412
|
-
/**
 * Shortens `value` to at most `maxChars` UTF-16 code units, marking the cut
 * with a trailing `...`.
 *
 * Compared with a plain slice this never leaves half of a surrogate pair in
 * front of the ellipsis, and it never returns more than `maxChars` units even
 * when the budget is smaller than the ellipsis itself.
 *
 * @param {string} value
 * @param {number} maxChars
 * @returns {string} `value` itself when it already fits.
 */
function trimExcerpt(value, maxChars) {
    if (value.length <= maxChars) {
        return value;
    }
    const ellipsis = '...';
    if (maxChars <= ellipsis.length) {
        // No room for any text next to the marker: fit the marker to the budget.
        return ellipsis.slice(0, Math.max(0, maxChars));
    }
    let cut = maxChars - ellipsis.length;
    const lastUnit = value.charCodeAt(cut - 1);
    // A high surrogate at the cut would be orphaned from its low surrogate.
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        cut -= 1;
    }
    return `${value.slice(0, cut).trimEnd()}${ellipsis}`;
}
|
|
418
|
-
/**
 * Ranks line-window chunks of the given documents against a query using
 * bm25Score refined by adjustedChunkScore.
 *
 * @param {object} params
 * @param {string} params.query
 * @param {string[]} params.docPaths
 * @param {number} params.topK Maximum number of results returned.
 * @param {number} params.chunkLines
 * @param {number} params.chunkOverlap Must be smaller than `chunkLines`.
 * @returns {Promise<{ data: object }>} Query echo, per-document stats and the
 *   ranked excerpts (each trimmed to 1200 characters). When no document yields
 *   a chunk, the payload omits `query_terms` and `chunk_count`.
 * @throws {CLIError} VALIDATION_ERROR for an empty query, an overlap that is
 *   not smaller than the chunk size, or a query without alphanumeric tokens;
 *   errors from ensureReadableTextFile propagate.
 */
async function runLexicalSearch(params) {
    const query = params.query.trim();
    if (query.length === 0) {
        throw new CLIError(ErrorCode.VALIDATION_ERROR, 'Query must not be empty');
    }
    if (params.chunkOverlap >= params.chunkLines) {
        throw new CLIError(ErrorCode.VALIDATION_ERROR, '--chunk-overlap must be less than --chunk-lines');
    }
    const { chunkLines, chunkOverlap } = params;
    const loadDoc = async (docPath) => {
        const content = await ensureReadableTextFile(docPath);
        return {
            path: docPath,
            bytes: Buffer.byteLength(content, 'utf8'),
            lineCount: content.split(/\r?\n/).length,
            chunks: chunkDocument({ docPath, content, chunkLines, chunkOverlap })
        };
    };
    const docs = await Promise.all(params.docPaths.map((docPath) => loadDoc(docPath)));
    const docSummaries = docs.map((doc) => ({
        path: doc.path,
        bytes: doc.bytes,
        line_count: doc.lineCount
    }));
    const allChunks = docs.flatMap((doc) => doc.chunks);
    if (allChunks.length === 0) {
        return {
            data: {
                query,
                backend: 'lexical',
                docs: docSummaries,
                result_count: 0,
                results: []
            }
        };
    }
    const queryTerms = buildQueryTerms(query);
    if (queryTerms.length === 0) {
        throw new CLIError(ErrorCode.VALIDATION_ERROR, 'Query must contain at least one alphanumeric token');
    }
    const queryBigrams = buildQueryBigrams(queryTerms);
    // Number of chunks containing each query term.
    const docFrequencyByTerm = new Map(queryTerms.map((term) => [
        term,
        allChunks.filter((chunk) => (chunk.termFrequency.get(term) ?? 0) > 0).length
    ]));
    let totalTokens = 0;
    for (const chunk of allChunks) {
        totalTokens += chunk.tokenCount;
    }
    const averageChunkLength = totalTokens / Math.max(allChunks.length, 1);
    const ranked = [];
    for (const chunk of allChunks) {
        const baseScore = bm25Score({
            queryTerms,
            chunk,
            docFrequencyByTerm,
            totalChunkCount: allChunks.length,
            averageChunkLength
        });
        const score = adjustedChunkScore({ chunk, baseScore, queryTerms, queryBigrams });
        if (score > 0) {
            ranked.push({ chunk, score });
        }
    }
    ranked.sort((a, b) => b.score - a.score);
    const top = ranked.slice(0, params.topK);
    const results = top.map(({ chunk, score }, idx) => ({
        rank: idx + 1,
        score: Number(score.toFixed(6)),
        path: chunk.docPath,
        accession: chunk.accession,
        line_start: chunk.lineStart,
        line_end: chunk.lineEnd,
        excerpt: trimExcerpt(compactWhitespace(chunk.text), 1200)
    }));
    return {
        data: {
            query,
            backend: 'lexical',
            query_terms: queryTerms,
            docs: docSummaries,
            chunk_count: allChunks.length,
            result_count: top.length,
            results
        }
    };
}
|
|
518
|
-
/**
 * Syncs the filings selected by a research profile into the local cache and
 * writes the profile manifest.
 *
 * For each rule of the profile the matching filings are listed via
 * runFilingsList; rows are de-duplicated by accession, ordered newest filing
 * date first, materialized through materializeCachedDocs, and recorded in a
 * version-1 manifest.
 *
 * @param {object} params
 * @param {string} params.id Ticker/CIK-style identifier handed to resolveEntity.
 * @param {string} params.profile One of the keys of PROFILE_RULES.
 * @param {string} [params.cacheDir]
 * @param {boolean} [params.refresh] Re-fetch filings even when cached.
 * @param {object} context Must provide `secClient`; passed on to the filings commands.
 * @returns {Promise<{ data: object }>} Summary with counts, skipped filings,
 *   manifest path and the cached docs.
 * @throws {CLIError} VALIDATION_ERROR when `params.profile` is not a known profile.
 */
export async function runResearchSync(params, context) {
    // Reject unknown profiles (including inherited keys such as "constructor")
    // before doing any network work, instead of failing later with an opaque
    // "rules is not iterable" TypeError.
    if (!Object.prototype.hasOwnProperty.call(PROFILE_RULES, params.profile)) {
        throw new CLIError(ErrorCode.VALIDATION_ERROR, '--profile must be one of core|events|financials');
    }
    const rules = PROFILE_RULES[params.profile];
    const entity = await resolveEntity(params.id, context.secClient, { strictMapMatch: false });
    const cacheRoot = resolveCacheRoot(params.cacheDir);
    const selectedByAccession = new Map();
    for (const rule of rules) {
        const listResult = await runFilingsList({
            id: entity.cik,
            form: rule.form,
            from: rule.recentDays ? dateDaysAgo(rule.recentDays) : undefined,
            queryLimit: rule.queryLimit
        }, context);
        for (const row of listResult.data) {
            // The first rule that returns a filing wins; later duplicates are ignored.
            if (!selectedByAccession.has(row.accession)) {
                selectedByAccession.set(row.accession, row);
            }
        }
    }
    const selectedRows = [...selectedByAccession.values()].sort((a, b) => (b.filingDate ?? '').localeCompare(a.filingDate ?? ''));
    const { docs, fetchedCount, reusedCount, skipped } = await materializeCachedDocs({
        cacheRoot,
        cik: entity.cik,
        rows: selectedRows,
        refresh: params.refresh,
        context
    });
    const manifest = {
        version: 1,
        id_input: params.id,
        cik: entity.cik,
        ticker: entity.ticker,
        title: entity.title,
        profile: params.profile,
        synced_at: nowIso(),
        docs
    };
    const { manifestPath } = await writeCachedManifest(cacheRoot, manifest);
    return {
        data: {
            id: params.id,
            cik: entity.cik,
            ticker: entity.ticker,
            title: entity.title,
            profile: params.profile,
            cache_root: cacheRoot,
            manifest_path: manifestPath,
            docs_count: docs.length,
            fetched_count: fetchedCount,
            reused_count: reusedCount,
            skipped_count: skipped.length,
            skipped,
            docs
        }
    };
}
|
|
574
|
-
/**
 * Runs a lexical search over explicitly supplied documents (`--doc` paths
 * and/or a manifest file).
 *
 * @param {object} params
 * @param {string} params.query
 * @param {string[]} params.docs
 * @param {string} [params.manifestPath]
 * @param {number} params.topK
 * @param {number} params.chunkLines
 * @param {number} params.chunkOverlap
 * @param {object} context Unused here.
 * @returns {Promise<{ data: object }>} Result of runLexicalSearch.
 * @throws {CLIError} DOCS_REQUIRED when no document path remains after loading.
 */
export async function runResearchAsk(params, context) {
    void context;
    const { docs, manifestPath, query, topK, chunkLines, chunkOverlap } = params;
    const docPaths = await loadDocPaths({ docs, manifestPath });
    if (docPaths.length === 0) {
        throw new CLIError(ErrorCode.DOCS_REQUIRED, 'At least one document is required. Pass --doc <path> and/or --manifest <path>.');
    }
    return runLexicalSearch({ query, docPaths, topK, chunkLines, chunkOverlap });
}
|
|
588
|
-
/**
 * Answers a query for a company identifier, fetching or reusing cached
 * filings as needed.
 *
 * Two modes:
 *  - Scoped (`params.scope.form` non-blank or `params.scope.latest` defined):
 *    filings are listed with that form/limit, materialized into the cache and
 *    searched directly; no profile manifest is read or written.
 *  - Profile: the cached profile manifest is used unless `refresh` is set;
 *    when it is missing or has no docs, runResearchSync is run first and the
 *    manifest is read again.
 *
 * @param {object} params
 * @param {string} params.id
 * @param {string} params.query
 * @param {string} params.profile
 * @param {{ form?: string, latest?: number }} [params.scope]
 * @param {string} [params.cacheDir]
 * @param {boolean} [params.refresh]
 * @param {number} params.topK
 * @param {number} params.chunkLines
 * @param {number} params.chunkOverlap
 * @param {object} context Must provide `secClient`.
 * @returns {Promise<{ data: object }>} Search payload extended with entity,
 *   cache and sync information.
 * @throws {CLIError} NOT_FOUND when a scoped listing is empty; DOCS_REQUIRED
 *   when no documents are available to search.
 */
export async function runResearchAskById(params, context) {
    const cacheRoot = resolveCacheRoot(params.cacheDir);
    const entity = await resolveEntity(params.id, context.secClient, { strictMapMatch: false });
    const entityFields = {
        id: params.id,
        cik: entity.cik,
        ticker: entity.ticker,
        title: entity.title
    };
    const search = (docPaths) => runLexicalSearch({
        query: params.query,
        docPaths,
        topK: params.topK,
        chunkLines: params.chunkLines,
        chunkOverlap: params.chunkOverlap
    });
    const form = normalizeForm(params.scope?.form);
    const isScoped = Boolean(form) || params.scope?.latest !== undefined;
    if (isScoped) {
        const latest = params.scope?.latest;
        const { data: selectedRows } = await runFilingsList({ id: entity.cik, form, queryLimit: latest }, context);
        if (selectedRows.length === 0) {
            const formLabel = form ?? 'any form';
            throw new CLIError(ErrorCode.NOT_FOUND, `No filings found for ${params.id} using ${formLabel}.`);
        }
        const { docs, fetchedCount, reusedCount, skipped } = await materializeCachedDocs({
            cacheRoot,
            cik: entity.cik,
            rows: selectedRows,
            refresh: params.refresh,
            context
        });
        if (docs.length === 0) {
            throw new CLIError(ErrorCode.DOCS_REQUIRED, `No queryable filings were fetched for ${params.id}.`);
        }
        const { data: scopedSearchData } = await search(docs.map((doc) => doc.path));
        return {
            data: {
                ...scopedSearchData,
                ...entityFields,
                cache_root: cacheRoot,
                scope: {
                    form: form ?? null,
                    latest: latest ?? null
                },
                corpus_docs_count: docs.length,
                selected_filings: docs,
                sync: {
                    fetched_count: fetchedCount,
                    reused_count: reusedCount,
                    docs_count: docs.length,
                    skipped_count: skipped.length,
                    skipped
                }
            }
        };
    }
    const isUsable = (candidate) => Boolean(candidate) && candidate.docs.length !== 0;
    const countOrZero = (candidate) => (typeof candidate === 'number' ? candidate : 0);
    // With --refresh the cached manifest is ignored so a sync is always triggered.
    let manifest = params.refresh
        ? null
        : await readCachedManifest(cacheRoot, entity.cik, params.profile);
    let syncData = null;
    if (!isUsable(manifest)) {
        const { data: syncPayload } = await runResearchSync({
            id: params.id,
            profile: params.profile,
            cacheDir: params.cacheDir,
            refresh: params.refresh
        }, context);
        syncData = {
            fetched_count: countOrZero(syncPayload.fetched_count),
            reused_count: countOrZero(syncPayload.reused_count),
            docs_count: countOrZero(syncPayload.docs_count),
            skipped_count: countOrZero(syncPayload.skipped_count)
        };
        manifest = await readCachedManifest(cacheRoot, entity.cik, params.profile);
    }
    if (!isUsable(manifest)) {
        throw new CLIError(ErrorCode.DOCS_REQUIRED, `No cached documents found for ${params.id} profile ${params.profile}. Run research sync first.`);
    }
    const cachedDocCount = manifest.docs.length;
    const { data: searchData } = await search(manifest.docs.map((doc) => doc.path));
    return {
        data: {
            ...searchData,
            ...entityFields,
            profile: params.profile,
            cache_root: cacheRoot,
            manifest_path: profileManifestPath(cacheRoot, entity.cik, params.profile),
            corpus_docs_count: cachedDocCount,
            // No sync ran: everything came from the existing cache.
            sync: syncData ?? {
                fetched_count: 0,
                reused_count: cachedDocCount,
                docs_count: cachedDocCount,
                skipped_count: 0
            }
        }
    };
}
|