edgar-cli 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -12,6 +12,8 @@ Agent-friendly SEC EDGAR CLI for filings and company facts.
12
12
  - `filings list`
13
13
  - `filings get`
14
14
  - `facts get`
15
+ - `research sync`
16
+ - `research ask`
15
17
 
16
18
  ## Install / Run
17
19
 
@@ -57,10 +59,38 @@ npx edgar-cli --user-agent "Your Name your.email@example.com" filings list --id
57
59
  # Get filing document URL by accession
58
60
  npx edgar-cli --user-agent "Your Name your.email@example.com" filings get --id AAPL --accession 0000320193-26-000006 --format url
59
61
 
62
+ # Get filing converted to Markdown
63
+ npx edgar-cli --user-agent "Your Name your.email@example.com" filings get --id AAPL --accession 0000320193-26-000006 --format markdown
64
+
60
65
  # Get concept data (latest per unit)
61
66
  npx edgar-cli --user-agent "Your Name your.email@example.com" facts get --id AAPL --taxonomy us-gaap --concept Revenues --latest
67
+
68
+ # Query explicit local docs (repeat --doc or pass --manifest)
69
+ npx edgar-cli research ask "board resignation details" --doc ./cache/nvda-8k.md --top-k 5
70
+
71
+ # Build a deterministic cached corpus for a ticker/profile
72
+ npx edgar-cli --user-agent "Your Name your.email@example.com" research sync --id NVDA --profile core
73
+
74
+ # Query by ticker against cached corpus (auto-syncs on cache miss)
75
+ npx edgar-cli --user-agent "Your Name your.email@example.com" research ask "what changed on the board?" --id NVDA --profile core
62
76
  ```
63
77
 
78
+ ## Research Profiles and Cache
79
+
80
+ `research sync` and `research ask --id` use deterministic filing profiles:
81
+
82
+ - `core`: latest 1x `10-K`, latest 3x `10-Q`, and recent `8-K` (last 180 days, up to 12)
83
+ - `events`: recent `8-K` (last 365 days, up to 24)
84
+ - `financials`: latest 2x `10-K` and latest 6x `10-Q`
85
+
86
+ By default, cached corpora are stored in:
87
+
88
+ - `$EDGAR_CACHE_DIR` (if set), else
89
+ - `$XDG_CACHE_HOME/edgar-cli` (if set), else
90
+ - `~/.cache/edgar-cli`
91
+
92
+ Override per command with `--cache-dir`.
93
+
64
94
  ## Output Contract (default)
65
95
 
66
96
  All JSON-mode commands emit:
package/dist/cli.js CHANGED
@@ -1,8 +1,10 @@
1
1
  #!/usr/bin/env node
2
+ import { realpathSync } from 'node:fs';
2
3
  import { Command, CommanderError } from 'commander';
3
- import { pathToFileURL } from 'node:url';
4
+ import { fileURLToPath } from 'node:url';
4
5
  import { runFactsGet } from './commands/facts.js';
5
6
  import { runFilingsGet, runFilingsList } from './commands/filings.js';
7
+ import { parseResearchProfile, runResearchAsk, runResearchAskById, runResearchSync } from './commands/research.js';
6
8
  import { runResolve } from './commands/resolve.js';
7
9
  import { buildRuntimeOptions, parseDateString, parseNonNegativeInt, parsePositiveInt, requireUserAgent } from './core/config.js';
8
10
  import { failureEnvelope, successEnvelope } from './core/envelope.js';
@@ -72,7 +74,7 @@ function toCliError(err) {
72
74
  }
73
75
  return new CLIError(ErrorCode.INTERNAL_ERROR, err.message || 'Unexpected error');
74
76
  }
75
- async function executeCommand(command, commandObj, io, handler) {
77
+ async function executeCommand(command, commandObj, io, handler, options) {
76
78
  const globalOptions = commandObj.optsWithGlobals();
77
79
  const runtime = buildRuntimeOptions({
78
80
  json: globalOptions.json,
@@ -84,7 +86,10 @@ async function executeCommand(command, commandObj, io, handler) {
84
86
  userAgent: globalOptions.userAgent
85
87
  }, io.env);
86
88
  try {
87
- const userAgent = requireUserAgent(runtime.userAgent);
89
+ const requiresSecIdentity = options?.requiresSecIdentity ?? true;
90
+ const userAgent = requiresSecIdentity
91
+ ? requireUserAgent(runtime.userAgent)
92
+ : runtime.userAgent ?? 'edgar-cli local research';
88
93
  const secClient = new SecClient({
89
94
  userAgent,
90
95
  verbose: runtime.verbose,
@@ -164,13 +169,13 @@ export function buildProgram(io) {
164
169
  .command('get')
165
170
  .requiredOption('--id <id>', 'Ticker or CIK')
166
171
  .requiredOption('--accession <accession>', 'Accession number: XXXXXXXXXX-XX-XXXXXX')
167
- .option('--format <format>', 'url|html|text', 'url')
172
+ .option('--format <format>', 'url|html|text|markdown', 'url')
168
173
  .action(async function actionFilingsGet(options) {
169
174
  const format = options.format;
170
- if (!['url', 'html', 'text'].includes(format)) {
175
+ if (!['url', 'html', 'text', 'markdown'].includes(format)) {
171
176
  throw new CLIAbortError(emitError({
172
177
  command: 'filings get',
173
- err: new CLIError(ErrorCode.VALIDATION_ERROR, '--format must be one of url|html|text'),
178
+ err: new CLIError(ErrorCode.VALIDATION_ERROR, '--format must be one of url|html|text|markdown'),
174
179
  runtimeView: 'summary',
175
180
  humanMode: false,
176
181
  io
@@ -209,8 +214,69 @@ export function buildProgram(io) {
209
214
  latest: Boolean(options.latest)
210
215
  }, context));
211
216
  });
217
+ const research = program
218
+ .command('research')
219
+ .description('Run deterministic research workflows over explicit docs or cached filing profiles');
220
+ research
221
+ .command('sync')
222
+ .description('Cache a deterministic research corpus for a company/profile')
223
+ .requiredOption('--id <id>', 'Ticker or CIK')
224
+ .option('--profile <profile>', 'core|events|financials', 'core')
225
+ .option('--cache-dir <path>', 'Override cache directory')
226
+ .option('--refresh', 'Force refetch even when cached docs exist')
227
+ .action(async function actionResearchSync(options) {
228
+ const profile = parseResearchProfile(options.profile);
229
+ await executeCommand('research sync', this, io, async (context) => runResearchSync({
230
+ id: options.id,
231
+ profile,
232
+ cacheDir: options.cacheDir,
233
+ refresh: Boolean(options.refresh)
234
+ }, context), { requiresSecIdentity: true });
235
+ });
236
+ research
237
+ .command('ask')
238
+ .description('Query explicitly provided local docs, or a cached company profile corpus when --id is used')
239
+ .argument('<query>', 'Natural language query')
240
+ .option('--id <id>', 'Ticker or CIK for cached/profile-based research')
241
+ .option('--profile <profile>', 'core|events|financials (used with --id)', 'core')
242
+ .option('--cache-dir <path>', 'Override cache directory')
243
+ .option('--refresh', 'With --id, force refetch of filings before querying')
244
+ .option('--doc <path>', 'Path to a local document (repeatable)', collectValues, [])
245
+ .option('--manifest <path>', 'Path to JSON manifest: either ["doc1", ...] or {"docs": ["doc1", ...]}')
246
+ .option('--top-k <n>', 'Maximum number of chunks to return', '8')
247
+ .option('--chunk-lines <n>', 'Number of lines per retrieval chunk', '40')
248
+ .option('--chunk-overlap <n>', 'Line overlap between retrieval chunks', '10')
249
+ .action(async function actionResearchAsk(query, options) {
250
+ const topK = parsePositiveInt(options.topK, '--top-k');
251
+ const chunkLines = parsePositiveInt(options.chunkLines, '--chunk-lines');
252
+ const chunkOverlap = parseNonNegativeInt(options.chunkOverlap, '--chunk-overlap');
253
+ const requiresSecIdentity = Boolean(options.id);
254
+ const profile = parseResearchProfile(options.profile);
255
+ await executeCommand('research ask', this, io, async (context) => options.id
256
+ ? runResearchAskById({
257
+ id: options.id,
258
+ query,
259
+ profile,
260
+ cacheDir: options.cacheDir,
261
+ refresh: Boolean(options.refresh),
262
+ topK,
263
+ chunkLines,
264
+ chunkOverlap
265
+ }, context)
266
+ : runResearchAsk({
267
+ query,
268
+ docs: options.doc ?? [],
269
+ manifestPath: options.manifest,
270
+ topK,
271
+ chunkLines,
272
+ chunkOverlap
273
+ }, context), { requiresSecIdentity });
274
+ });
212
275
  return program;
213
276
  }
277
/**
 * Accumulator for repeatable command-line options: returns the values
 * collected so far plus the newest one, leaving the input array untouched.
 *
 * @param {string} value - The option value that was just parsed.
 * @param {string[]} previous - Values gathered from earlier occurrences.
 * @returns {string[]} A new array ending with `value`.
 */
function collectValues(value, previous) {
    return previous.concat([value]);
}
214
280
  export async function runCli(argv, io = defaultIo()) {
215
281
  const program = buildProgram(io);
216
282
  try {
@@ -229,7 +295,19 @@ export async function runCli(argv, io = defaultIo()) {
229
295
  return EXIT_CODE_MAP[cliError.code] ?? 10;
230
296
  }
231
297
  }
232
- if (import.meta.url === pathToFileURL(process.argv[1] ?? '').href) {
298
/**
 * Reports whether this module is the script Node was launched with.
 *
 * Both `process.argv[1]` and this module's own path are passed through
 * `realpathSync` before comparison, so the two are compared as resolved
 * filesystem paths.
 *
 * @returns {boolean} `true` when the resolved paths are identical; `false`
 *   when there is no script argument or either path cannot be resolved.
 */
function isDirectExecution() {
    const [, entryScript] = process.argv;
    if (entryScript) {
        try {
            const invokedPath = realpathSync(entryScript);
            const modulePath = realpathSync(fileURLToPath(import.meta.url));
            return invokedPath === modulePath;
        }
        catch {
            // An unresolvable path means we cannot be the entry script.
        }
    }
    return false;
}
310
+ if (isDirectExecution()) {
233
311
  runCli(process.argv.slice(2)).then((exitCode) => {
234
312
  process.exit(exitCode);
235
313
  });
@@ -10,5 +10,5 @@ export declare function runFilingsList(params: {
10
10
  export declare function runFilingsGet(params: {
11
11
  id: string;
12
12
  accession: string;
13
- format: 'url' | 'html' | 'text';
13
+ format: 'url' | 'html' | 'text' | 'markdown';
14
14
  }, context: CommandContext): Promise<CommandResult>;
@@ -1,8 +1,85 @@
1
- import * as cheerio from 'cheerio';
1
+ import TurndownService from 'turndown';
2
+ import { gfm } from '@joplin/turndown-plugin-gfm';
2
3
  import { CLIError, ErrorCode } from '../core/errors.js';
3
4
  import { filingDocumentUrl, submissionsUrl } from '../sec/endpoints.js';
4
5
  import { dateInRange, normalizeAccession } from '../sec/normalizers.js';
5
6
  import { resolveEntity } from '../sec/ticker-map.js';
7
/**
 * Creates the Turndown service used to convert filing HTML to Markdown.
 *
 * The service is configured with ATX headings, `-` bullets, fenced code
 * blocks and inlined links, has the GFM plugin applied, and drops
 * script/style/noscript/iframe/canvas elements from the output.
 *
 * @returns {TurndownService} The configured converter.
 */
function buildMarkdownConverter() {
    const converterOptions = {
        headingStyle: 'atx',
        hr: '---',
        bulletListMarker: '-',
        codeBlockStyle: 'fenced',
        fence: '```',
        emDelimiter: '*',
        strongDelimiter: '**',
        linkStyle: 'inlined'
    };
    const removedElements = ['script', 'style', 'noscript', 'iframe', 'canvas'];
    const converter = new TurndownService(converterOptions);
    converter.use(gfm);
    converter.remove(removedElements);
    return converter;
}
22
+ const markdownConverter = buildMarkdownConverter();
23
/**
 * Removes `<ix:header>`, `<ix:hidden>` and `<ix:resources>` elements,
 * together with everything inside them, from an HTML string.
 *
 * Matching is case-insensitive and non-greedy, and the three element kinds
 * are stripped in that order.
 *
 * @param {string} content - Raw HTML.
 * @returns {string} The HTML without those elements.
 */
function stripInlineXbrlHeaders(content) {
    const metadataBlocks = [
        /<ix:header[\s\S]*?<\/ix:header>/gi,
        /<ix:hidden[\s\S]*?<\/ix:hidden>/gi,
        /<ix:resources[\s\S]*?<\/ix:resources>/gi
    ];
    let stripped = content;
    for (const blockPattern of metadataBlocks) {
        stripped = stripped.replace(blockPattern, '');
    }
    return stripped;
}
29
/**
 * Splits one Markdown table row into trimmed cell strings.
 *
 * One leading and one trailing delimiter pipe are dropped, so `| a | b |`
 * yields `['a', 'b']`. A pipe preceded by a backslash (`\|`) is cell content
 * rather than a column delimiter, so it stays inside its cell with the
 * escape sequence intact instead of splitting the cell in two.
 *
 * @param {string} line - A single table row.
 * @returns {string[]} The cell contents, each trimmed. Always has at least
 *   one entry (an empty line yields `['']`).
 */
function splitMarkdownTableCells(line) {
    const trimmed = line.trim();
    const row = trimmed.startsWith('|') ? trimmed.slice(1) : trimmed;
    const cells = [];
    let current = '';
    // True only when the last character consumed was an unescaped pipe,
    // i.e. the row ends with a closing delimiter and has no trailing cell.
    let endedOnDelimiter = false;
    for (let idx = 0; idx < row.length; idx += 1) {
        const char = row[idx];
        endedOnDelimiter = false;
        if (char === '\\' && idx + 1 < row.length) {
            // Keep the escape and the escaped character together so an
            // escaped pipe is never mistaken for a delimiter.
            current += char + row[idx + 1];
            idx += 1;
        }
        else if (char === '|') {
            cells.push(current);
            current = '';
            endedOnDelimiter = true;
        }
        else {
            current += char;
        }
    }
    if (!endedOnDelimiter) {
        cells.push(current);
    }
    return cells.map((cell) => cell.trim());
}
37
/**
 * Tells whether a line is a Markdown table separator row, i.e. every cell is
 * at least three dashes with an optional colon on either side (whitespace
 * inside a cell is ignored).
 *
 * @param {string} line - A single line of Markdown.
 * @returns {boolean} `true` when every cell matches the separator shape.
 */
function isMarkdownTableSeparatorLine(line) {
    const columns = splitMarkdownTableCells(line);
    if (columns.length === 0) {
        return false;
    }
    for (const column of columns) {
        const compact = column.replace(/\s+/g, '');
        if (!/^:?-{3,}:?$/.test(compact)) {
            return false;
        }
    }
    return true;
}
44
/**
 * Rewrites Markdown so that tables used purely for layout become plain text.
 *
 * Consecutive lines starting with `|` are treated as one table block. A
 * block is flattened only when it contains a separator row and its data rows
 * are sparse: no row has more than one non-empty cell, or rows average at
 * most 1.2 non-empty cells. Flattened rows become one whitespace-collapsed
 * line each, followed by a blank line. All other blocks and lines pass
 * through untouched. Finally, runs of three or more newlines are reduced to
 * two and the result is trimmed.
 *
 * @param {string} markdown - Markdown text, `\n`-delimited.
 * @returns {string} The Markdown with layout tables flattened.
 */
function collapseLayoutTables(markdown) {
    const isTableLine = (text) => text.trimStart().startsWith('|');
    const filledCells = (row) => splitMarkdownTableCells(row).filter((cell) => cell.length > 0);
    const sourceLines = markdown.split('\n');
    const result = [];
    let cursor = 0;
    while (cursor < sourceLines.length) {
        if (!isTableLine(sourceLines[cursor])) {
            result.push(sourceLines[cursor]);
            cursor += 1;
            continue;
        }
        // Gather the whole run of table lines starting at the cursor.
        let blockEnd = cursor + 1;
        while (blockEnd < sourceLines.length && isTableLine(sourceLines[blockEnd])) {
            blockEnd += 1;
        }
        const block = sourceLines.slice(cursor, blockEnd);
        cursor = blockEnd;
        const contentRows = block.filter((row) => !isMarkdownTableSeparatorLine(row));
        // Same length means no separator row was present: not a real table.
        if (contentRows.length === block.length) {
            result.push(...block);
            continue;
        }
        const filledPerRow = contentRows.map((row) => filledCells(row));
        let widest = 0;
        let totalFilled = 0;
        for (const cells of filledPerRow) {
            widest = Math.max(widest, cells.length);
            totalFilled += cells.length;
        }
        const meanFilled = totalFilled / Math.max(filledPerRow.length, 1);
        if (widest > 1 && meanFilled > 1.2) {
            // Dense enough to be a genuine data table; keep it verbatim.
            result.push(...block);
            continue;
        }
        const paragraphs = [];
        for (const cells of filledPerRow) {
            const text = cells.join(' ').replace(/\s+/g, ' ').trim();
            if (text.length > 0) {
                paragraphs.push(text);
            }
        }
        if (paragraphs.length > 0) {
            result.push(...paragraphs, '');
        }
    }
    return result.join('\n').replace(/\n{3,}/g, '\n\n').trim();
}
6
83
  function zipRecentFilings(cik, recent) {
7
84
  if (!recent) {
8
85
  return [];
@@ -38,9 +115,17 @@ function zipRecentFilings(cik, recent) {
38
115
  }
39
116
  return rows;
40
117
  }
41
- function extractTextFromHtml(content) {
42
- const $ = cheerio.load(content);
43
- return $.text().replace(/\s+/g, ' ').trim();
118
+ function extractMarkdownFromHtml(content) {
119
+ const sanitizedHtml = stripInlineXbrlHeaders(content);
120
+ const markdown = markdownConverter
121
+ .turndown(sanitizedHtml)
122
+ .replace(/\u00a0/g, ' ')
123
+ .replace(/\r/g, '')
124
+ .replace(/[ \t]+\n/g, '\n')
125
+ .replace(/\n[ \t]+/g, '\n')
126
+ .replace(/\n{3,}/g, '\n\n')
127
+ .trim();
128
+ return collapseLayoutTables(markdown);
44
129
  }
45
130
  export async function runFilingsList(params, context) {
46
131
  const entity = await resolveEntity(params.id, context.secClient, { strictMapMatch: false });
@@ -107,7 +192,7 @@ export async function runFilingsGet(params, context) {
107
192
  data: {
108
193
  accession: match.accession,
109
194
  url: match.filingUrl,
110
- content: extractTextFromHtml(content)
195
+ content: extractMarkdownFromHtml(content)
111
196
  }
112
197
  };
113
198
  }
@@ -0,0 +1,28 @@
1
+ import { CommandContext, CommandResult } from '../core/runtime.js';
2
+ type ResearchProfile = 'core' | 'events' | 'financials';
3
+ export declare function parseResearchProfile(value: string): ResearchProfile;
4
+ export declare function runResearchSync(params: {
5
+ id: string;
6
+ profile: ResearchProfile;
7
+ cacheDir?: string;
8
+ refresh?: boolean;
9
+ }, context: CommandContext): Promise<CommandResult>;
10
+ export declare function runResearchAsk(params: {
11
+ query: string;
12
+ docs: string[];
13
+ manifestPath?: string;
14
+ topK: number;
15
+ chunkLines: number;
16
+ chunkOverlap: number;
17
+ }, context: CommandContext): Promise<CommandResult>;
18
+ export declare function runResearchAskById(params: {
19
+ id: string;
20
+ query: string;
21
+ profile: ResearchProfile;
22
+ cacheDir?: string;
23
+ refresh?: boolean;
24
+ topK: number;
25
+ chunkLines: number;
26
+ chunkOverlap: number;
27
+ }, context: CommandContext): Promise<CommandResult>;
28
+ export {};
@@ -0,0 +1,623 @@
1
+ import { mkdir, readFile, stat, writeFile } from 'node:fs/promises';
2
+ import os from 'node:os';
3
+ import path from 'node:path';
4
+ import { runFilingsGet, runFilingsList } from './filings.js';
5
+ import { CLIError, ErrorCode } from '../core/errors.js';
6
+ import { resolveEntity } from '../sec/ticker-map.js';
7
// Filing-selection rules per research profile. Each rule names a form type
// and the `queryLimit` passed to the filings listing; `recentDays`, when
// present, restricts the listing to filings from that many days back.
const PROFILE_RULES = {
    core: [
        {
            form: '10-K',
            queryLimit: 1
        },
        {
            form: '10-Q',
            queryLimit: 3
        },
        {
            form: '8-K',
            queryLimit: 12,
            recentDays: 180
        }
    ],
    events: [
        {
            form: '8-K',
            queryLimit: 24,
            recentDays: 365
        }
    ],
    financials: [
        {
            form: '10-K',
            queryLimit: 2
        },
        {
            form: '10-Q',
            queryLimit: 6
        }
    ]
};
19
/**
 * Returns the current time as an ISO-8601 UTC string with the millisecond
 * component removed (e.g. `2026-01-02T03:04:05Z`).
 *
 * @returns {string} The current timestamp at second precision.
 */
function nowIso() {
    const stamp = new Date().toISOString();
    return stamp.replace(/\.\d{3}Z$/, 'Z');
}
22
/**
 * Formats a date as its UTC calendar day by keeping the first ten
 * characters of its ISO string (`YYYY-MM-DD` for four-digit years).
 *
 * @param {Date} date - The date to format.
 * @returns {string} The date portion of the ISO representation.
 */
function formatDateUtc(date) {
    const isoStamp = date.toISOString();
    return isoStamp.substring(0, 10);
}
25
/**
 * Returns the UTC calendar day that lies `days` days before now.
 *
 * @param {number} days - How many days to go back.
 * @returns {string} The resulting day as formatted by `formatDateUtc`.
 */
function dateDaysAgo(days) {
    const cutoff = new Date();
    // setUTCDate accepts out-of-range values and rolls months/years over.
    const shiftedDayOfMonth = cutoff.getUTCDate() - days;
    cutoff.setUTCDate(shiftedDayOfMonth);
    return formatDateUtc(cutoff);
}
30
/**
 * Picks the default cache directory from the environment.
 *
 * Precedence: `EDGAR_CACHE_DIR`, then `XDG_CACHE_HOME/edgar-cli`, then
 * `~/.cache/edgar-cli`. A variable that is unset or whitespace-only is
 * skipped.
 *
 * @returns {string} An absolute cache root path.
 */
function defaultCacheRoot() {
    const isSet = (setting) => Boolean(setting) && setting.trim().length > 0;
    const { EDGAR_CACHE_DIR: explicitDir, XDG_CACHE_HOME: xdgCacheHome } = process.env;
    if (isSet(explicitDir)) {
        return path.resolve(explicitDir);
    }
    if (isSet(xdgCacheHome)) {
        return path.resolve(xdgCacheHome, 'edgar-cli');
    }
    return path.resolve(os.homedir(), '.cache', 'edgar-cli');
}
39
/**
 * Resolves the cache root for a command: an explicit, non-blank `cacheDir`
 * wins; otherwise the environment-derived default is used.
 *
 * @param {string} [cacheDir] - Optional override directory.
 * @returns {string} An absolute cache root path.
 */
function resolveCacheRoot(cacheDir) {
    const hasOverride = Boolean(cacheDir) && cacheDir.trim().length > 0;
    return hasOverride ? path.resolve(cacheDir) : defaultCacheRoot();
}
45
/**
 * Builds the per-company cache directory:
 * `<cacheRoot>/research/companies/<cik>`.
 *
 * @param {string} cacheRoot - Cache root directory.
 * @param {string} cik - Company CIK used as the directory name.
 * @returns {string} The company's cache directory path.
 */
function companyCacheDir(cacheRoot, cik) {
    const segments = ['research', 'companies', cik];
    return path.join(cacheRoot, ...segments);
}
48
/**
 * Builds the manifest path for a company/profile pair:
 * `<company cache dir>/profiles/<profile>.json`.
 *
 * @param {string} cacheRoot - Cache root directory.
 * @param {string} cik - Company CIK.
 * @param {string} profile - Profile name used as the file stem.
 * @returns {string} Path of the profile manifest file.
 */
function profileManifestPath(cacheRoot, cik, profile) {
    const profilesDir = path.join(companyCacheDir(cacheRoot, cik), 'profiles');
    return path.join(profilesDir, `${profile}.json`);
}
51
/**
 * Builds the cached Markdown path for one filing:
 * `<company cache dir>/filings/<accession>.md`.
 *
 * @param {string} cacheRoot - Cache root directory.
 * @param {string} cik - Company CIK.
 * @param {string} accession - Accession number used as the file stem.
 * @returns {string} Path of the cached filing document.
 */
function filingDocPath(cacheRoot, cik, accession) {
    const filingsDir = path.join(companyCacheDir(cacheRoot, cik), 'filings');
    return path.join(filingsDir, `${accession}.md`);
}
54
/**
 * Validates the shape of a parsed cached manifest and returns it unchanged.
 *
 * A valid manifest is an object with `version === 1`, a string `cik`, and a
 * `docs` array whose entries each carry string `path` and `accession`.
 *
 * @param {unknown} value - The decoded JSON value.
 * @returns {object} The same value, once validated.
 * @throws {CLIError} `PARSE_ERROR` when the shape does not match.
 */
function parseCachedManifest(value) {
    const malformed = () => new CLIError(ErrorCode.PARSE_ERROR, 'Cached manifest is malformed');
    if (!value || typeof value !== 'object') {
        throw malformed();
    }
    const { version, cik, docs } = value;
    const hasValidDocs = Array.isArray(docs) &&
        docs.every((doc) => doc && typeof doc.path === 'string' && typeof doc.accession === 'string');
    if (version !== 1 || typeof cik !== 'string' || !hasValidDocs) {
        throw malformed();
    }
    return value;
}
67
/**
 * Loads and validates the cached manifest for a company/profile pair.
 *
 * @param {string} cacheRoot - Cache root directory.
 * @param {string} cik - Company CIK.
 * @param {string} profile - Profile name.
 * @returns {Promise<object|null>} The validated manifest, or `null` when the
 *   manifest file does not exist.
 * @throws {CLIError} `VALIDATION_ERROR` when the file cannot be read for a
 *   reason other than absence; `PARSE_ERROR` when it is not valid JSON or
 *   fails shape validation.
 */
async function readCachedManifest(cacheRoot, cik, profile) {
    const manifestPath = profileManifestPath(cacheRoot, cik, profile);
    let manifestText;
    try {
        manifestText = await readFile(manifestPath, 'utf8');
    }
    catch (cause) {
        // A missing manifest simply means nothing has been synced yet.
        if (cause.code !== 'ENOENT') {
            throw new CLIError(ErrorCode.VALIDATION_ERROR, `Unable to read cached manifest ${manifestPath}: ${cause.message}`);
        }
        return null;
    }
    let manifestJson;
    try {
        manifestJson = JSON.parse(manifestText);
    }
    catch {
        throw new CLIError(ErrorCode.PARSE_ERROR, `Cached manifest is not valid JSON: ${manifestPath}`);
    }
    return parseCachedManifest(manifestJson);
}
89
/**
 * Writes a manifest to its profile path as pretty-printed JSON with a
 * trailing newline, creating parent directories as needed.
 *
 * @param {string} cacheRoot - Cache root directory.
 * @param {object} manifest - Manifest to persist; its `cik` and `profile`
 *   determine the destination path.
 * @returns {Promise<{manifestPath: string}>} The path that was written.
 */
async function writeCachedManifest(cacheRoot, manifest) {
    const { cik, profile } = manifest;
    const manifestPath = profileManifestPath(cacheRoot, cik, profile);
    const parentDir = path.dirname(manifestPath);
    await mkdir(parentDir, { recursive: true });
    const serialized = `${JSON.stringify(manifest, null, 2)}\n`;
    await writeFile(manifestPath, serialized, 'utf8');
    return { manifestPath };
}
95
/**
 * Checks whether a path exists and is a regular file.
 *
 * @param {string} filePath - Path to inspect.
 * @returns {Promise<boolean>} `true` for a regular file; `false` when the
 *   path is missing or is not a regular file.
 * @throws {CLIError} `VALIDATION_ERROR` when `stat` fails for any reason
 *   other than the path not existing.
 */
async function fileExists(filePath) {
    let entry;
    try {
        entry = await stat(filePath);
    }
    catch (cause) {
        if (cause.code !== 'ENOENT') {
            throw new CLIError(ErrorCode.VALIDATION_ERROR, `Unable to stat ${filePath}: ${cause.message}`);
        }
        return false;
    }
    return entry.isFile();
}
108
/**
 * Normalizes and validates a `--profile` value.
 *
 * The input is trimmed and lower-cased before being checked against the
 * supported profile names.
 *
 * @param {string} value - Raw option value.
 * @returns {'core'|'events'|'financials'} The normalized profile name.
 * @throws {CLIError} `VALIDATION_ERROR` for any other value.
 */
export function parseResearchProfile(value) {
    const candidate = value.trim().toLowerCase();
    const supported = ['core', 'events', 'financials'];
    if (!supported.includes(candidate)) {
        throw new CLIError(ErrorCode.VALIDATION_ERROR, '--profile must be one of core|events|financials');
    }
    return candidate;
}
115
/**
 * Lower-cases text and extracts its runs of ASCII letters/digits, keeping
 * only tokens that are at least two characters long.
 *
 * @param {string} value - Text to tokenize.
 * @returns {string[]} Tokens in order of appearance (duplicates kept).
 */
function tokenize(value) {
    const words = value.toLowerCase().match(/[a-z0-9]+/g);
    if (words === null) {
        return [];
    }
    return words.filter((word) => word.length >= 2);
}
118
// Common English function words that are dropped from a query before
// ranking, unless dropping them would leave the query empty.
const QUERY_STOPWORDS = new Set([
    'a', 'an', 'and', 'are', 'as', 'at',
    'be', 'by',
    'for', 'from',
    'how',
    'in', 'into', 'is', 'it', 'its',
    'of', 'on', 'or',
    'that', 'the', 'their', 'there', 'these', 'they', 'this', 'to',
    'was', 'were', 'what', 'when', 'where', 'which', 'who', 'why', 'with'
]);
156
// Case-insensitive phrases used to recognize cover-page boilerplate in a
// chunk, so such chunks can be down-weighted during ranking.
const COVER_BOILERPLATE_PATTERNS = [
    /securities registered pursuant to section 12\(b\)/i,
    /indicate by check mark/i,
    /commission file number/i,
    /for the quarterly period ended/i,
    /for the fiscal year ended/i
];
163
/**
 * Removes duplicate tokens while keeping first-occurrence order.
 *
 * @param {string[]} tokens - Tokens, possibly with repeats.
 * @returns {string[]} A new array of distinct tokens.
 */
function uniqueTokens(tokens) {
    const distinct = new Set(tokens);
    return Array.from(distinct);
}
166
/**
 * Turns a query string into its distinct search terms.
 *
 * Stopwords are removed, unless the query consists only of stopwords, in
 * which case the unfiltered tokens are used instead.
 *
 * @param {string} query - The user's query.
 * @returns {string[]} Distinct terms in order of first appearance.
 */
function buildQueryTerms(query) {
    const allTokens = tokenize(query);
    const meaningful = allTokens.filter((token) => !QUERY_STOPWORDS.has(token));
    return uniqueTokens(meaningful.length === 0 ? allTokens : meaningful);
}
172
/**
 * Builds the distinct space-joined pairs of neighbouring query terms.
 *
 * @param {string[]} queryTerms - Terms in query order.
 * @returns {string[]} Each `"<term> <next term>"` pair, without duplicates.
 */
function buildQueryBigrams(queryTerms) {
    // Entry `idx` of the slice is the term that follows queryTerms[idx].
    const pairs = queryTerms
        .slice(1)
        .map((following, idx) => `${queryTerms[idx]} ${following}`);
    return uniqueTokens(pairs);
}
179
/**
 * Counts how many of the query terms occur at least once in a chunk.
 *
 * @param {string[]} queryTerms - Terms to look for.
 * @param {Map<string, number>} termFrequency - Occurrence counts per token.
 * @returns {number} Number of query terms with a positive count.
 */
function countTermHits(queryTerms, termFrequency) {
    const presentTerms = queryTerms.filter((term) => (termFrequency.get(term) ?? 0) > 0);
    return presentTerms.length;
}
182
/**
 * Counts how many query bigrams appear verbatim in the lower-cased chunk
 * text.
 *
 * @param {string} chunkText - Raw chunk text.
 * @param {string[]} queryBigrams - Space-joined term pairs.
 * @returns {number} Number of bigrams found as substrings.
 */
function countBigramHits(chunkText, queryBigrams) {
    if (queryBigrams.length === 0) {
        return 0;
    }
    const haystack = chunkText.toLowerCase();
    let hits = 0;
    for (const bigram of queryBigrams) {
        if (haystack.includes(bigram)) {
            hits += 1;
        }
    }
    return hits;
}
189
/**
 * Flags chunks that look like cover-page boilerplate.
 *
 * Only chunks starting at or before line 140 are considered; among those, a
 * chunk is flagged when its text matches any boilerplate pattern.
 *
 * @param {{lineStart: number, text: string}} chunk - Chunk to inspect.
 * @returns {boolean} `true` when the chunk matches a boilerplate pattern.
 */
function looksLikeCoverBoilerplate(chunk) {
    const pastCoverPage = chunk.lineStart > 140;
    return !pastCoverPage && COVER_BOILERPLATE_PATTERNS.some((pattern) => pattern.test(chunk.text));
}
195
/**
 * Tallies how often each token occurs.
 *
 * @param {Iterable<string>} tokens - Tokens to count.
 * @returns {Map<string, number>} Occurrence count per distinct token, keyed
 *   in order of first appearance.
 */
function buildTermFrequency(tokens) {
    const counts = new Map();
    for (const token of tokens) {
        const seenSoFar = counts.get(token) ?? 0;
        counts.set(token, seenSoFar + 1);
    }
    return counts;
}
202
/**
 * Finds the first accession-number-shaped substring (10 digits, 2 digits,
 * 6 digits, dash-separated) in a document path.
 *
 * @param {string} docPath - Path to search.
 * @returns {string|null} The matched accession number, or `null`.
 */
function extractAccession(docPath) {
    const found = /\d{10}-\d{2}-\d{6}/.exec(docPath);
    return found === null ? null : found[0];
}
206
/**
 * Normalizes a user-supplied manifest into `{ docs }`.
 *
 * Two shapes are accepted: a bare array of strings, or an object whose
 * `docs` property is an array of strings.
 *
 * @param {unknown} value - The decoded manifest JSON.
 * @returns {{docs: string[]}} The document paths listed by the manifest.
 * @throws {CLIError} `VALIDATION_ERROR` for any other shape.
 */
function parseManifest(value) {
    const isStringArray = (candidate) => Array.isArray(candidate) && candidate.every((entry) => typeof entry === 'string');
    if (isStringArray(value)) {
        return { docs: value };
    }
    if (value && typeof value === 'object' && isStringArray(value.docs)) {
        return { docs: value.docs };
    }
    throw new CLIError(ErrorCode.VALIDATION_ERROR, 'Manifest must be a JSON array of strings or object with a docs string array');
}
218
/**
 * Collects the document paths for a search from `--doc` values and an
 * optional manifest file.
 *
 * Entries are trimmed, blank entries dropped, and every path is resolved to
 * an absolute path. Option paths come first, then manifest paths, with
 * duplicates removed.
 *
 * @param {{docs: string[], manifestPath?: string}} params - Path sources.
 * @returns {Promise<string[]>} Distinct absolute document paths.
 * @throws {CLIError} `NOT_FOUND` when the manifest file is missing;
 *   `VALIDATION_ERROR` when it cannot be read, is not valid JSON, or has an
 *   unsupported shape.
 */
async function loadDocPaths(params) {
    const cleanPaths = (entries) => entries
        .map((entry) => entry.trim())
        .filter((entry) => entry.length > 0);
    const candidates = cleanPaths(params.docs);
    if (params.manifestPath) {
        const manifestFile = path.resolve(params.manifestPath);
        let manifestText;
        try {
            manifestText = await readFile(manifestFile, 'utf8');
        }
        catch (cause) {
            if (cause.code === 'ENOENT') {
                throw new CLIError(ErrorCode.NOT_FOUND, `Manifest not found: ${manifestFile}`);
            }
            throw new CLIError(ErrorCode.VALIDATION_ERROR, `Unable to read manifest ${manifestFile}: ${cause.message}`);
        }
        let manifestJson;
        try {
            manifestJson = JSON.parse(manifestText);
        }
        catch {
            throw new CLIError(ErrorCode.VALIDATION_ERROR, `Manifest is not valid JSON: ${manifestFile}`);
        }
        const { docs: manifestDocs } = parseManifest(manifestJson);
        candidates.push(...cleanPaths(manifestDocs));
    }
    const absolutePaths = candidates.map((candidate) => path.resolve(candidate));
    return Array.from(new Set(absolutePaths));
}
247
/**
 * Reads a document as UTF-8 text after checking that it is a regular file.
 *
 * @param {string} filePath - Document path.
 * @returns {Promise<string>} The file contents.
 * @throws {CLIError} `NOT_FOUND` when the path does not exist;
 *   `VALIDATION_ERROR` when it cannot be inspected or read, is not a regular
 *   file, or its decoded content contains a NUL character (treated as
 *   binary).
 */
async function ensureReadableTextFile(filePath) {
    let entry;
    try {
        entry = await stat(filePath);
    }
    catch (cause) {
        if (cause.code === 'ENOENT') {
            throw new CLIError(ErrorCode.NOT_FOUND, `Document not found: ${filePath}`);
        }
        throw new CLIError(ErrorCode.VALIDATION_ERROR, `Unable to stat document ${filePath}: ${cause.message}`);
    }
    if (!entry.isFile()) {
        throw new CLIError(ErrorCode.VALIDATION_ERROR, `Path is not a file: ${filePath}`);
    }
    let text;
    try {
        text = await readFile(filePath, 'utf8');
    }
    catch (cause) {
        throw new CLIError(ErrorCode.VALIDATION_ERROR, `Unable to read document ${filePath}: ${cause.message}`);
    }
    const looksBinary = text.includes('\u0000');
    if (looksBinary) {
        throw new CLIError(ErrorCode.VALIDATION_ERROR, `File appears to be binary: ${filePath}`);
    }
    return text;
}
275
/**
 * Splits a document into overlapping line windows for retrieval.
 *
 * Windows are `chunkLines` long and advance by `chunkLines - chunkOverlap`
 * lines (at least one). Windows whose trimmed text is empty are skipped, and
 * chunking stops with the first window that reaches the end of the document.
 *
 * @param {{docPath: string, content: string, chunkLines: number,
 *   chunkOverlap: number}} params - Document and window settings.
 * @returns {Array<object>} Chunks carrying `docPath`, `accession` (parsed
 *   from the path, or `null`), 1-based inclusive `lineStart`/`lineEnd`, the
 *   trimmed `text`, its `tokenCount` and a `termFrequency` map.
 */
function chunkDocument(params) {
    const { docPath, content, chunkLines: windowSize, chunkOverlap } = params;
    const allLines = content.split(/\r?\n/);
    const stride = Math.max(1, windowSize - chunkOverlap);
    const accession = extractAccession(docPath);
    const chunks = [];
    let start = 0;
    while (start < allLines.length) {
        const endExclusive = Math.min(allLines.length, start + windowSize);
        const text = allLines.slice(start, endExclusive).join('\n').trim();
        if (text.length > 0) {
            const tokens = tokenize(text);
            chunks.push({
                docPath,
                accession,
                lineStart: start + 1,
                lineEnd: endExclusive,
                text,
                tokenCount: tokens.length,
                termFrequency: buildTermFrequency(tokens)
            });
        }
        // The window touching the last line is always the final one.
        if (endExclusive >= allLines.length) {
            break;
        }
        start += stride;
    }
    return chunks;
}
307
/**
 * Computes a BM25 relevance score (k1 = 1.2, b = 0.75) for one chunk.
 *
 * Only query terms that occur in the chunk contribute. Length normalization
 * uses the chunk's token count relative to `averageChunkLength`, or 1 when
 * that average is not positive.
 *
 * @param {{queryTerms: string[], chunk: {termFrequency: Map<string, number>,
 *   tokenCount: number}, docFrequencyByTerm: Map<string, number>,
 *   totalChunkCount: number, averageChunkLength: number}} params - Inputs.
 * @returns {number} The summed per-term score (0 when no term matches).
 */
function bm25Score(params) {
    const k1 = 1.2;
    const b = 0.75;
    const { queryTerms, chunk, docFrequencyByTerm, totalChunkCount, averageChunkLength } = params;
    const relativeLength = averageChunkLength > 0 ? chunk.tokenCount / averageChunkLength : 1;
    let total = 0;
    for (const term of queryTerms) {
        const tf = chunk.termFrequency.get(term) ?? 0;
        if (tf === 0) {
            continue;
        }
        const df = docFrequencyByTerm.get(term) ?? 0;
        const idf = Math.log(1 + (totalChunkCount - df + 0.5) / (df + 0.5));
        const denominator = tf + k1 * (1 - b + b * relativeLength);
        total += idf * ((tf * (k1 + 1)) / denominator);
    }
    return total;
}
323
/**
 * Applies ranking heuristics on top of a chunk's base score.
 *
 * - A non-positive base score yields 0.
 * - For queries of three or more terms, a chunk matching fewer than two
 *   terms yields 0.
 * - Term coverage scales the score: x1.25 at full coverage, x1.15 from 70%,
 *   x1.08 from 50%, and x0.8 at 25% or less for queries of three or more
 *   terms.
 * - Each matched bigram adds 8%, capped at 24%.
 * - Chunks that look like cover-page boilerplate are scaled by x0.45.
 *
 * @param {{chunk: object, baseScore: number, queryTerms: string[],
 *   queryBigrams: string[]}} params - Chunk, its base score and the query.
 * @returns {number} The adjusted score.
 */
function adjustedChunkScore(params) {
    const { chunk, baseScore, queryTerms, queryBigrams } = params;
    if (baseScore <= 0) {
        return 0;
    }
    const isLongQuery = queryTerms.length >= 3;
    const matchedTerms = countTermHits(queryTerms, chunk.termFrequency);
    if (isLongQuery && matchedTerms < 2) {
        return 0;
    }
    const coverage = matchedTerms / Math.max(1, queryTerms.length);
    const matchedBigrams = countBigramHits(chunk.text, queryBigrams);
    let coverageFactor = 1;
    if (coverage >= 1) {
        coverageFactor = 1.25;
    }
    else if (coverage >= 0.7) {
        coverageFactor = 1.15;
    }
    else if (coverage >= 0.5) {
        coverageFactor = 1.08;
    }
    else if (isLongQuery && coverage <= 0.25) {
        coverageFactor = 0.8;
    }
    let multiplier = coverageFactor;
    if (matchedBigrams > 0) {
        multiplier *= 1 + Math.min(0.24, matchedBigrams * 0.08);
    }
    if (looksLikeCoverBoilerplate(chunk)) {
        multiplier *= 0.45;
    }
    return baseScore * multiplier;
}
354
/**
 * Tidies text for display: runs of spaces/tabs become a single space, runs
 * of three or more newlines become two, and the result is trimmed.
 *
 * @param {string} value - Text to compact.
 * @returns {string} The compacted text.
 */
function compactWhitespace(value) {
    const singleSpaced = value.replace(/[ \t]+/g, ' ');
    const limitedBlankLines = singleSpaced.replace(/\n{3,}/g, '\n\n');
    return limitedBlankLines.trim();
}
357
/**
 * Shortens text to roughly `maxChars` UTF-16 code units, marking the cut
 * with a trailing `...`.
 *
 * Text that already fits is returned unchanged. Otherwise the first
 * `maxChars - 3` code units are kept, trailing whitespace is removed and
 * `...` is appended. The cut is moved back by one code unit when it would
 * otherwise fall between the two halves of a surrogate pair, so the excerpt
 * never ends in half of an astral character (e.g. an emoji).
 *
 * @param {string} value - Text to shorten.
 * @param {number} maxChars - Maximum length in UTF-16 code units.
 * @returns {string} The original text or its truncated form.
 */
function trimExcerpt(value, maxChars) {
    if (value.length <= maxChars) {
        return value;
    }
    let cutIndex = Math.max(0, maxChars - 3);
    if (cutIndex > 0) {
        const lastKept = value.charCodeAt(cutIndex - 1);
        // 0xD800-0xDBFF are high surrogates: the first half of a pair.
        const endsOnHighSurrogate = lastKept >= 0xd800 && lastKept <= 0xdbff;
        if (endsOnHighSurrogate) {
            cutIndex -= 1;
        }
    }
    return `${value.slice(0, cutIndex).trimEnd()}...`;
}
363
/**
 * Runs the lexical (BM25-based) search over a set of local documents.
 *
 * All documents are read and chunked, every chunk is scored against the
 * query terms, and the `topK` chunks with a positive adjusted score are
 * returned in descending score order.
 *
 * @param {{query: string, docPaths: string[], topK: number,
 *   chunkLines: number, chunkOverlap: number}} params - Search inputs.
 * @returns {Promise<{data: object}>} Result payload with the query, backend
 *   name, per-document stats and ranked excerpts. When no document produces
 *   a chunk, the payload has no `query_terms`/`chunk_count` and an empty
 *   result list.
 * @throws {CLIError} `VALIDATION_ERROR` when the query is blank, when
 *   `chunkOverlap` is not smaller than `chunkLines`, or when the query
 *   yields no terms; errors from reading the documents propagate.
 */
async function runLexicalSearch(params) {
    const { docPaths, topK, chunkLines, chunkOverlap } = params;
    const query = params.query.trim();
    if (query.length === 0) {
        throw new CLIError(ErrorCode.VALIDATION_ERROR, 'Query must not be empty');
    }
    if (chunkOverlap >= chunkLines) {
        throw new CLIError(ErrorCode.VALIDATION_ERROR, '--chunk-overlap must be less than --chunk-lines');
    }
    const loadDocument = async (docPath) => {
        const content = await ensureReadableTextFile(docPath);
        return {
            path: docPath,
            bytes: Buffer.byteLength(content, 'utf8'),
            lineCount: content.split(/\r?\n/).length,
            chunks: chunkDocument({ docPath, content, chunkLines, chunkOverlap })
        };
    };
    const docs = await Promise.all(docPaths.map((docPath) => loadDocument(docPath)));
    const docSummaries = docs.map(({ path: docPath, bytes, lineCount }) => ({
        path: docPath,
        bytes,
        line_count: lineCount
    }));
    const allChunks = docs.flatMap((doc) => doc.chunks);
    if (allChunks.length === 0) {
        return {
            data: {
                query,
                backend: 'lexical',
                docs: docSummaries,
                result_count: 0,
                results: []
            }
        };
    }
    const queryTerms = buildQueryTerms(query);
    if (queryTerms.length === 0) {
        throw new CLIError(ErrorCode.VALIDATION_ERROR, 'Query must contain at least one alphanumeric token');
    }
    const queryBigrams = buildQueryBigrams(queryTerms);
    // Number of chunks containing each query term, for the IDF component.
    const docFrequencyByTerm = new Map();
    for (const term of queryTerms) {
        const containing = allChunks.filter((chunk) => (chunk.termFrequency.get(term) ?? 0) > 0);
        docFrequencyByTerm.set(term, containing.length);
    }
    let totalTokens = 0;
    for (const chunk of allChunks) {
        totalTokens += chunk.tokenCount;
    }
    const averageChunkLength = totalTokens / Math.max(allChunks.length, 1);
    const ranked = [];
    for (const chunk of allChunks) {
        const baseScore = bm25Score({
            queryTerms,
            chunk,
            docFrequencyByTerm,
            totalChunkCount: allChunks.length,
            averageChunkLength
        });
        const score = adjustedChunkScore({ chunk, baseScore, queryTerms, queryBigrams });
        if (score > 0) {
            ranked.push({ chunk, score });
        }
    }
    ranked.sort((left, right) => right.score - left.score);
    const topResults = ranked.slice(0, topK);
    return {
        data: {
            query,
            backend: 'lexical',
            query_terms: queryTerms,
            docs: docSummaries,
            chunk_count: allChunks.length,
            result_count: topResults.length,
            results: topResults.map(({ chunk, score }, position) => ({
                rank: position + 1,
                score: Number(score.toFixed(6)),
                path: chunk.docPath,
                accession: chunk.accession,
                line_start: chunk.lineStart,
                line_end: chunk.lineEnd,
                excerpt: trimExcerpt(compactWhitespace(chunk.text), 1200)
            }))
        }
    };
}
463
/**
 * Builds or refreshes the cached research corpus for a company/profile.
 *
 * The filings selected by the profile's rules are de-duplicated by accession
 * and ordered by filing date, newest first. Each one is fetched as Markdown
 * and written to the cache, unless a cached copy exists and `refresh` is
 * not set. A filing whose fetch fails with `NOT_FOUND` is recorded under
 * `skipped` and left out of the manifest; any other failure aborts the sync.
 * Finally the profile manifest is written.
 *
 * @param {{id: string, profile: string, cacheDir?: string,
 *   refresh?: boolean}} params - Company identifier and sync options.
 * @param {object} context - Command context; `context.secClient` is used to
 *   resolve the entity, and the context is forwarded to the filings
 *   commands.
 * @returns {Promise<{data: object}>} Summary with the cache root, manifest
 *   path, fetched/reused/skipped counts and the document list.
 */
export async function runResearchSync(params, context) {
    const { id, profile, cacheDir, refresh } = params;
    const entity = await resolveEntity(id, context.secClient, { strictMapMatch: false });
    const cacheRoot = resolveCacheRoot(cacheDir);
    // First rule to list an accession wins; later duplicates are ignored.
    const filingsByAccession = new Map();
    for (const { form, queryLimit, recentDays } of PROFILE_RULES[profile]) {
        const { data: rows } = await runFilingsList({
            id: entity.cik,
            form,
            from: recentDays ? dateDaysAgo(recentDays) : undefined,
            queryLimit
        }, context);
        for (const row of rows) {
            if (!filingsByAccession.has(row.accession)) {
                filingsByAccession.set(row.accession, row);
            }
        }
    }
    const newestFirst = (left, right) => (right.filingDate ?? '').localeCompare(left.filingDate ?? '');
    const selectedRows = [...filingsByAccession.values()].sort(newestFirst);
    const docs = [];
    const skipped = [];
    let fetchedCount = 0;
    let reusedCount = 0;
    for (const row of selectedRows) {
        const { accession } = row;
        const docPath = filingDocPath(cacheRoot, entity.cik, accession);
        const isCached = !refresh && (await fileExists(docPath));
        if (isCached) {
            reusedCount += 1;
        }
        else {
            try {
                const { data: filingData } = await runFilingsGet({
                    id: entity.cik,
                    accession,
                    format: 'markdown'
                }, context);
                if (typeof filingData.content !== 'string') {
                    throw new CLIError(ErrorCode.PARSE_ERROR, `Unable to parse markdown content for accession ${accession}`);
                }
                await mkdir(path.dirname(docPath), { recursive: true });
                // Cached documents always end with a newline.
                const markdown = filingData.content.endsWith('\n')
                    ? filingData.content
                    : `${filingData.content}\n`;
                await writeFile(docPath, markdown, 'utf8');
                fetchedCount += 1;
            }
            catch (error) {
                const isMissingFiling = error instanceof CLIError && error.code === ErrorCode.NOT_FOUND;
                if (!isMissingFiling) {
                    throw error;
                }
                skipped.push({ accession, reason: error.message });
                continue;
            }
        }
        docs.push({
            accession,
            form: row.form,
            filing_date: row.filingDate,
            report_date: row.reportDate,
            filing_url: row.filingUrl,
            path: docPath
        });
    }
    const { cik, ticker, title } = entity;
    const { manifestPath } = await writeCachedManifest(cacheRoot, {
        version: 1,
        id_input: id,
        cik,
        ticker,
        title,
        profile,
        synced_at: nowIso(),
        docs
    });
    return {
        data: {
            id,
            cik,
            ticker,
            title,
            profile,
            cache_root: cacheRoot,
            manifest_path: manifestPath,
            docs_count: docs.length,
            fetched_count: fetchedCount,
            reused_count: reusedCount,
            skipped_count: skipped.length,
            skipped,
            docs
        }
    };
}
555
/**
 * Run a lexical search over explicitly supplied documents.
 *
 * The document list is gathered from `params.docs` and/or `params.manifestPath`
 * via loadDocPaths; the search itself is delegated to runLexicalSearch.
 *
 * @param {object} params - Query, document sources and chunking/top-k options.
 * @param {object} context - Unused here; accepted for a uniform command signature.
 * @returns {Promise<object>} Whatever runLexicalSearch resolves to.
 * @throws {CLIError} DOCS_REQUIRED when no document path could be collected.
 */
export async function runResearchAsk(params, context) {
    void context;
    const { docs, manifestPath } = params;
    const docPaths = await loadDocPaths({ docs, manifestPath });
    const hasDocuments = docPaths.length !== 0;
    if (!hasDocuments) {
        throw new CLIError(ErrorCode.DOCS_REQUIRED, 'At least one document is required. Pass --doc <path> and/or --manifest <path>.');
    }
    const searchRequest = {
        query: params.query,
        docPaths,
        topK: params.topK,
        chunkLines: params.chunkLines,
        chunkOverlap: params.chunkOverlap
    };
    return runLexicalSearch(searchRequest);
}
569
/**
 * Run a lexical search against the cached corpus of one entity and profile.
 *
 * The cached manifest is used when present and non-empty. Otherwise (or when
 * `params.refresh` is set) runResearchSync is invoked first and the manifest is
 * read again from disk. The search result is returned together with entity
 * details, cache locations and a summary of the sync work performed.
 *
 * @param {object} params - Entity id, profile, cache options, query and search options.
 * @param {object} context - Command context; `context.secClient` is used to resolve the entity.
 * @returns {Promise<{data: object}>} Search data merged with entity, cache and sync info.
 * @throws {CLIError} DOCS_REQUIRED when no documents are available even after syncing.
 */
export async function runResearchAskById(params, context) {
    const { id, profile, cacheDir, refresh } = params;
    const cacheRoot = resolveCacheRoot(cacheDir);
    const entity = await resolveEntity(id, context.secClient, { strictMapMatch: false });

    // Small local predicates/coercions shared by the steps below.
    const hasDocs = (candidate) => Boolean(candidate) && candidate.docs.length !== 0;
    const asCount = (value) => (typeof value === 'number' ? value : 0);

    let manifest = null;
    if (!refresh) {
        manifest = await readCachedManifest(cacheRoot, entity.cik, profile);
    }

    let syncSummary = null;
    if (!hasDocs(manifest)) {
        const { data: payload } = await runResearchSync({
            id,
            profile,
            cacheDir,
            refresh
        }, context);
        syncSummary = {
            fetched_count: asCount(payload.fetched_count),
            reused_count: asCount(payload.reused_count),
            docs_count: asCount(payload.docs_count),
            skipped_count: asCount(payload.skipped_count)
        };
        // The sync result is not trusted directly; the manifest on disk is the source of truth.
        manifest = await readCachedManifest(cacheRoot, entity.cik, profile);
    }

    if (!hasDocs(manifest)) {
        throw new CLIError(ErrorCode.DOCS_REQUIRED, `No cached documents found for ${params.id} profile ${params.profile}. Run research sync first.`);
    }

    const { data: searchData } = await runLexicalSearch({
        query: params.query,
        docPaths: manifest.docs.map((entry) => entry.path),
        topK: params.topK,
        chunkLines: params.chunkLines,
        chunkOverlap: params.chunkOverlap
    });

    const corpusSize = manifest.docs.length;
    // Without a sync in this call, every document counts as reused from cache.
    const cachedOnlySummary = {
        fetched_count: 0,
        reused_count: corpusSize,
        docs_count: corpusSize,
        skipped_count: 0
    };
    return {
        data: {
            ...searchData,
            id,
            cik: entity.cik,
            ticker: entity.ticker,
            title: entity.title,
            profile,
            cache_root: cacheRoot,
            manifest_path: profileManifestPath(cacheRoot, entity.cik, profile),
            corpus_docs_count: corpusSize,
            sync: syncSummary !== null ? syncSummary : cachedOnlySummary
        }
    };
}
@@ -1,5 +1,6 @@
1
1
  export declare enum ErrorCode {
2
2
  VALIDATION_ERROR = "VALIDATION_ERROR",
3
+ DOCS_REQUIRED = "DOCS_REQUIRED",
3
4
  IDENTITY_REQUIRED = "IDENTITY_REQUIRED",
4
5
  RATE_LIMITED = "RATE_LIMITED",
5
6
  NOT_FOUND = "NOT_FOUND",
@@ -1,6 +1,7 @@
1
1
  export var ErrorCode;
2
2
  (function (ErrorCode) {
3
3
  ErrorCode["VALIDATION_ERROR"] = "VALIDATION_ERROR";
4
+ ErrorCode["DOCS_REQUIRED"] = "DOCS_REQUIRED";
4
5
  ErrorCode["IDENTITY_REQUIRED"] = "IDENTITY_REQUIRED";
5
6
  ErrorCode["RATE_LIMITED"] = "RATE_LIMITED";
6
7
  ErrorCode["NOT_FOUND"] = "NOT_FOUND";
@@ -10,6 +11,7 @@ export var ErrorCode;
10
11
  })(ErrorCode || (ErrorCode = {}));
11
12
  export const EXIT_CODE_MAP = {
12
13
  [ErrorCode.VALIDATION_ERROR]: 2,
14
+ [ErrorCode.DOCS_REQUIRED]: 2,
13
15
  [ErrorCode.IDENTITY_REQUIRED]: 3,
14
16
  [ErrorCode.RATE_LIMITED]: 4,
15
17
  [ErrorCode.NOT_FOUND]: 5,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "edgar-cli",
3
- "version": "0.1.1",
3
+ "version": "0.1.3",
4
4
  "description": "Agent-friendly SEC EDGAR CLI",
5
5
  "license": "MIT",
6
6
  "type": "module",
@@ -8,6 +8,7 @@
8
8
  "access": "public"
9
9
  },
10
10
  "bin": {
11
+ "edgar-cli": "dist/cli.js",
11
12
  "edgar": "dist/cli.js"
12
13
  },
13
14
  "files": [
@@ -41,13 +42,15 @@
41
42
  "filings"
42
43
  ],
43
44
  "dependencies": {
44
- "cheerio": "^1.1.2",
45
+ "@joplin/turndown-plugin-gfm": "^1.0.64",
45
46
  "commander": "^14.0.1",
46
47
  "p-limit": "^7.1.1",
48
+ "turndown": "^7.2.2",
47
49
  "zod": "^4.1.5"
48
50
  },
49
51
  "devDependencies": {
50
52
  "@types/node": "^22.13.9",
53
+ "@types/turndown": "^5.0.6",
51
54
  "@typescript-eslint/eslint-plugin": "^8.44.0",
52
55
  "@typescript-eslint/parser": "^8.44.0",
53
56
  "eslint": "^8.57.1",