npm - edgar-cli - Versions diffs - 0.1.1 → 0.1.3 - Mend

edgar-cli 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/README.md +30 -0
package/dist/cli.js +85 -7
package/dist/commands/filings.d.ts +1 -1
package/dist/commands/filings.js +90 -5
package/dist/commands/research.d.ts +28 -0
package/dist/commands/research.js +623 -0
package/dist/core/errors.d.ts +1 -0
package/dist/core/errors.js +2 -0
package/package.json +5 -2

package/README.md CHANGED

@@ -12,6 +12,8 @@ Agent-friendly SEC EDGAR CLI for filings and company facts.
   - `filings list`
   - `filings get`
   - `facts get`
+  - `research sync`
+  - `research ask`
 ## Install / Run
@@ -57,10 +59,38 @@ npx edgar-cli --user-agent "Your Name your.email@example.com" filings list --id
 # Get filing document URL by accession
 npx edgar-cli --user-agent "Your Name your.email@example.com" filings get --id AAPL --accession 0000320193-26-000006 --format url
+# Get filing converted to Markdown
+npx edgar-cli --user-agent "Your Name your.email@example.com" filings get --id AAPL --accession 0000320193-26-000006 --format markdown
 # Get concept data (latest per unit)
 npx edgar-cli --user-agent "Your Name your.email@example.com" facts get --id AAPL --taxonomy us-gaap --concept Revenues --latest
+# Query explicit local docs (repeat --doc or pass --manifest)
+npx edgar-cli research ask "board resignation details" --doc ./cache/nvda-8k.md --top-k 5
+# Build a deterministic cached corpus for a ticker/profile
+npx edgar-cli --user-agent "Your Name your.email@example.com" research sync --id NVDA --profile core
+# Query by ticker against cached corpus (auto-syncs on cache miss)
+npx edgar-cli --user-agent "Your Name your.email@example.com" research ask "what changed on the board?" --id NVDA --profile core
 ```
+## Research Profiles and Cache
+`research sync` and `research ask --id` use deterministic filing profiles:
+- `core`: latest 1x `10-K`, latest 3x `10-Q`, and recent `8-K` (last 180 days, up to 12)
+- `events`: recent `8-K` (last 365 days, up to 24)
+- `financials`: latest 2x `10-K` and latest 6x `10-Q`
+By default, cached corpora are stored in:
+- `$EDGAR_CACHE_DIR` (if set), else
+- `$XDG_CACHE_HOME/edgar-cli` (if set), else
+- `~/.cache/edgar-cli`
+Override per command with `--cache-dir`.
 ## Output Contract (default)
 All JSON-mode commands emit:

package/dist/cli.js CHANGED

@@ -1,8 +1,10 @@
 #!/usr/bin/env node
+import { realpathSync } from 'node:fs';
 import { Command, CommanderError } from 'commander';
-import { pathToFileURL } from 'node:url';
+import { fileURLToPath } from 'node:url';
 import { runFactsGet } from './commands/facts.js';
 import { runFilingsGet, runFilingsList } from './commands/filings.js';
+import { parseResearchProfile, runResearchAsk, runResearchAskById, runResearchSync } from './commands/research.js';
 import { runResolve } from './commands/resolve.js';
 import { buildRuntimeOptions, parseDateString, parseNonNegativeInt, parsePositiveInt, requireUserAgent } from './core/config.js';
 import { failureEnvelope, successEnvelope } from './core/envelope.js';
@@ -72,7 +74,7 @@ function toCliError(err) {
     }
     return new CLIError(ErrorCode.INTERNAL_ERROR, err.message || 'Unexpected error');
 }
-async function executeCommand(command, commandObj, io, handler) {
+async function executeCommand(command, commandObj, io, handler, options) {
     const globalOptions = commandObj.optsWithGlobals();
     const runtime = buildRuntimeOptions({
         json: globalOptions.json,
@@ -84,7 +86,10 @@ async function executeCommand(command, commandObj, io, handler) {
         userAgent: globalOptions.userAgent
     }, io.env);
     try {
-        const userAgent = requireUserAgent(runtime.userAgent);
+        const requiresSecIdentity = options?.requiresSecIdentity ?? true;
+        const userAgent = requiresSecIdentity
+            ? requireUserAgent(runtime.userAgent)
+            : runtime.userAgent ?? 'edgar-cli local research';
         const secClient = new SecClient({
             userAgent,
             verbose: runtime.verbose,
@@ -164,13 +169,13 @@ export function buildProgram(io) {
         .command('get')
         .requiredOption('--id <id>', 'Ticker or CIK')
         .requiredOption('--accession <accession>', 'Accession number: XXXXXXXXXX-XX-XXXXXX')
-        .option('--format <format>', 'url|html|text', 'url')
+        .option('--format <format>', 'url|html|text|markdown', 'url')
         .action(async function actionFilingsGet(options) {
         const format = options.format;
-        if (!['url', 'html', 'text'].includes(format)) {
+        if (!['url', 'html', 'text', 'markdown'].includes(format)) {
             throw new CLIAbortError(emitError({
                 command: 'filings get',
-                err: new CLIError(ErrorCode.VALIDATION_ERROR, '--format must be one of url|html|text'),
+                err: new CLIError(ErrorCode.VALIDATION_ERROR, '--format must be one of url|html|text|markdown'),
                 runtimeView: 'summary',
                 humanMode: false,
                 io
@@ -209,8 +214,69 @@ export function buildProgram(io) {
             latest: Boolean(options.latest)
         }, context));
     });
+    const research = program
+        .command('research')
+        .description('Run deterministic research workflows over explicit docs or cached filing profiles');
+    research
+        .command('sync')
+        .description('Cache a deterministic research corpus for a company/profile')
+        .requiredOption('--id <id>', 'Ticker or CIK')
+        .option('--profile <profile>', 'core|events|financials', 'core')
+        .option('--cache-dir <path>', 'Override cache directory')
+        .option('--refresh', 'Force refetch even when cached docs exist')
+        .action(async function actionResearchSync(options) {
+        const profile = parseResearchProfile(options.profile);
+        await executeCommand('research sync', this, io, async (context) => runResearchSync({
+            id: options.id,
+            profile,
+            cacheDir: options.cacheDir,
+            refresh: Boolean(options.refresh)
+        }, context), { requiresSecIdentity: true });
+    });
+    research
+        .command('ask')
+        .description('Query explicitly provided local docs, or a cached company profile corpus when --id is used')
+        .argument('<query>', 'Natural language query')
+        .option('--id <id>', 'Ticker or CIK for cached/profile-based research')
+        .option('--profile <profile>', 'core|events|financials (used with --id)', 'core')
+        .option('--cache-dir <path>', 'Override cache directory')
+        .option('--refresh', 'With --id, force refetch of filings before querying')
+        .option('--doc <path>', 'Path to a local document (repeatable)', collectValues, [])
+        .option('--manifest <path>', 'Path to JSON manifest: either ["doc1", ...] or {"docs": ["doc1", ...]}')
+        .option('--top-k <n>', 'Maximum number of chunks to return', '8')
+        .option('--chunk-lines <n>', 'Number of lines per retrieval chunk', '40')
+        .option('--chunk-overlap <n>', 'Line overlap between retrieval chunks', '10')
+        .action(async function actionResearchAsk(query, options) {
+        const topK = parsePositiveInt(options.topK, '--top-k');
+        const chunkLines = parsePositiveInt(options.chunkLines, '--chunk-lines');
+        const chunkOverlap = parseNonNegativeInt(options.chunkOverlap, '--chunk-overlap');
+        const requiresSecIdentity = Boolean(options.id);
+        const profile = parseResearchProfile(options.profile);
+        await executeCommand('research ask', this, io, async (context) => options.id
+            ? runResearchAskById({
+                id: options.id,
+                query,
+                profile,
+                cacheDir: options.cacheDir,
+                refresh: Boolean(options.refresh),
+                topK,
+                chunkLines,
+                chunkOverlap
+            }, context)
+            : runResearchAsk({
+                query,
+                docs: options.doc ?? [],
+                manifestPath: options.manifest,
+                topK,
+                chunkLines,
+                chunkOverlap
+            }, context), { requiresSecIdentity });
+    });
     return program;
 }
+function collectValues(value, previous) {
+    return [...previous, value];
+}
 export async function runCli(argv, io = defaultIo()) {
     const program = buildProgram(io);
     try {
@@ -229,7 +295,19 @@ export async function runCli(argv, io = defaultIo()) {
         return EXIT_CODE_MAP[cliError.code] ?? 10;
     }
 }
-if (import.meta.url === pathToFileURL(process.argv[1] ?? '').href) {
+function isDirectExecution() {
+    const argvPath = process.argv[1];
+    if (!argvPath) {
+        return false;
+    }
+    try {
+        return realpathSync(argvPath) === realpathSync(fileURLToPath(import.meta.url));
+    }
+    catch {
+        return false;
+    }
+}
+if (isDirectExecution()) {
     runCli(process.argv.slice(2)).then((exitCode) => {
         process.exit(exitCode);
     });

package/dist/commands/filings.d.ts CHANGED

@@ -10,5 +10,5 @@ export declare function runFilingsList(params: {
 export declare function runFilingsGet(params: {
     id: string;
     accession: string;
-    format: 'url' | 'html' | 'text';
+    format: 'url' | 'html' | 'text' | 'markdown';
 }, context: CommandContext): Promise<CommandResult>;

package/dist/commands/filings.js CHANGED

@@ -1,8 +1,85 @@
-import * as cheerio from 'cheerio';
+import TurndownService from 'turndown';
+import { gfm } from '@joplin/turndown-plugin-gfm';
 import { CLIError, ErrorCode } from '../core/errors.js';
 import { filingDocumentUrl, submissionsUrl } from '../sec/endpoints.js';
 import { dateInRange, normalizeAccession } from '../sec/normalizers.js';
 import { resolveEntity } from '../sec/ticker-map.js';
+function buildMarkdownConverter() {
+    const service = new TurndownService({
+        headingStyle: 'atx',
+        hr: '---',
+        bulletListMarker: '-',
+        codeBlockStyle: 'fenced',
+        fence: '```',
+        emDelimiter: '*',
+        strongDelimiter: '**',
+        linkStyle: 'inlined'
+    });
+    service.use(gfm);
+    service.remove(['script', 'style', 'noscript', 'iframe', 'canvas']);
+    return service;
+}
+const markdownConverter = buildMarkdownConverter();
+function stripInlineXbrlHeaders(content) {
+    return content
+        .replace(/<ix:header[\s\S]*?<\/ix:header>/gi, '')
+        .replace(/<ix:hidden[\s\S]*?<\/ix:hidden>/gi, '')
+        .replace(/<ix:resources[\s\S]*?<\/ix:resources>/gi, '');
+}
+function splitMarkdownTableCells(line) {
+    const trimmed = line.trim();
+    const withoutLeadingPipe = trimmed.startsWith('|') ? trimmed.slice(1) : trimmed;
+    const withoutTrailingPipe = withoutLeadingPipe.endsWith('|')
+        ? withoutLeadingPipe.slice(0, -1)
+        : withoutLeadingPipe;
+    return withoutTrailingPipe.split('|').map((cell) => cell.trim());
+}
+function isMarkdownTableSeparatorLine(line) {
+    const cells = splitMarkdownTableCells(line);
+    if (cells.length === 0) {
+        return false;
+    }
+    return cells.every((cell) => /^:?-{3,}:?$/.test(cell.replace(/\s+/g, '')));
+}
+function collapseLayoutTables(markdown) {
+    const lines = markdown.split('\n');
+    const output = [];
+    for (let idx = 0; idx < lines.length; idx += 1) {
+        const line = lines[idx];
+        if (!line.trimStart().startsWith('|')) {
+            output.push(line);
+            continue;
+        }
+        const tableBlock = [line];
+        while (idx + 1 < lines.length && lines[idx + 1].trimStart().startsWith('|')) {
+            idx += 1;
+            tableBlock.push(lines[idx]);
+        }
+        const hasSeparator = tableBlock.some(isMarkdownTableSeparatorLine);
+        if (!hasSeparator) {
+            output.push(...tableBlock);
+            continue;
+        }
+        const dataRows = tableBlock.filter((row) => !isMarkdownTableSeparatorLine(row));
+        const nonEmptyCellCounts = dataRows.map((row) => splitMarkdownTableCells(row).filter((cell) => cell.length > 0).length);
+        const maxNonEmptyCells = Math.max(...nonEmptyCellCounts, 0);
+        const avgNonEmptyCells = nonEmptyCellCounts.reduce((sum, count) => sum + count, 0) /
+            Math.max(nonEmptyCellCounts.length, 1);
+        const isLayoutTable = maxNonEmptyCells <= 1 || avgNonEmptyCells <= 1.2;
+        if (!isLayoutTable) {
+            output.push(...tableBlock);
+            continue;
+        }
+        const flattenedRows = dataRows
+            .map((row) => splitMarkdownTableCells(row).filter((cell) => cell.length > 0).join(' '))
+            .map((row) => row.replace(/\s+/g, ' ').trim())
+            .filter((row) => row.length > 0);
+        if (flattenedRows.length > 0) {
+            output.push(...flattenedRows, '');
+        }
+    }
+    return output.join('\n').replace(/\n{3,}/g, '\n\n').trim();
+}
 function zipRecentFilings(cik, recent) {
     if (!recent) {
         return [];
@@ -38,9 +115,17 @@ function zipRecentFilings(cik, recent) {
     }
     return rows;
 }
-function extractTextFromHtml(content) {
-    const $ = cheerio.load(content);
-    return $.text().replace(/\s+/g, ' ').trim();
+function extractMarkdownFromHtml(content) {
+    const sanitizedHtml = stripInlineXbrlHeaders(content);
+    const markdown = markdownConverter
+        .turndown(sanitizedHtml)
+        .replace(/\u00a0/g, ' ')
+        .replace(/\r/g, '')
+        .replace(/[ \t]+\n/g, '\n')
+        .replace(/\n[ \t]+/g, '\n')
+        .replace(/\n{3,}/g, '\n\n')
+        .trim();
+    return collapseLayoutTables(markdown);
 }
 export async function runFilingsList(params, context) {
     const entity = await resolveEntity(params.id, context.secClient, { strictMapMatch: false });
@@ -107,7 +192,7 @@ export async function runFilingsGet(params, context) {
         data: {
             accession: match.accession,
             url: match.filingUrl,
-            content: extractTextFromHtml(content)
+            content: extractMarkdownFromHtml(content)
         }
     };
 }

package/dist/commands/research.d.ts ADDED

@@ -0,0 +1,28 @@
+import { CommandContext, CommandResult } from '../core/runtime.js';
+type ResearchProfile = 'core' | 'events' | 'financials';
+export declare function parseResearchProfile(value: string): ResearchProfile;
+export declare function runResearchSync(params: {
+    id: string;
+    profile: ResearchProfile;
+    cacheDir?: string;
+    refresh?: boolean;
+}, context: CommandContext): Promise<CommandResult>;
+export declare function runResearchAsk(params: {
+    query: string;
+    docs: string[];
+    manifestPath?: string;
+    topK: number;
+    chunkLines: number;
+    chunkOverlap: number;
+}, context: CommandContext): Promise<CommandResult>;
+export declare function runResearchAskById(params: {
+    id: string;
+    query: string;
+    profile: ResearchProfile;
+    cacheDir?: string;
+    refresh?: boolean;
+    topK: number;
+    chunkLines: number;
+    chunkOverlap: number;
+}, context: CommandContext): Promise<CommandResult>;
+export {};

package/dist/commands/research.js ADDED

@@ -0,0 +1,623 @@
+import { mkdir, readFile, stat, writeFile } from 'node:fs/promises';
+import os from 'node:os';
+import path from 'node:path';
+import { runFilingsGet, runFilingsList } from './filings.js';
+import { CLIError, ErrorCode } from '../core/errors.js';
+import { resolveEntity } from '../sec/ticker-map.js';
+const PROFILE_RULES = {
+    core: [
+        { form: '10-K', queryLimit: 1 },
+        { form: '10-Q', queryLimit: 3 },
+        { form: '8-K', queryLimit: 12, recentDays: 180 }
+    ],
+    events: [{ form: '8-K', queryLimit: 24, recentDays: 365 }],
+    financials: [
+        { form: '10-K', queryLimit: 2 },
+        { form: '10-Q', queryLimit: 6 }
+    ]
+};
+function nowIso() {
+    return new Date().toISOString().replace(/\.\d{3}Z$/, 'Z');
+}
+function formatDateUtc(date) {
+    return date.toISOString().slice(0, 10);
+}
+function dateDaysAgo(days) {
+    const date = new Date();
+    date.setUTCDate(date.getUTCDate() - days);
+    return formatDateUtc(date);
+}
+function defaultCacheRoot() {
+    if (process.env.EDGAR_CACHE_DIR && process.env.EDGAR_CACHE_DIR.trim().length > 0) {
+        return path.resolve(process.env.EDGAR_CACHE_DIR);
+    }
+    if (process.env.XDG_CACHE_HOME && process.env.XDG_CACHE_HOME.trim().length > 0) {
+        return path.resolve(process.env.XDG_CACHE_HOME, 'edgar-cli');
+    }
+    return path.resolve(os.homedir(), '.cache', 'edgar-cli');
+}
+function resolveCacheRoot(cacheDir) {
+    if (cacheDir && cacheDir.trim().length > 0) {
+        return path.resolve(cacheDir);
+    }
+    return defaultCacheRoot();
+}
+function companyCacheDir(cacheRoot, cik) {
+    return path.join(cacheRoot, 'research', 'companies', cik);
+}
+function profileManifestPath(cacheRoot, cik, profile) {
+    return path.join(companyCacheDir(cacheRoot, cik), 'profiles', `${profile}.json`);
+}
+function filingDocPath(cacheRoot, cik, accession) {
+    return path.join(companyCacheDir(cacheRoot, cik), 'filings', `${accession}.md`);
+}
+function parseCachedManifest(value) {
+    if (!value || typeof value !== 'object') {
+        throw new CLIError(ErrorCode.PARSE_ERROR, 'Cached manifest is malformed');
+    }
+    const manifest = value;
+    if (manifest.version !== 1 ||
+        typeof manifest.cik !== 'string' ||
+        !Array.isArray(manifest.docs) ||
+        !manifest.docs.every((doc) => doc && typeof doc.path === 'string' && typeof doc.accession === 'string')) {
+        throw new CLIError(ErrorCode.PARSE_ERROR, 'Cached manifest is malformed');
+    }
+    return manifest;
+}
+async function readCachedManifest(cacheRoot, cik, profile) {
+    const manifestPath = profileManifestPath(cacheRoot, cik, profile);
+    let raw;
+    try {
+        raw = await readFile(manifestPath, 'utf8');
+    }
+    catch (error) {
+        const err = error;
+        if (err.code === 'ENOENT') {
+            return null;
+        }
+        throw new CLIError(ErrorCode.VALIDATION_ERROR, `Unable to read cached manifest ${manifestPath}: ${err.message}`);
+    }
+    let parsed;
+    try {
+        parsed = JSON.parse(raw);
+    }
+    catch {
+        throw new CLIError(ErrorCode.PARSE_ERROR, `Cached manifest is not valid JSON: ${manifestPath}`);
+    }
+    return parseCachedManifest(parsed);
+}
+async function writeCachedManifest(cacheRoot, manifest) {
+    const manifestPath = profileManifestPath(cacheRoot, manifest.cik, manifest.profile);
+    await mkdir(path.dirname(manifestPath), { recursive: true });
+    await writeFile(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`, 'utf8');
+    return { manifestPath };
+}
+async function fileExists(filePath) {
+    try {
+        const fileStat = await stat(filePath);
+        return fileStat.isFile();
+    }
+    catch (error) {
+        const err = error;
+        if (err.code === 'ENOENT') {
+            return false;
+        }
+        throw new CLIError(ErrorCode.VALIDATION_ERROR, `Unable to stat ${filePath}: ${err.message}`);
+    }
+}
+export function parseResearchProfile(value) {
+    const normalized = value.trim().toLowerCase();
+    if (normalized === 'core' || normalized === 'events' || normalized === 'financials') {
+        return normalized;
+    }
+    throw new CLIError(ErrorCode.VALIDATION_ERROR, '--profile must be one of core|events|financials');
+}
+function tokenize(value) {
+    return (value.toLowerCase().match(/[a-z0-9]+/g) ?? []).filter((token) => token.length >= 2);
+}
+const QUERY_STOPWORDS = new Set([
+    'a',
+    'an',
+    'and',
+    'are',
+    'as',
+    'at',
+    'be',
+    'by',
+    'for',
+    'from',
+    'how',
+    'in',
+    'into',
+    'is',
+    'it',
+    'its',
+    'of',
+    'on',
+    'or',
+    'that',
+    'the',
+    'their',
+    'there',
+    'these',
+    'they',
+    'this',
+    'to',
+    'was',
+    'were',
+    'what',
+    'when',
+    'where',
+    'which',
+    'who',
+    'why',
+    'with'
+]);
+const COVER_BOILERPLATE_PATTERNS = [
+    /securities registered pursuant to section 12\(b\)/i,
+    /indicate by check mark/i,
+    /commission file number/i,
+    /for the quarterly period ended/i,
+    /for the fiscal year ended/i
+];
+function uniqueTokens(tokens) {
+    return [...new Set(tokens)];
+}
+function buildQueryTerms(query) {
+    const rawTokens = tokenize(query);
+    const filtered = rawTokens.filter((token) => !QUERY_STOPWORDS.has(token));
+    const terms = filtered.length > 0 ? filtered : rawTokens;
+    return uniqueTokens(terms);
+}
+function buildQueryBigrams(queryTerms) {
+    const bigrams = [];
+    for (let idx = 0; idx < queryTerms.length - 1; idx += 1) {
+        bigrams.push(`${queryTerms[idx]} ${queryTerms[idx + 1]}`);
+    }
+    return uniqueTokens(bigrams);
+}
+function countTermHits(queryTerms, termFrequency) {
+    return queryTerms.reduce((hits, term) => hits + ((termFrequency.get(term) ?? 0) > 0 ? 1 : 0), 0);
+}
+function countBigramHits(chunkText, queryBigrams) {
+    if (queryBigrams.length === 0) {
+        return 0;
+    }
+    const text = chunkText.toLowerCase();
+    return queryBigrams.reduce((hits, bigram) => hits + (text.includes(bigram) ? 1 : 0), 0);
+}
+function looksLikeCoverBoilerplate(chunk) {
+    if (chunk.lineStart > 140) {
+        return false;
+    }
+    return COVER_BOILERPLATE_PATTERNS.some((pattern) => pattern.test(chunk.text));
+}
+function buildTermFrequency(tokens) {
+    const frequency = new Map();
+    for (const token of tokens) {
+        frequency.set(token, (frequency.get(token) ?? 0) + 1);
+    }
+    return frequency;
+}
+function extractAccession(docPath) {
+    const match = docPath.match(/\d{10}-\d{2}-\d{6}/);
+    return match?.[0] ?? null;
+}
+function parseManifest(value) {
+    if (Array.isArray(value) && value.every((entry) => typeof entry === 'string')) {
+        return { docs: value };
+    }
+    if (value &&
+        typeof value === 'object' &&
+        Array.isArray(value.docs) &&
+        value.docs.every((entry) => typeof entry === 'string')) {
+        return { docs: value.docs };
+    }
+    throw new CLIError(ErrorCode.VALIDATION_ERROR, 'Manifest must be a JSON array of strings or object with a docs string array');
+}
+async function loadDocPaths(params) {
+    const fromOptions = params.docs.map((docPath) => docPath.trim()).filter((docPath) => docPath.length > 0);
+    const fromManifest = [];
+    if (params.manifestPath) {
+        const resolvedManifestPath = path.resolve(params.manifestPath);
+        let manifestRaw;
+        try {
+            manifestRaw = await readFile(resolvedManifestPath, 'utf8');
+        }
+        catch (error) {
+            const err = error;
+            if (err.code === 'ENOENT') {
+                throw new CLIError(ErrorCode.NOT_FOUND, `Manifest not found: ${resolvedManifestPath}`);
+            }
+            throw new CLIError(ErrorCode.VALIDATION_ERROR, `Unable to read manifest ${resolvedManifestPath}: ${err.message}`);
+        }
+        let manifestJson;
+        try {
+            manifestJson = JSON.parse(manifestRaw);
+        }
+        catch {
+            throw new CLIError(ErrorCode.VALIDATION_ERROR, `Manifest is not valid JSON: ${resolvedManifestPath}`);
+        }
+        const parsed = parseManifest(manifestJson);
+        fromManifest.push(...parsed.docs.map((docPath) => docPath.trim()).filter((docPath) => docPath.length > 0));
+    }
+    const resolved = [...fromOptions, ...fromManifest].map((docPath) => path.resolve(docPath));
+    return [...new Set(resolved)];
+}
+async function ensureReadableTextFile(filePath) {
+    let fileStat;
+    try {
+        fileStat = await stat(filePath);
+    }
+    catch (error) {
+        const err = error;
+        if (err.code === 'ENOENT') {
+            throw new CLIError(ErrorCode.NOT_FOUND, `Document not found: ${filePath}`);
+        }
+        throw new CLIError(ErrorCode.VALIDATION_ERROR, `Unable to stat document ${filePath}: ${err.message}`);
+    }
+    if (!fileStat.isFile()) {
+        throw new CLIError(ErrorCode.VALIDATION_ERROR, `Path is not a file: ${filePath}`);
+    }
+    let content;
+    try {
+        content = await readFile(filePath, 'utf8');
+    }
+    catch (error) {
+        const err = error;
+        throw new CLIError(ErrorCode.VALIDATION_ERROR, `Unable to read document ${filePath}: ${err.message}`);
+    }
+    if (content.includes('\u0000')) {
+        throw new CLIError(ErrorCode.VALIDATION_ERROR, `File appears to be binary: ${filePath}`);
+    }
+    return content;
+}
+function chunkDocument(params) {
+    const lines = params.content.split(/\r?\n/);
+    const step = Math.max(1, params.chunkLines - params.chunkOverlap);
+    const chunks = [];
+    const accession = extractAccession(params.docPath);
+    for (let lineIdx = 0; lineIdx < lines.length; lineIdx += step) {
+        const start = lineIdx;
+        const endExclusive = Math.min(lines.length, start + params.chunkLines);
+        const chunkLines = lines.slice(start, endExclusive);
+        const text = chunkLines.join('\n').trim();
+        if (text.length === 0) {
+            if (endExclusive >= lines.length) {
+                break;
+            }
+            continue;
+        }
+        const tokens = tokenize(text);
+        chunks.push({
+            docPath: params.docPath,
+            accession,
+            lineStart: start + 1,
+            lineEnd: endExclusive,
+            text,
+            tokenCount: tokens.length,
+            termFrequency: buildTermFrequency(tokens)
+        });
+        if (endExclusive >= lines.length) {
+            break;
+        }
+    }
+    return chunks;
+}
+function bm25Score(params) {
+    const k1 = 1.2;
+    const b = 0.75;
+    return params.queryTerms.reduce((score, term) => {
+        const tf = params.chunk.termFrequency.get(term) ?? 0;
+        if (tf === 0) {
+            return score;
+        }
+        const df = params.docFrequencyByTerm.get(term) ?? 0;
+        const idf = Math.log(1 + (params.totalChunkCount - df + 0.5) / (df + 0.5));
+        const normalizedLength = params.averageChunkLength > 0 ? params.chunk.tokenCount / params.averageChunkLength : 1;
+        const denominator = tf + k1 * (1 - b + b * normalizedLength);
+        const termScore = idf * ((tf * (k1 + 1)) / denominator);
+        return score + termScore;
+    }, 0);
+}
+function adjustedChunkScore(params) {
+    if (params.baseScore <= 0) {
+        return 0;
+    }
+    const termHits = countTermHits(params.queryTerms, params.chunk.termFrequency);
+    if (params.queryTerms.length >= 3 && termHits < 2) {
+        return 0;
+    }
+    const coverage = termHits / Math.max(1, params.queryTerms.length);
+    const bigramHits = countBigramHits(params.chunk.text, params.queryBigrams);
+    let multiplier = 1;
+    if (coverage >= 1) {
+        multiplier *= 1.25;
+    }
+    else if (coverage >= 0.7) {
+        multiplier *= 1.15;
+    }
+    else if (coverage >= 0.5) {
+        multiplier *= 1.08;
+    }
+    else if (params.queryTerms.length >= 3 && coverage <= 0.25) {
+        multiplier *= 0.8;
+    }
+    if (bigramHits > 0) {
+        multiplier *= 1 + Math.min(0.24, bigramHits * 0.08);
+    }
+    if (looksLikeCoverBoilerplate(params.chunk)) {
+        multiplier *= 0.45;
+    }
+    return params.baseScore * multiplier;
+}
+function compactWhitespace(value) {
+    return value.replace(/[ \t]+/g, ' ').replace(/\n{3,}/g, '\n\n').trim();
+}
+function trimExcerpt(value, maxChars) {
+    if (value.length <= maxChars) {
+        return value;
+    }
+    return `${value.slice(0, Math.max(0, maxChars - 3)).trimEnd()}...`;
+}
+async function runLexicalSearch(params) {
+    const query = params.query.trim();
+    if (query.length === 0) {
+        throw new CLIError(ErrorCode.VALIDATION_ERROR, 'Query must not be empty');
+    }
+    if (params.chunkOverlap >= params.chunkLines) {
+        throw new CLIError(ErrorCode.VALIDATION_ERROR, '--chunk-overlap must be less than --chunk-lines');
+    }
+    const docs = await Promise.all(params.docPaths.map(async (docPath) => {
+        const content = await ensureReadableTextFile(docPath);
+        return {
+            path: docPath,
+            bytes: Buffer.byteLength(content, 'utf8'),
+            lineCount: content.split(/\r?\n/).length,
+            chunks: chunkDocument({
+                docPath,
+                content,
+                chunkLines: params.chunkLines,
+                chunkOverlap: params.chunkOverlap
+            })
+        };
+    }));
+    const allChunks = docs.flatMap((doc) => doc.chunks);
+    if (allChunks.length === 0) {
+        return {
+            data: {
+                query,
+                backend: 'lexical',
+                docs: docs.map((doc) => ({
+                    path: doc.path,
+                    bytes: doc.bytes,
+                    line_count: doc.lineCount
+                })),
+                result_count: 0,
+                results: []
+            }
+        };
+    }
+    const queryTerms = buildQueryTerms(query);
+    if (queryTerms.length === 0) {
+        throw new CLIError(ErrorCode.VALIDATION_ERROR, 'Query must contain at least one alphanumeric token');
+    }
+    const queryBigrams = buildQueryBigrams(queryTerms);
+    const docFrequencyByTerm = new Map();
+    for (const term of queryTerms) {
+        let count = 0;
+        for (const chunk of allChunks) {
+            if ((chunk.termFrequency.get(term) ?? 0) > 0) {
+                count += 1;
+            }
+        }
+        docFrequencyByTerm.set(term, count);
+    }
+    const averageChunkLength = allChunks.reduce((sum, chunk) => sum + chunk.tokenCount, 0) / Math.max(allChunks.length, 1);
+    const scored = allChunks
+        .map((chunk) => {
+        const baseScore = bm25Score({
+            queryTerms,
+            chunk,
+            docFrequencyByTerm,
+            totalChunkCount: allChunks.length,
+            averageChunkLength
+        });
+        return {
+            chunk,
+            score: adjustedChunkScore({
+                chunk,
+                baseScore,
+                queryTerms,
+                queryBigrams
+            })
+        };
+    })
+        .filter((item) => item.score > 0)
+        .sort((a, b) => b.score - a.score)
+        .slice(0, params.topK);
+    return {
+        data: {
+            query,
+            backend: 'lexical',
+            query_terms: queryTerms,
+            docs: docs.map((doc) => ({
+                path: doc.path,
+                bytes: doc.bytes,
+                line_count: doc.lineCount
+            })),
+            chunk_count: allChunks.length,
+            result_count: scored.length,
+            results: scored.map((item, idx) => ({
+                rank: idx + 1,
+                score: Number(item.score.toFixed(6)),
+                path: item.chunk.docPath,
+                accession: item.chunk.accession,
+                line_start: item.chunk.lineStart,
+                line_end: item.chunk.lineEnd,
+                excerpt: trimExcerpt(compactWhitespace(item.chunk.text), 1200)
+            }))
+        }
+    };
+}
+export async function runResearchSync(params, context) {
+    const entity = await resolveEntity(params.id, context.secClient, { strictMapMatch: false });
+    const cacheRoot = resolveCacheRoot(params.cacheDir);
+    const rules = PROFILE_RULES[params.profile];
+    const selectedByAccession = new Map();
+    for (const rule of rules) {
+        const listResult = await runFilingsList({
+            id: entity.cik,
+            form: rule.form,
+            from: rule.recentDays ? dateDaysAgo(rule.recentDays) : undefined,
+            queryLimit: rule.queryLimit
+        }, context);
+        const rows = listResult.data;
+        for (const row of rows) {
+            if (!selectedByAccession.has(row.accession)) {
+                selectedByAccession.set(row.accession, row);
+            }
+        }
+    }
+    const selectedRows = [...selectedByAccession.values()].sort((a, b) => (b.filingDate ?? '').localeCompare(a.filingDate ?? ''));
+    const docs = [];
+    const skipped = [];
+    let fetchedCount = 0;
+    let reusedCount = 0;
+    for (const row of selectedRows) {
+        const docPath = filingDocPath(cacheRoot, entity.cik, row.accession);
+        const shouldUseCache = !params.refresh && (await fileExists(docPath));
+        if (!shouldUseCache) {
+            try {
+                const filingResult = await runFilingsGet({
+                    id: entity.cik,
+                    accession: row.accession,
+                    format: 'markdown'
+                }, context);
+                const filingData = filingResult.data;
+                if (typeof filingData.content !== 'string') {
+                    throw new CLIError(ErrorCode.PARSE_ERROR, `Unable to parse markdown content for accession ${row.accession}`);
+                }
+                await mkdir(path.dirname(docPath), { recursive: true });
+                const content = filingData.content.endsWith('\n') ? filingData.content : `${filingData.content}\n`;
+                await writeFile(docPath, content, 'utf8');
+                fetchedCount += 1;
+            }
+            catch (error) {
+                if (error instanceof CLIError && error.code === ErrorCode.NOT_FOUND) {
+                    skipped.push({ accession: row.accession, reason: error.message });
+                    continue;
+                }
+                throw error;
+            }
+        }
+        else {
+            reusedCount += 1;
+        }
+        docs.push({
+            accession: row.accession,
+            form: row.form,
+            filing_date: row.filingDate,
+            report_date: row.reportDate,
+            filing_url: row.filingUrl,
+            path: docPath
+        });
+    }
+    const manifest = {
+        version: 1,
+        id_input: params.id,
+        cik: entity.cik,
+        ticker: entity.ticker,
+        title: entity.title,
+        profile: params.profile,
+        synced_at: nowIso(),
+        docs
+    };
+    const { manifestPath } = await writeCachedManifest(cacheRoot, manifest);
+    return {
+        data: {
+            id: params.id,
+            cik: entity.cik,
+            ticker: entity.ticker,
+            title: entity.title,
+            profile: params.profile,
+            cache_root: cacheRoot,
+            manifest_path: manifestPath,
+            docs_count: docs.length,
+            fetched_count: fetchedCount,
+            reused_count: reusedCount,
+            skipped_count: skipped.length,
+            skipped,
+            docs
+        }
+    };
+}
+export async function runResearchAsk(params, context) {
+    void context;
+    const docPaths = await loadDocPaths({ docs: params.docs, manifestPath: params.manifestPath });
+    if (docPaths.length === 0) {
+        throw new CLIError(ErrorCode.DOCS_REQUIRED, 'At least one document is required. Pass --doc <path> and/or --manifest <path>.');
+    }
+    return runLexicalSearch({
+        query: params.query,
+        docPaths,
+        topK: params.topK,
+        chunkLines: params.chunkLines,
+        chunkOverlap: params.chunkOverlap
+    });
+}
+export async function runResearchAskById(params, context) {
+    const cacheRoot = resolveCacheRoot(params.cacheDir);
+    const entity = await resolveEntity(params.id, context.secClient, { strictMapMatch: false });
+    let manifest = !params.refresh
+        ? await readCachedManifest(cacheRoot, entity.cik, params.profile)
+        : null;
+    let syncData = null;
+    if (!manifest || manifest.docs.length === 0) {
+        const syncResult = await runResearchSync({
+            id: params.id,
+            profile: params.profile,
+            cacheDir: params.cacheDir,
+            refresh: params.refresh
+        }, context);
+        const syncPayload = syncResult.data;
+        syncData = {
+            fetched_count: typeof syncPayload.fetched_count === 'number' ? syncPayload.fetched_count : 0,
+            reused_count: typeof syncPayload.reused_count === 'number' ? syncPayload.reused_count : 0,
+            docs_count: typeof syncPayload.docs_count === 'number' ? syncPayload.docs_count : 0,
+            skipped_count: typeof syncPayload.skipped_count === 'number' ? syncPayload.skipped_count : 0
+        };
+        manifest = await readCachedManifest(cacheRoot, entity.cik, params.profile);
+    }
+    if (!manifest || manifest.docs.length === 0) {
+        throw new CLIError(ErrorCode.DOCS_REQUIRED, `No cached documents found for ${params.id} profile ${params.profile}. Run research sync first.`);
+    }
+    const docPaths = manifest.docs.map((doc) => doc.path);
+    const searchResult = await runLexicalSearch({
+        query: params.query,
+        docPaths,
+        topK: params.topK,
+        chunkLines: params.chunkLines,
+        chunkOverlap: params.chunkOverlap
+    });
+    const searchData = searchResult.data;
+    return {
+        data: {
+            ...searchData,
+            id: params.id,
+            cik: entity.cik,
+            ticker: entity.ticker,
+            title: entity.title,
+            profile: params.profile,
+            cache_root: cacheRoot,
+            manifest_path: profileManifestPath(cacheRoot, entity.cik, params.profile),
+            corpus_docs_count: manifest.docs.length,
+            sync: syncData ?? {
+                fetched_count: 0,
+                reused_count: manifest.docs.length,
+                docs_count: manifest.docs.length,
+                skipped_count: 0
+            }
+        }
+    };
+}

package/dist/core/errors.d.ts CHANGED Viewed

@@ -1,5 +1,6 @@
 export declare enum ErrorCode {
     VALIDATION_ERROR = "VALIDATION_ERROR",
+    DOCS_REQUIRED = "DOCS_REQUIRED",
     IDENTITY_REQUIRED = "IDENTITY_REQUIRED",
     RATE_LIMITED = "RATE_LIMITED",
     NOT_FOUND = "NOT_FOUND",

package/dist/core/errors.js CHANGED Viewed

@@ -1,6 +1,7 @@
 export var ErrorCode;
 (function (ErrorCode) {
     ErrorCode["VALIDATION_ERROR"] = "VALIDATION_ERROR";
+    ErrorCode["DOCS_REQUIRED"] = "DOCS_REQUIRED";
     ErrorCode["IDENTITY_REQUIRED"] = "IDENTITY_REQUIRED";
     ErrorCode["RATE_LIMITED"] = "RATE_LIMITED";
     ErrorCode["NOT_FOUND"] = "NOT_FOUND";
@@ -10,6 +11,7 @@ export var ErrorCode;
 })(ErrorCode || (ErrorCode = {}));
 export const EXIT_CODE_MAP = {
     [ErrorCode.VALIDATION_ERROR]: 2,
+    [ErrorCode.DOCS_REQUIRED]: 2,
     [ErrorCode.IDENTITY_REQUIRED]: 3,
     [ErrorCode.RATE_LIMITED]: 4,
     [ErrorCode.NOT_FOUND]: 5,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "edgar-cli",
-  "version": "0.1.1",
+  "version": "0.1.3",
   "description": "Agent-friendly SEC EDGAR CLI",
   "license": "MIT",
   "type": "module",
@@ -8,6 +8,7 @@
     "access": "public"
   },
   "bin": {
+    "edgar-cli": "dist/cli.js",
     "edgar": "dist/cli.js"
   },
   "files": [
@@ -41,13 +42,15 @@
     "filings"
   ],
   "dependencies": {
-    "cheerio": "^1.1.2",
+    "@joplin/turndown-plugin-gfm": "^1.0.64",
     "commander": "^14.0.1",
     "p-limit": "^7.1.1",
+    "turndown": "^7.2.2",
     "zod": "^4.1.5"
   },
   "devDependencies": {
     "@types/node": "^22.13.9",
+    "@types/turndown": "^5.0.6",
     "@typescript-eslint/eslint-plugin": "^8.44.0",
     "@typescript-eslint/parser": "^8.44.0",
     "eslint": "^8.57.1",