markshift 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,48 @@
1
+ # markshift
2
+
3
+ Smart format router and transformer for AI context files.
4
+
5
+ Converts between Markdown, HTML, and plain text based on *where the content is going*:
6
+ - Human-facing deliverables → HTML
7
+ - Agent loops and git-bound files → Markdown
8
+ - Terminal output → plain text (via the explicit `to-text` command; `route` never auto-selects plain text)
9
+
10
+ Includes a token audit command and a watch mode.
11
+
12
+ ## Install
13
+
14
+ ```bash
15
+ npm install -g markshift
16
+ ```
17
+
18
+ ## Commands
19
+
20
+ ```bash
21
+ markshift route <path> # Auto-detect context type and transform (file or directory; add --recursive for subdirectories)
22
+ markshift to-html <file> # Markdown → HTML
23
+ markshift to-html --optimize-tokens <file> # Markdown → HTML (token-optimized)
24
+ markshift to-md <file> # HTML → Markdown
25
+ markshift to-text <file> # HTML/MD → plain text
26
+ markshift audit <file> # Token count comparison across formats
27
+ markshift watch <dir> # Watch directory, auto-route on save
28
+ ```
29
+
30
+ ## Token Audit Example
31
+
32
+ ```
33
+ $ markshift audit context.md
34
+
35
+ ┌─────────────────┬────────────┬──────────┐
36
+ │ Format │ Tokens │ Delta │
37
+ ├─────────────────┼────────────┼──────────┤
38
+ │ Original │ 1,240 │ — │
39
+ │ → HTML │ 1,890 │ +650 │
40
+ │ → HTML (opt) │ 1,420 │ +180 │
41
+ │ → Markdown │ 980 │ -260 │
42
+ │ → Plain text │ 740 │ -500 │
43
+ └─────────────────┴────────────┴──────────┘
44
+ ```
45
+
46
+ ## License
47
+
48
+ MIT
@@ -0,0 +1,95 @@
1
#!/usr/bin/env node

// CLI entry point — wires commander.js commands to src/ handlers.
// Each command imports its handler lazily to keep startup fast.

import { program } from 'commander';
import { readFileSync } from 'fs';
import { fileURLToPath } from 'url';
import { dirname, join } from 'path';

const __dirname = dirname(fileURLToPath(import.meta.url));
const pkg = JSON.parse(readFileSync(join(__dirname, '../package.json'), 'utf8'));

program
  .name('markshift')
  .description('Smart format router and transformer for AI context files')
  .version(pkg.version);

// markshift route <path>
// Auto-detect context type (human/agent) and transform accordingly.
// <path> may be a single file or a directory.
program
  .command('route <path>')
  .description('Auto-detect context type and transform to best format. Accepts file or directory.')
  .option('--recursive', 'Process all files in directory recursively')
  .action(async (target, opts) => {
    const { stat } = await import('node:fs/promises');
    const { route, routeDir } = await import('../src/router.js');
    let info;
    try {
      info = await stat(target);
    } catch (err) {
      process.stderr.write(`[markshift] error: ${target}: ${err.message}\n`);
      process.exit(1);
    }
    if (info.isDirectory()) {
      await routeDir(target, opts);
    } else {
      await route(target);
    }
  });

// markshift to-html <file>
// Convert Markdown → structured HTML
// --optimize-tokens: strip redundant markup, extract inline CSS
program
  .command('to-html <file>')
  .description('Convert Markdown to structured HTML')
  .option('--optimize-tokens', 'Strip redundant markup to reduce token count')
  .action(async (file, opts) => {
    const { toHtml } = await import('../src/transforms/toHtml.js');
    await toHtml(file, opts);
  });

// markshift to-md <file>
// Convert HTML → clean Markdown
program
  .command('to-md <file>')
  .description('Convert HTML to clean Markdown')
  .action(async (file) => {
    const { toMd } = await import('../src/transforms/toMd.js');
    await toMd(file);
  });

// markshift to-text <file>
// Convert HTML or Markdown → terminal-safe plain text (strips all markup)
program
  .command('to-text <file>')
  .description('Convert HTML/MD to terminal-safe plain text')
  .action(async (file) => {
    const { toText } = await import('../src/transforms/toText.js');
    await toText(file);
  });

// markshift audit <file>
// Compare token counts: original vs each possible transform output
program
  .command('audit <file>')
  .description('Compare token counts before and after each transform')
  .action(async (file) => {
    const { audit } = await import('../src/audit.js');
    await audit(file);
  });

// markshift watch <dir>
// Watch directory for file saves and auto-route each changed file
program
  .command('watch <dir>')
  .description('Watch directory and auto-route files on save')
  .option('--ext <extensions>', 'Comma-separated file extensions to watch', 'md,html')
  .action(async (dir, opts) => {
    const { watch } = await import('../src/watcher.js');
    await watch(dir, opts);
  });

// Every action handler above is async. commander only awaits them when
// parseAsync() is used; with parse() a rejected handler surfaces as an
// unhandled promise rejection instead of a clean error message.
try {
  await program.parseAsync();
} catch (err) {
  process.stderr.write(`[markshift] error: ${err?.message ?? err}\n`);
  process.exit(1);
}
package/package.json ADDED
@@ -0,0 +1,41 @@
1
+ {
2
+ "name": "markshift",
3
+ "version": "0.1.0",
4
+ "description": "Smart format router and transformer for AI context files",
5
+ "bin": {
6
+ "markshift": "bin/markshift.js"
7
+ },
8
+ "files": [
9
+ "bin/",
10
+ "src/",
11
+ "README.md"
12
+ ],
13
+ "type": "module",
14
+ "engines": {
15
+ "node": ">=18.0.0"
16
+ },
17
+ "scripts": {
18
+ "test": "node --test tests/transforms/toHtml.test.js tests/transforms/toMd.test.js tests/transforms/toText.test.js tests/optimizer.test.js tests/audit.test.js tests/router.test.js",
19
+ "audit:braindump": "node -e \"import('./src/audit.js').then(m => m.audit('project-braindump.html'))\"",
20
+ "audit:claude": "node -e \"import('./src/audit.js').then(m => m.audit('CLAUDE.md'))\"",
21
+ "start": "node bin/markshift.js"
22
+ },
23
+ "keywords": [
24
+ "markdown",
25
+ "html",
26
+ "cli",
27
+ "ai",
28
+ "token",
29
+ "transform",
30
+ "router"
31
+ ],
32
+ "author": "",
33
+ "license": "MIT",
34
+ "dependencies": {
35
+ "chokidar": "^3.6.0",
36
+ "commander": "^12.0.0",
37
+ "marked": "^12.0.0",
38
+ "tiktoken": "^1.0.15",
39
+ "turndown": "^7.2.0"
40
+ }
41
+ }
package/src/audit.js ADDED
@@ -0,0 +1,119 @@
1
+ // Token audit — compare token counts across transform outputs
2
+ //
3
+ // Uses `tiktoken` with the cl100k_base encoding (the GPT-4 tokenizer). Counts are only an approximation for Claude and other non-OpenAI models.
4
+ //
5
+ // For a given input file, reports:
6
+ // ┌─────────────────────┬─────────┬─────────┐
7
+ // │ Format │ Tokens │ Delta │
8
+ // ├─────────────────────┼─────────┼─────────┤
9
+ // │ Original │ 1,240 │ — │
10
+ // │ → HTML │ 1,890 │ +650 │
11
+ // │ → HTML (optimized) │ 1,420 │ +180 │
12
+ // │ → Markdown │ 980 │ -260 │
13
+ // │ → Plain text │ 740 │ -500 │
14
+ // └─────────────────────┴─────────┴─────────┘
15
+ //
16
+ // Exit code 0 whenever the file can be read — audit is informational only. An unreadable input file exits with code 1.
17
+
18
+ import { extname } from 'node:path';
19
+ import { readFile } from 'node:fs/promises';
20
+ import { get_encoding } from 'tiktoken';
21
+ import { htmlFromMd } from './transforms/toHtml.js';
22
+ import { mdFromHtml } from './transforms/toMd.js';
23
+ import { textFrom } from './transforms/toText.js';
24
+ import { optimizeHtml } from './optimizer.js';
25
+
26
// Tokenizer instance shared by every countTokens() call; created on demand.
let _enc = null;

/**
 * Return the cached cl100k_base encoder, constructing it on first use.
 */
function getEncoder() {
  if (_enc === null) {
    _enc = get_encoding('cl100k_base');
  }
  return _enc;
}
31
+
32
/**
 * Count the tokens in a string using the shared cl100k_base encoder.
 * Empty, null and undefined input all count as zero tokens.
 * @param {string} text
 * @returns {number}
 */
export function countTokens(text) {
  return text ? getEncoder().encode(text).length : 0;
}
41
+
42
/**
 * Format a token count for the 7-character "Tokens" column.
 * Uses en-US thousands separators (1240 → "1,240") to match the table
 * layout shown in the README and in this file's header comment.
 * @param {number} n
 * @returns {string} right-aligned, at least 7 characters wide
 */
function fmt(n) {
  return Number(n).toLocaleString('en-US').padStart(7);
}
45
+
46
/**
 * Format the token difference between a row and the baseline for the
 * 7-character "Delta" column.
 * @param {number} base  token count of the original content
 * @param {number} n     token count of the transformed content
 * @returns {string} "+N" / "-N", or an em dash when there is no change,
 *   always right-aligned to 7 characters so the table borders line up
 */
function delta(base, n) {
  const d = n - base;
  // Pad the "no change" marker like every other cell; an unpadded dash
  // leaves the column narrower than its border.
  if (d === 0) return '—'.padStart(7);
  const sign = d > 0 ? '+' : '';
  return `${sign}${d}`.padStart(7);
}
52
+
53
/**
 * Render the audit rows as a box-drawing table.
 * The first row is the baseline: its delta cell is an em dash, and every
 * other row's delta is computed against it.
 * @param {string} filePath  shown in the table title
 * @param {{ label: string, tokens: number }[]} rows  baseline first
 * @returns {string}
 */
function renderAuditTable(filePath, rows) {
  const labelW = 21;
  const numW = 9;
  const rule = (left, mid, right) =>
    `${left}${'─'.repeat(labelW)}${mid}${'─'.repeat(numW)}${mid}${'─'.repeat(numW)}${right}`;
  // Padded to the cell width so the right border stays aligned.
  const noDelta = '—'.padStart(numW - 2);
  const base = rows[0].tokens;

  const body = rows.map((row, index) => {
    const label = row.label.padEnd(labelW - 2);
    const change = index === 0 ? noDelta : delta(base, row.tokens);
    return `│ ${label} │ ${fmt(row.tokens)} │ ${change} │`;
  });

  return [
    `\nToken audit: ${filePath}\n`,
    rule('┌', '┬', '┐'),
    `│ ${'Format'.padEnd(labelW - 2)} │ ${'Tokens'.padStart(numW - 2)} │ ${'Delta'.padStart(numW - 2)} │`,
    rule('├', '┼', '┤'),
    ...body,
    rule('└', '┴', '┘'),
    '',
  ].join('\n');
}

/**
 * Run token audit on a file and print comparison table to stdout.
 *
 * Markdown input is compared against HTML, optimized HTML and plain text.
 * HTML input is compared against optimized HTML, Markdown and plain text.
 * Any other extension prints a warning and only the original token count.
 * An unreadable file is reported on stderr and the process exits with code 1.
 * @param {string} filePath
 */
export async function audit(filePath) {
  let content;
  try {
    content = await readFile(filePath, 'utf8');
  } catch (err) {
    process.stderr.write(`[markshift] error: ${err.message}\n`);
    process.exit(1);
  }
  const ext = extname(filePath).toLowerCase();

  const isMd = ext === '.md' || ext === '.markdown';
  const isHtml = ext === '.html' || ext === '.htm';

  const rows = [{ label: 'Original', tokens: countTokens(content) }];

  if (!isMd && !isHtml) {
    console.warn(`[markshift] audit: unsupported extension "${ext}" — showing token count only (no transforms run).`);
  }

  if (isMd) {
    // Parse the Markdown once and optimize that result, rather than
    // rendering it a second time for the optimized row.
    const html = htmlFromMd(content);
    const htmlOpt = optimizeHtml(html);
    const txt = textFrom(content, ext);
    rows.push(
      { label: '→ HTML', tokens: countTokens(html) },
      { label: '→ HTML (optimized)', tokens: countTokens(htmlOpt) },
      { label: '→ Plain text', tokens: countTokens(txt) },
    );
  } else if (isHtml) {
    const htmlOpt = optimizeHtml(content);
    const md = mdFromHtml(content);
    const txt = textFrom(content, ext);
    rows.push(
      { label: '→ HTML (optimized)', tokens: countTokens(htmlOpt) },
      { label: '→ Markdown', tokens: countTokens(md) },
      { label: '→ Plain text', tokens: countTokens(txt) },
    );
  }

  console.log(renderAuditTable(filePath, rows));
}
@@ -0,0 +1,110 @@
1
+ /**
2
+ * Token optimizer for HTML output.
3
+ *
4
+ * Goal: reduce token count of HTML without changing rendered output.
5
+ *
6
+ * Techniques (in pipeline order):
7
+ * 1. Strip HTML comments
8
+ * 2. Strip <script>/<style> block content (agent context never needs them)
9
+ * 3. Collapse inter-tag whitespace (non-pre contexts)
10
+ * 4. Extract repeated inline styles (3+ occurrences) to <style> block
11
+ *
12
+ * Known limitation: collapseWhitespace uses regex and does not distinguish
13
+ * <pre> from non-<pre> contexts. Whitespace inside <pre> blocks may be
14
+ * incorrectly collapsed. YAGNI for now — markshift context files rarely
15
+ * contain pre-formatted whitespace-sensitive content.
16
+ */
17
+
18
/**
 * Run the complete optimization pipeline over an HTML string.
 * Steps execute in a fixed order: comment removal, script/style removal,
 * whitespace collapsing, then inline-style extraction.
 * @param {string} html
 * @returns {string}
 */
export function optimizeHtml(html) {
  const pipeline = [
    removeComments,
    stripScriptStyle,
    collapseWhitespace,
    extractInlineStyles,
  ];
  return pipeline.reduce((markup, step) => step(markup), html);
}
30
+
31
/**
 * Delete every `<!-- ... -->` comment, including multi-line ones.
 * @param {string} html
 * @returns {string}
 */
function removeComments(html) {
  const commentPattern = /<!--[\s\S]*?-->/g;
  return html.replace(commentPattern, '');
}
34
+
35
/**
 * Remove `<script>` and `<style>` elements together with their contents.
 * Tag names are matched case-insensitively.
 * @param {string} html
 * @returns {string}
 */
function stripScriptStyle(html) {
  const withoutScripts = html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');
  const withoutStyles = withoutScripts.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
  return withoutStyles;
}
40
+
41
// Elements whose whitespace is significant when rendered. Matched as whole
// blocks so they can be passed through untouched.
const WHITESPACE_SENSITIVE_BLOCK =
  /(<pre\b[^>]*>[\s\S]*?<\/pre\s*>|<textarea\b[^>]*>[\s\S]*?<\/textarea\s*>)/gi;

/**
 * Collapse insignificant whitespace: line breaks next to tags and runs of
 * spaces/tabs.
 *
 * `<pre>` and `<textarea>` blocks are copied through verbatim. This
 * supersedes the "known limitation" noted in the file header. Fenced code
 * blocks rendered from Markdown become `<pre><code>`, and collapsing their
 * indentation would change the rendered output.
 * @param {string} html
 * @returns {string}
 */
function collapseWhitespace(html) {
  // Normalize CRLF → LF first so regex patterns work on Windows
  const normalized = html.replace(/\r\n/g, '\n');
  // split() with one capturing group alternates ordinary markup (even
  // indexes) with whitespace-sensitive blocks (odd indexes).
  return normalized
    .split(WHITESPACE_SENSITIVE_BLOCK)
    .map((part, index) => (index % 2 === 1
      ? part
      : part
        .replace(/>[ \t]*\n[ \t]*/g, '>')
        .replace(/\n[ \t]*</g, '<')
        .replace(/[ \t]{2,}/g, ' ')))
    .join('');
}
49
+
50
// A double-quoted inline style attribute; group 1 is the declaration text.
const STYLE_ATTR = / style="([^"]+)"/g;
// A real `class` attribute. The leading whitespace requirement stops
// `data-class="…"` and similar attributes from being mistaken for it.
const CLASS_ATTR = /(\s)class="([^"]*)"/;

/**
 * Rewrite one opening tag: drop its extracted style attribute(s) and
 * reference the generated class name(s) instead.
 * @param {string} tag  full opening tag, e.g. `<p class="a" style="x">`
 * @param {Map<string, string>} toExtract  style declaration → class name
 * @returns {string}
 */
function rewriteTagStyles(tag, toExtract) {
  const hits = [...tag.matchAll(STYLE_ATTR)].filter((m) => toExtract.has(m[1]));
  if (hits.length === 0) return tag;

  const names = [...new Set(hits.map((m) => toExtract.get(m[1])))].join(' ');
  const hasClass = CLASS_ATTR.test(tag);

  let rebuilt = '';
  let cursor = 0;
  hits.forEach((hit, index) => {
    rebuilt += tag.slice(cursor, hit.index);
    // Without an existing class attribute, the new one takes the place of
    // the first removed style attribute.
    if (index === 0 && !hasClass) rebuilt += ` class="${names}"`;
    cursor = hit.index + hit[0].length;
  });
  rebuilt += tag.slice(cursor);

  // With an existing class attribute, append to it regardless of whether it
  // came before or after the style attribute. This avoids emitting a second
  // `class` attribute.
  return hasClass
    ? rebuilt.replace(CLASS_ATTR, (_, lead, value) => `${lead}class="${value} ${names}"`)
    : rebuilt;
}

/**
 * Extract repeated inline styles (3+ occurrences) into a <style> block.
 *
 * Each extracted declaration gets a generated class name (`ms0`, `ms1`, …)
 * and every element carrying that inline style references the class instead.
 * The <style> block is inserted before the first `</head>` if there is one,
 * otherwise it is prepended to the document.
 *
 * Limitations: only double-quoted `style="…"` attributes are recognized,
 * tags are located with a regex (an attribute value containing `>` is not
 * handled), and the `ms0..msN` class names are assumed not to be in use
 * already.
 * @param {string} html
 * @returns {string}
 */
export function extractInlineStyles(html) {
  // Count frequency of each style declaration
  const freq = new Map();
  for (const [, declaration] of html.matchAll(STYLE_ATTR)) {
    freq.set(declaration, (freq.get(declaration) ?? 0) + 1);
  }

  // Only extract styles appearing 3+ times
  const toExtract = new Map();
  for (const [declaration, count] of freq) {
    if (count >= 3) toExtract.set(declaration, `ms${toExtract.size}`);
  }
  if (toExtract.size === 0) return html;

  const rules = [...toExtract]
    .map(([declaration, cls]) => `.${cls}{${declaration}}`)
    .join('');
  const styleBlock = `<style>${rules}</style>`;

  const rewritten = html.replace(/<[a-zA-Z][^>]*>/g, (tag) => rewriteTagStyles(tag, toExtract));

  // Function replacer: the CSS text is inserted literally even if it
  // contains `$&`, `$1` or other replacement patterns.
  return rewritten.includes('</head>')
    ? rewritten.replace('</head>', () => `${styleBlock}</head>`)
    : styleBlock + rewritten;
}
package/src/router.js ADDED
@@ -0,0 +1,133 @@
1
+ import { readFile, readdir, stat } from 'node:fs/promises';
2
+ import { basename, extname, join } from 'node:path';
3
+ import { htmlFromMd } from './transforms/toHtml.js';
4
+ import { mdFromHtml } from './transforms/toMd.js';
5
+
6
// Agent-loop filenames — always stay as MD
const AGENT_NAMES = new Set([
  'claude.md', 'agents.md', 'skill.md', 'gemini.md',
  'readme.md', 'changelog.md', 'contributing.md',
  'license.md', 'security.md', 'codeowners',
]);

// Directory names that signal agent context
const AGENT_PATHS = ['.claude/', 'prompts/', 'context/', 'skills/'];

// Directory names that signal human-facing context
const HUMAN_PATHS = ['deliverables/', 'docs/', 'reports/', 'output/', 'dist/'];

/**
 * True when any signal (e.g. 'docs/') names one of the given directory
 * segments exactly. Whole-segment comparison means 'docs/' matches
 * `a/docs/x.md` but not `a/some-docs/x.md`.
 * @param {string[]} dirSegments  lower-cased directory names of the path
 * @param {string[]} signals      entries of AGENT_PATHS / HUMAN_PATHS
 * @returns {boolean}
 */
function hasDirSignal(dirSegments, signals) {
  return signals.some((signal) => dirSegments.includes(signal.replace(/\/$/, '')));
}

/**
 * Detect the context type of a file.
 * Returns 'human' (→ HTML) or 'agent' (→ MD).
 * Terminal/plain-text output is handled by the explicit `to-text` command,
 * not by auto-detection (runtime context, not a file property).
 * Pure function — no file I/O, no side effects.
 *
 * Signals are checked in this order; the first match wins:
 *   1. well-known agent filenames (CLAUDE.md, README.md, …) → 'agent'
 *   2. an agent directory anywhere in the path → 'agent'
 *   3. .html / .htm extension → 'human'
 *   4. a human-facing directory anywhere in the path → 'human'
 *   5. a Markdown file whose content contains an HTML tag → 'human'
 *   6. otherwise → 'agent'
 *
 * Matching is case-insensitive and accepts both `/` and `\` separators.
 * @param {string} filePath
 * @param {string} content
 * @returns {'human'|'agent'}
 */
export function detectContextType(filePath, content) {
  const text = content ?? '';
  const normalized = filePath.replace(/\\/g, '/').toLowerCase();
  // Derive name and extension from the normalized path so Windows-style
  // paths are recognized on every platform.
  const name = basename(normalized);
  const ext = extname(name);
  const dirSegments = normalized.split('/').slice(0, -1);

  // Agent-loop filenames
  if (AGENT_NAMES.has(name)) return 'agent';

  // Agent path signals
  if (hasDirSignal(dirSegments, AGENT_PATHS)) return 'agent';

  // HTML extension → human-facing by definition
  if (ext === '.html' || ext === '.htm') return 'human';

  // Human-facing path signals
  if (hasDirSignal(dirSegments, HUMAN_PATHS)) return 'human';

  // Content signal: MD file containing HTML tags → human-facing
  if ((ext === '.md' || ext === '.markdown') && /<[a-z][^>]*>/i.test(text)) {
    return 'human';
  }

  // Default: MD stays as MD (agent context)
  return 'agent';
}
57
+
58
/**
 * Detect context type for a file and transform it to the best format.
 * Prints the result to stdout.
 *
 * - Unsupported extensions are reported on stderr and skipped, without
 *   reading the file.
 * - A read failure is reported on stderr and sets `process.exitCode = 1`,
 *   then returns. It does not call `process.exit()`, so a directory run or
 *   a watcher keeps processing the remaining files, while a single-file
 *   CLI invocation still ends with a non-zero exit status.
 * - 'human' context: HTML is passed through, Markdown is converted to HTML
 *   (written without a trailing newline).
 * - 'agent' context: Markdown is passed through, HTML is converted to
 *   Markdown (written with a trailing newline).
 * @param {string} filePath
 */
export async function route(filePath) {
  const ext = extname(filePath).toLowerCase();
  const isHtml = ext === '.html' || ext === '.htm';
  const isMd = ext === '.md' || ext === '.markdown';

  if (!isHtml && !isMd) {
    process.stderr.write(`[markshift] route: unsupported file type "${ext}", skipping ${filePath}\n`);
    return;
  }

  let content;
  try {
    content = await readFile(filePath, 'utf8');
  } catch (err) {
    process.stderr.write(`[markshift] error: ${err.message}\n`);
    process.exitCode = 1;
    return;
  }

  if (detectContextType(filePath, content) === 'human') {
    // HTML input stays HTML; MD input converts to HTML
    process.stdout.write(isHtml ? content : htmlFromMd(content));
    return;
  }

  // agent → MD
  const md = isMd ? content : mdFromHtml(content);
  process.stdout.write(md + '\n');
}
96
+
97
/**
 * Route every Markdown/HTML file found directly inside a directory.
 * Hidden entries (names starting with ".") and `node_modules` are skipped.
 * Sub-directories are only descended into when `opts.recursive` is set.
 * Failures on individual entries are logged to stderr and do not stop the
 * run; an unreadable directory is logged and the call returns.
 * @param {string} dir
 * @param {{ recursive?: boolean }} opts
 */
export async function routeDir(dir, opts = {}) {
  const routable = new Set(['.md', '.markdown', '.html', '.htm']);

  let names;
  try {
    names = await readdir(dir);
  } catch (err) {
    process.stderr.write(`[markshift] error reading directory ${dir}: ${err.message}\n`);
    return;
  }

  const visible = names.filter((name) => !name.startsWith('.') && name !== 'node_modules');

  for (const name of visible) {
    const fullPath = join(dir, name);

    let info;
    try {
      info = await stat(fullPath);
    } catch (err) {
      process.stderr.write(`[markshift] error: ${err.message}\n`);
      continue;
    }

    if (info.isDirectory()) {
      if (opts.recursive) {
        await routeDir(fullPath, opts);
      }
      continue;
    }

    if (!routable.has(extname(name).toLowerCase())) continue;

    try {
      await route(fullPath);
    } catch (err) {
      process.stderr.write(`[markshift] error routing ${fullPath}: ${err.message}\n`);
    }
  }
}
@@ -0,0 +1,42 @@
1
+ // Markdown → structured HTML transform
2
+ //
3
+ // --optimize-tokens mode:
4
+ // - Strips HTML comments and script/style blocks
5
+ // - Collapses inter-tag whitespace
6
+ // - Extracts repeated inline styles into a <style> block
7
+ //
8
+ // Output written to stdout by default.
9
+
10
+ import { readFile } from 'node:fs/promises';
11
+ import { marked } from 'marked';
12
+ import { optimizeHtml } from '../optimizer.js';
13
+
14
/**
 * Render a Markdown string to an HTML fragment (no <!DOCTYPE> wrapper).
 * With `opts.optimizeTokens` set, the rendered HTML is additionally passed
 * through the token optimizer.
 * @param {string} content
 * @param {{ optimizeTokens?: boolean }} opts
 * @returns {string}
 */
export function htmlFromMd(content, opts = {}) {
  const rendered = marked(content);
  return opts.optimizeTokens ? optimizeHtml(rendered) : rendered;
}
26
+
27
/**
 * CLI handler for `to-html`: read a Markdown file, convert it and write the
 * HTML fragment to stdout without a trailing newline (intended for piping).
 * A read failure is reported on stderr and exits the process with code 1.
 * @param {string} filePath
 * @param {{ optimizeTokens?: boolean }} opts
 */
export async function toHtml(filePath, opts = {}) {
  let markdown;
  try {
    markdown = await readFile(filePath, 'utf8');
  } catch (readError) {
    const { message } = readError;
    process.stderr.write(`[markshift] error: ${message}\n`);
    process.exit(1);
  }
  const html = htmlFromMd(markdown, opts);
  process.stdout.write(html);
}
@@ -0,0 +1,52 @@
1
+ // HTML → clean Markdown transform
2
+ //
3
+ // Uses `turndown` with these options:
4
+ // - headingStyle: 'atx' (# H1, ## H2, ...)
5
+ // - codeBlockStyle: 'fenced' (``` fences)
6
+ // - bulletListMarker: '-'
7
+ //
8
+ // Post-processing:
9
+ // - Collapse runs of 3+ newlines → 2 newlines (at most one blank line in a row)
10
+ // - Trim trailing whitespace per line
11
+ // - Strip HTML comments
12
+ //
13
+ // Output written to stdout.
14
+
15
+ import { readFile } from 'node:fs/promises';
16
+ import TurndownService from 'turndown';
17
+
18
// Shared converter: ATX headings (#), fenced code blocks, "-" bullets.
const td = new TurndownService({
  headingStyle: 'atx',
  codeBlockStyle: 'fenced',
  bulletListMarker: '-',
});

/**
 * Pure transform: HTML string → clean Markdown string.
 *
 * HTML comments are removed before conversion. The result is then tidied:
 * trailing spaces/tabs are trimmed from every line, and runs of three or
 * more newlines are reduced to two (at most one blank line in a row).
 * @param {string} html
 * @returns {string}
 */
export function mdFromHtml(html) {
  // Strip HTML comments before turndown sees them
  const stripped = html.replace(/<!--[\s\S]*?-->/g, '');
  const md = td.turndown(stripped);
  // Trim first, collapse second: a line holding only spaces becomes empty
  // when trimmed, and must be empty before the blank-line collapse can
  // count it.
  return md
    .replace(/[ \t]+$/gm, '')
    .replace(/\n{3,}/g, '\n\n');
}
37
+
38
/**
 * CLI handler for `to-md`: read an HTML file, convert it to Markdown and
 * write the result to stdout followed by a single trailing newline.
 * A read failure is reported on stderr and exits the process with code 1.
 * @param {string} filePath
 */
export async function toMd(filePath) {
  let html;
  try {
    html = await readFile(filePath, 'utf8');
  } catch (readError) {
    const { message } = readError;
    process.stderr.write(`[markshift] error: ${message}\n`);
    process.exit(1);
  }
  const markdown = mdFromHtml(html);
  process.stdout.write(markdown + '\n');
}
@@ -0,0 +1,97 @@
1
+ import { readFile } from 'node:fs/promises';
2
+ import { extname } from 'node:path';
3
+ import { marked } from 'marked';
4
+
5
// Named entities decoded to plain characters. Lookup is case-sensitive.
// &nbsp; becomes an ordinary space because the output targets terminals.
const ENTITIES = {
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&#39;': "'",
  '&apos;': "'",
  '&nbsp;': ' ',
};

/**
 * Decode HTML entities to their character equivalents.
 *
 * Handles the named entities in ENTITIES plus any decimal (`&#8212;`) or
 * hexadecimal (`&#x2014;`) numeric character reference. Unknown names and
 * numeric references that are not valid Unicode scalar values (zero,
 * surrogates, beyond U+10FFFF) are left untouched. Decoding is single-pass,
 * so `&amp;lt;` yields the literal text `&lt;`.
 * @param {string} str
 * @returns {string}
 */
function decodeEntities(str) {
  return str.replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z][a-z0-9]*);/gi, (entity, body) => {
    if (Object.hasOwn(ENTITIES, entity)) return ENTITIES[entity];
    if (body[0] !== '#') return entity;

    const isHex = body[1] === 'x' || body[1] === 'X';
    const code = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    const isSurrogate = code >= 0xd800 && code <= 0xdfff;
    if (!Number.isInteger(code) || code <= 0 || code > 0x10ffff || isSurrogate) {
      return entity;
    }
    // Keep numeric non-breaking spaces consistent with &nbsp;.
    return code === 0xa0 ? ' ' : String.fromCodePoint(code);
  });
}
22
+
23
/**
 * Word-wrap text to specified column width.
 *
 * Each input line is wrapped independently; lines already within `width`
 * are returned unchanged. Words are split on single spaces, and a word
 * longer than `width` is force-broken into chunks of at most `width`
 * code points. Surrogate pairs (emoji, astral characters) are never cut
 * in half.
 * @param {string} text
 * @param {number} width
 * @returns {string}
 */
function wordWrap(text, width = 80) {
  // Built once per call rather than once per word. The `u` flag makes `.`
  // consume whole code points, so a chunk boundary cannot fall inside a
  // surrogate pair.
  const chunker = new RegExp(`.{1,${width}}`, 'gu');

  return text.split('\n').map((line) => {
    if (line.length <= width) return line;

    const wrapped = [];
    let current = '';
    for (const word of line.split(' ')) {
      // Force-break words that exceed width on their own
      const chunks = word.match(chunker) ?? [word];
      for (const chunk of chunks) {
        const candidate = current ? `${current} ${chunk}` : chunk;
        if (candidate.length > width) {
          if (current) wrapped.push(current);
          current = chunk;
        } else {
          current = candidate;
        }
      }
    }
    if (current) wrapped.push(current);
    return wrapped.join('\n');
  }).join('\n');
}
49
+
50
/**
 * Pure transform: HTML or Markdown string → terminal-safe plain text.
 *
 * Pipeline: render Markdown to HTML when `ext` is a Markdown extension,
 * drop <script>/<style> blocks, replace every tag with a space, decode
 * entities, normalize whitespace, then word-wrap to 80 columns.
 *
 * Whitespace normalization converts CRLF/CR to LF, collapses runs of
 * spaces/tabs, trims spaces at both ends of every line, and limits
 * consecutive newlines to two.
 * @param {string} content
 * @param {string} ext File extension including dot, e.g. '.md' or '.html'
 * @returns {string}
 */
export function textFrom(content, ext) {
  const isMarkdown = ext === '.md' || ext === '.markdown';

  const html = (isMarkdown ? marked(content) : content)
    // Strip script/style blocks (content would otherwise appear as plain text)
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, ' ');

  // Tags become spaces so adjacent text nodes do not fuse together.
  const text = decodeEntities(html.replace(/<[^>]+>/g, ' '))
    .replace(/\r\n?/g, '\n')
    .replace(/[ \t]+/g, ' ')
    // Trim each line. Spaces left behind by removed tags would otherwise
    // sit between newlines and stop the blank-line collapse from matching.
    .replace(/ ?\n ?/g, '\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();

  // Word-wrap to 80 columns
  return wordWrap(text);
}
81
+
82
/**
 * CLI handler for `to-text`: read an HTML or Markdown file, reduce it to
 * plain text and write it to stdout followed by a trailing newline.
 * The lower-cased file extension decides whether the input is treated as
 * Markdown. A read failure is reported on stderr and exits with code 1.
 * @param {string} filePath
 */
export async function toText(filePath) {
  let raw;
  try {
    raw = await readFile(filePath, 'utf8');
  } catch (readError) {
    const { message } = readError;
    process.stderr.write(`[markshift] error: ${message}\n`);
    process.exit(1);
  }
  const extension = extname(filePath).toLowerCase();
  const plain = textFrom(raw, extension);
  process.stdout.write(plain + '\n');
}
package/src/watcher.js ADDED
@@ -0,0 +1,60 @@
1
+ // Directory watcher — auto-routes files on save
2
+ //
3
+ // Uses `chokidar` (cross-platform fs watch with debounce).
4
+ //
5
+ // Behavior:
6
+ // - Watches *.md and *.html files by default (override with --ext) in the target directory (recursive)
7
+ // - On change: calls router.route(filePath)
8
+ // - Debounce: 300ms (avoid double-fire on editor atomic saves)
9
+ // - Prints a log line per event: [HH:MM:SS] routed path/to/file
10
+ // - Ignores: node_modules, .git, hidden files
11
+ //
12
+ // Runs until SIGINT (Ctrl+C).
13
+
14
+ import chokidar from 'chokidar';
15
+ import { route } from './router.js';
16
+
17
/**
 * Watch a directory and auto-route files on save.
 * Logs events to stderr (not stdout) so transform output stays clean for piping.
 *
 * Only `change` events are handled. Each changed file is routed after a
 * 300 ms quiet period, so a burst of writes to one file triggers a single
 * route() call. On SIGINT, pending timers are cancelled, the watcher is
 * closed, and the process exits with code 0.
 * @param {string} dir
 * @param {{ ext?: string }} opts ext = comma-separated extensions, e.g. 'md,html'
 */
export async function watch(dir, opts = {}) {
  // Drop empty entries ("md," or ",") so they cannot produce a `**/*.`
  // glob; fall back to the defaults when nothing usable remains.
  const requested = (opts.ext ?? '')
    .split(',')
    .map((e) => e.trim().replace(/^\./, ''))
    .filter((e) => e !== '');
  const extensions = requested.length > 0 ? requested : ['md', 'html'];

  const normalizedDir = dir.replace(/\\/g, '/');
  const patterns = extensions.map((e) => `${normalizedDir}/**/*.${e}`);

  const watcher = chokidar.watch(patterns, {
    ignored: [/node_modules/, /\.git/, /(^|[/\\])\../],
    persistent: true,
    ignoreInitial: true,
  });

  // filePath → pending timer id
  const debounceMap = new Map();

  watcher.on('change', (filePath) => {
    if (debounceMap.has(filePath)) clearTimeout(debounceMap.get(filePath));
    debounceMap.set(filePath, setTimeout(async () => {
      debounceMap.delete(filePath);
      const time = new Date().toTimeString().slice(0, 8);
      try {
        await route(filePath);
        process.stderr.write(`[${time}] routed ${filePath}\n`);
      } catch (err) {
        process.stderr.write(`[${time}] error ${filePath} — ${err.message}\n`);
      }
    }, 300));
  });

  process.once('SIGINT', () => {
    for (const id of debounceMap.values()) clearTimeout(id);
    debounceMap.clear();
    watcher.close().then(() => process.exit(0));
  });

  process.stderr.write(`watching ${normalizedDir} for ${extensions.map((e) => `*.${e}`).join(', ')} changes...\n`);
}