markshift 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +48 -0
- package/bin/markshift.js +95 -0
- package/package.json +41 -0
- package/src/audit.js +119 -0
- package/src/optimizer.js +110 -0
- package/src/router.js +133 -0
- package/src/transforms/toHtml.js +42 -0
- package/src/transforms/toMd.js +52 -0
- package/src/transforms/toText.js +97 -0
- package/src/watcher.js +60 -0
package/README.md
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# markshift
|
|
2
|
+
|
|
3
|
+
Smart format router and transformer for AI context files.
|
|
4
|
+
|
|
5
|
+
Converts between Markdown, HTML, and plain text based on *where the content is going*:
|
|
6
|
+
- Human-facing deliverables → HTML
|
|
7
|
+
- Agent loops and git-bound files → Markdown
|
|
8
|
+
- Terminal output → plain text
|
|
9
|
+
|
|
10
|
+
Includes a token audit command and a watch mode.
|
|
11
|
+
|
|
12
|
+
## Install
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
npm install -g markshift
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Commands
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
markshift route <file> # Auto-detect context type and transform
|
|
22
|
+
markshift to-html <file> # Markdown → HTML
|
|
23
|
+
markshift to-html --optimize-tokens <file> # Markdown → HTML (token-optimized)
|
|
24
|
+
markshift to-md <file> # HTML → Markdown
|
|
25
|
+
markshift to-text <file> # HTML/MD → plain text
|
|
26
|
+
markshift audit <file> # Token count comparison across formats
|
|
27
|
+
markshift watch <dir> # Watch directory, auto-route on save
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Token Audit Example
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
$ markshift audit context.md
|
|
34
|
+
|
|
35
|
+
┌─────────────────┬────────────┬──────────┐
|
|
36
|
+
│ Format │ Tokens │ Delta │
|
|
37
|
+
├─────────────────┼────────────┼──────────┤
|
|
38
|
+
│ Original │ 1,240 │ — │
|
|
39
|
+
│ → HTML │ 1,890 │ +650 │
|
|
40
|
+
│ → HTML (opt) │ 1,420 │ +180 │
|
|
41
|
+
│ → Markdown │ 980 │ -260 │
|
|
42
|
+
│ → Plain text │ 740 │ -500 │
|
|
43
|
+
└─────────────────┴────────────┴──────────┘
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## License
|
|
47
|
+
|
|
48
|
+
MIT
|
package/bin/markshift.js
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// CLI entry point — wires commander.js commands to src/ handlers
|
|
4
|
+
// Each command imports its handler lazily to keep startup fast
|
|
5
|
+
|
|
6
|
+
import { program } from 'commander';
|
|
7
|
+
import { readFileSync } from 'fs';
|
|
8
|
+
import { fileURLToPath } from 'url';
|
|
9
|
+
import { dirname, join } from 'path';
|
|
10
|
+
|
|
11
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
// Read the version from package.json so `--version` always matches the package.
const pkg = JSON.parse(readFileSync(join(__dirname, '../package.json'), 'utf8'));

program
  .name('markshift')
  .description('Smart format router and transformer for AI context files')
  .version(pkg.version);

// markshift route <path>
// Auto-detect context type (human/agent) and transform accordingly.
// <path> may be a single file or a directory (see --recursive).
program
  .command('route <path>')
  .description('Auto-detect context type and transform to best format. Accepts file or directory.')
  .option('--recursive', 'Process all files in directory recursively')
  .action(async (target, opts) => {
    const { stat } = await import('node:fs/promises');
    const { route, routeDir } = await import('../src/router.js');
    let info;
    try {
      info = await stat(target);
    } catch (err) {
      process.stderr.write(`[markshift] error: ${target}: ${err.message}\n`);
      process.exit(1);
    }
    if (info.isDirectory()) {
      await routeDir(target, opts);
    } else {
      await route(target);
    }
  });

// markshift to-html <file>
// Convert Markdown → structured HTML
// --optimize-tokens: strip redundant markup, extract inline CSS
program
  .command('to-html <file>')
  .description('Convert Markdown to structured HTML')
  .option('--optimize-tokens', 'Strip redundant markup to reduce token count')
  .action(async (file, opts) => {
    const { toHtml } = await import('../src/transforms/toHtml.js');
    await toHtml(file, opts);
  });

// markshift to-md <file>
// Convert HTML → clean Markdown
program
  .command('to-md <file>')
  .description('Convert HTML to clean Markdown')
  .action(async (file) => {
    const { toMd } = await import('../src/transforms/toMd.js');
    await toMd(file);
  });

// markshift to-text <file>
// Convert HTML or Markdown → terminal-safe plain text (strips all markup)
program
  .command('to-text <file>')
  .description('Convert HTML/MD to terminal-safe plain text')
  .action(async (file) => {
    const { toText } = await import('../src/transforms/toText.js');
    await toText(file);
  });

// markshift audit <file>
// Compare token counts: original vs each possible transform output
program
  .command('audit <file>')
  .description('Compare token counts before and after each transform')
  .action(async (file) => {
    const { audit } = await import('../src/audit.js');
    await audit(file);
  });

// markshift watch <dir>
// Watch directory for file saves and auto-route each changed file
program
  .command('watch <dir>')
  .description('Watch directory and auto-route files on save')
  .option('--ext <extensions>', 'Comma-separated file extensions to watch', 'md,html')
  .action(async (dir, opts) => {
    const { watch } = await import('../src/watcher.js');
    await watch(dir, opts);
  });

// Every action handler above is async, so the program must be driven with
// parseAsync(): parse() would not wait for the handlers, and a rejection from
// one of them would surface as an unhandled promise rejection with a raw stack
// trace. Awaiting here lets us report the failure once, in the CLI's own style.
try {
  await program.parseAsync();
} catch (err) {
  process.stderr.write(`[markshift] error: ${err?.message ?? err}\n`);
  process.exitCode = 1;
}
|
package/package.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "markshift",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Smart format router and transformer for AI context files",
|
|
5
|
+
"bin": {
|
|
6
|
+
"markshift": "bin/markshift.js"
|
|
7
|
+
},
|
|
8
|
+
"files": [
|
|
9
|
+
"bin/",
|
|
10
|
+
"src/",
|
|
11
|
+
"README.md"
|
|
12
|
+
],
|
|
13
|
+
"type": "module",
|
|
14
|
+
"engines": {
|
|
15
|
+
"node": ">=18.0.0"
|
|
16
|
+
},
|
|
17
|
+
"scripts": {
|
|
18
|
+
"test": "node --test tests/transforms/toHtml.test.js tests/transforms/toMd.test.js tests/transforms/toText.test.js tests/optimizer.test.js tests/audit.test.js tests/router.test.js",
|
|
19
|
+
"audit:braindump": "node -e \"import('./src/audit.js').then(m => m.audit('project-braindump.html'))\"",
|
|
20
|
+
"audit:claude": "node -e \"import('./src/audit.js').then(m => m.audit('CLAUDE.md'))\"",
|
|
21
|
+
"start": "node bin/markshift.js"
|
|
22
|
+
},
|
|
23
|
+
"keywords": [
|
|
24
|
+
"markdown",
|
|
25
|
+
"html",
|
|
26
|
+
"cli",
|
|
27
|
+
"ai",
|
|
28
|
+
"token",
|
|
29
|
+
"transform",
|
|
30
|
+
"router"
|
|
31
|
+
],
|
|
32
|
+
"author": "",
|
|
33
|
+
"license": "MIT",
|
|
34
|
+
"dependencies": {
|
|
35
|
+
"chokidar": "^3.6.0",
|
|
36
|
+
"commander": "^12.0.0",
|
|
37
|
+
"marked": "^12.0.0",
|
|
38
|
+
"tiktoken": "^1.0.15",
|
|
39
|
+
"turndown": "^7.2.0"
|
|
40
|
+
}
|
|
41
|
+
}
|
package/src/audit.js
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
// Token audit — compare token counts across transform outputs
|
|
2
|
+
//
|
|
3
|
+
// Uses `tiktoken` with the cl100k_base encoding (the GPT-4 tokenizer; counts for other models such as Claude are approximate).
|
|
4
|
+
//
|
|
5
|
+
// For a given input file, reports:
|
|
6
|
+
// ┌─────────────────────┬─────────┬─────────┐
|
|
7
|
+
// │ Format │ Tokens │ Delta │
|
|
8
|
+
// ├─────────────────────┼─────────┼─────────┤
|
|
9
|
+
// │ Original │ 1,240 │ — │
|
|
10
|
+
// │ → HTML │ 1,890 │ +650 │
|
|
11
|
+
// │ → HTML (optimized) │ 1,420 │ +180 │
|
|
12
|
+
// │ → Markdown │ 980 │ -260 │
|
|
13
|
+
// │ → Plain text │ 740 │ -500 │
|
|
14
|
+
// └─────────────────────┴─────────┴─────────┘
|
|
15
|
+
//
|
|
16
|
+
// Exit code 0 unless the input file cannot be read (then 1) — audit is informational only.
|
|
17
|
+
|
|
18
|
+
import { extname } from 'node:path';
|
|
19
|
+
import { readFile } from 'node:fs/promises';
|
|
20
|
+
import { get_encoding } from 'tiktoken';
|
|
21
|
+
import { htmlFromMd } from './transforms/toHtml.js';
|
|
22
|
+
import { mdFromHtml } from './transforms/toMd.js';
|
|
23
|
+
import { textFrom } from './transforms/toText.js';
|
|
24
|
+
import { optimizeHtml } from './optimizer.js';
|
|
25
|
+
|
|
26
|
+
// Module-level cache for the encoder so it is created at most once.
let _enc = null;

/**
 * Return the shared cl100k_base encoder, creating it on first use.
 */
function getEncoder() {
  if (_enc) return _enc;
  _enc = get_encoding('cl100k_base');
  return _enc;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Count the tokens in a string with the cl100k_base encoder.
 * Falsy input (empty string, null, undefined) counts as zero tokens and
 * never touches the encoder.
 * @param {string} text
 * @returns {number}
 */
export function countTokens(text) {
  return text ? getEncoder().encode(text).length : 0;
}
|
|
41
|
+
|
|
42
|
+
/**
 * Right-align a value in a 7-character table cell.
 * Values longer than 7 characters are returned unpadded.
 */
function fmt(n) {
  const digits = String(n);
  return digits.padStart(7, ' ');
}
|
|
45
|
+
|
|
46
|
+
/**
 * Format the signed difference between a token count and the baseline as a
 * 7-character, right-aligned table cell.
 * @param {number} base Baseline token count.
 * @param {number} n Token count being compared.
 * @returns {string} '+N' for increases, '-N' for decreases, an em dash for no
 *   change — always padded to the same width so the table borders line up.
 */
function delta(base, n) {
  const diff = n - base;
  // The "no change" marker must fill the same 7 columns as a numeric delta.
  if (diff === 0) return '—'.padStart(7);
  // Negative numbers already carry their '-' sign.
  const sign = diff > 0 ? '+' : '';
  return `${sign}${diff}`.padStart(7);
}
|
|
52
|
+
|
|
53
|
+
/**
 * Run a token audit on a file and print a comparison table to stdout.
 *
 * Markdown input (.md/.markdown) is compared against its HTML, optimized HTML
 * and plain-text renderings; HTML input (.html/.htm) against optimized HTML,
 * Markdown and plain text. Any other extension gets a warning and a table
 * containing only the original token count.
 *
 * If the file cannot be read, the error is written to stderr and the process
 * exits with code 1.
 * @param {string} filePath
 */
export async function audit(filePath) {
  let content;
  try {
    content = await readFile(filePath, 'utf8');
  } catch (err) {
    process.stderr.write(`[markshift] error: ${err.message}\n`);
    process.exit(1);
  }

  const ext = extname(filePath).toLowerCase();
  const isMd = ext === '.md' || ext === '.markdown';
  const isHtml = ext === '.html' || ext === '.htm';

  const rows = [{ label: 'Original', tokens: countTokens(content) }];

  if (isMd) {
    rows.push(
      { label: '→ HTML', tokens: countTokens(htmlFromMd(content)) },
      { label: '→ HTML (optimized)', tokens: countTokens(htmlFromMd(content, { optimizeTokens: true })) },
      { label: '→ Plain text', tokens: countTokens(textFrom(content, ext)) },
    );
  } else if (isHtml) {
    rows.push(
      { label: '→ HTML (optimized)', tokens: countTokens(optimizeHtml(content)) },
      { label: '→ Markdown', tokens: countTokens(mdFromHtml(content)) },
      { label: '→ Plain text', tokens: countTokens(textFrom(content, ext)) },
    );
  } else {
    console.warn(`[markshift] audit: unsupported extension "${ext}" — showing token count only (no transforms run).`);
  }

  // Column widths include one space of padding on each side of the cell text.
  const labelW = 21;
  const numW = 9;
  const cellW = numW - 2;
  const border = (left, mid, right) =>
    `${left}${'─'.repeat(labelW)}${mid}${'─'.repeat(numW)}${mid}${'─'.repeat(numW)}${right}`;

  const base = rows[0].tokens;
  const lines = [
    `\nToken audit: ${filePath}\n`,
    border('┌', '┬', '┐'),
    `│ ${'Format'.padEnd(labelW - 2)} │ ${'Tokens'.padStart(cellW)} │ ${'Delta'.padStart(cellW)} │`,
    border('├', '┼', '┤'),
  ];

  rows.forEach((row, index) => {
    const label = row.label.padEnd(labelW - 2);
    const tok = fmt(row.tokens);
    // The baseline row has no delta. Every delta cell is re-padded to the
    // column width so a short marker such as a bare em dash cannot shift the
    // right-hand border out of alignment.
    const rawDelta = index === 0 ? '—' : delta(base, row.tokens);
    const d = rawDelta.trim().padStart(cellW);
    lines.push(`│ ${label} │ ${tok} │ ${d} │`);
  });

  lines.push(border('└', '┴', '┘'), '');
  console.log(lines.join('\n'));
}
|
package/src/optimizer.js
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Token optimizer for HTML output.
|
|
3
|
+
*
|
|
4
|
+
* Goal: reduce token count of HTML without changing rendered output.
|
|
5
|
+
*
|
|
6
|
+
* Techniques (in pipeline order):
|
|
7
|
+
* 1. Strip HTML comments
|
|
8
|
+
* 2. Strip <script>/<style> block content (agent context never needs them)
|
|
9
|
+
 * 3. Collapse inter-tag whitespace (currently also applied inside <pre>; see known limitation below)
|
|
10
|
+
* 4. Extract repeated inline styles (3+ occurrences) to <style> block
|
|
11
|
+
*
|
|
12
|
+
* Known limitation: collapseWhitespace uses regex and does not distinguish
|
|
13
|
+
* <pre> from non-<pre> contexts. Whitespace inside <pre> blocks may be
|
|
14
|
+
* incorrectly collapsed. YAGNI for now — markshift context files rarely
|
|
15
|
+
* contain pre-formatted whitespace-sensitive content.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
/**
 * Run the whole optimization pipeline over an HTML string.
 * Steps are applied in a fixed order: comments are removed, script/style
 * blocks are stripped, whitespace is collapsed, and finally repeated inline
 * styles are extracted.
 * @param {string} html
 * @returns {string}
 */
export function optimizeHtml(html) {
  const pipeline = [removeComments, stripScriptStyle, collapseWhitespace, extractInlineStyles];
  return pipeline.reduce((markup, step) => step(markup), html);
}
|
|
30
|
+
|
|
31
|
+
/**
 * Delete every complete HTML comment (`<!-- ... -->`), including multi-line ones.
 * An unterminated `<!--` is left untouched.
 */
function removeComments(html) {
  const commentPattern = /<!--[\s\S]*?-->/g;
  return html.replace(commentPattern, '');
}
|
|
34
|
+
|
|
35
|
+
/**
 * Remove <script>…</script> and <style>…</style> elements, tags and content
 * alike. Matching is case-insensitive and non-greedy.
 */
function stripScriptStyle(html) {
  const blockPatterns = [
    /<script[^>]*>[\s\S]*?<\/script>/gi,
    /<style[^>]*>[\s\S]*?<\/style>/gi,
  ];
  let stripped = html;
  for (const pattern of blockPatterns) {
    stripped = stripped.replace(pattern, '');
  }
  return stripped;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Collapse insignificant whitespace in an HTML string.
 *
 * - Line breaks (with surrounding whitespace) that sit strictly between two
 *   tags are removed.
 * - A line break between text and a tag is reduced to a single space rather
 *   than removed, so adjacent words are not glued together
 *   ("hello\n<em>world</em>" must not become "hello<em>world</em>").
 * - Runs of two or more spaces/tabs become one space.
 *
 * Limitations: the function is regex-based and does not know about <pre> or
 * about inline vs. block elements, so whitespace inside <pre> and between two
 * adjacent inline tags is collapsed as well.
 * @param {string} html
 * @returns {string}
 */
function collapseWhitespace(html) {
  // Normalize CRLF → LF first so the patterns below work on Windows input.
  const normalized = html.replace(/\r\n/g, '\n');
  return normalized
    // tag ⏎ tag: purely inter-tag whitespace, drop it entirely
    .replace(/>[ \t]*\n\s*</g, '><')
    // tag ⏎ text: keep one space so the word boundary survives
    .replace(/>[ \t]*\n[ \t]*/g, '> ')
    // text ⏎ tag: same, on the other side
    .replace(/[ \t]*\n[ \t]*</g, ' <')
    .replace(/[ \t]{2,}/g, ' ');
}
|
|
49
|
+
|
|
50
|
+
/**
 * Extract repeated inline styles (3+ occurrences) into a <style> block.
 *
 * Every ` style="…"` value that appears at least three times is given a
 * generated class name (ms0, ms1, … in order of first appearance). In each
 * opening tag carrying such a style, the style attribute is removed and the
 * class is merged into the tag's existing `class` attribute — wherever that
 * attribute sits in the tag — or added as a new `class` attribute in the
 * style's place. The generated rules are injected before `</head>` when
 * present, otherwise prepended to the document.
 *
 * Assumes the class names ms0..msN are not already used by the document.
 * Tags whose attribute values contain a literal `<` or `>` are left as they are.
 * @param {string} html
 * @returns {string}
 */
export function extractInlineStyles(html) {
  // Count how often each distinct style declaration occurs.
  const freq = new Map();
  for (const [, declaration] of html.matchAll(/ style="([^"]+)"/g)) {
    freq.set(declaration, (freq.get(declaration) ?? 0) + 1);
  }

  // Only styles appearing 3+ times are worth a class.
  const toExtract = new Map();
  for (const [style, count] of freq) {
    if (count >= 3) toExtract.set(style, `ms${toExtract.size}`);
  }
  if (toExtract.size === 0) return html;

  const rules = [...toExtract.entries()]
    .map(([style, cls]) => `.${cls}{${style}}`)
    .join('');
  const styleBlock = `<style>${rules}</style>`;

  // Rewrite one opening tag at a time so the class attribute can be merged
  // regardless of whether it appears before or after the style attribute.
  // (A purely textual "replace style with class" would emit a second class
  // attribute whenever `class` follows `style`, and browsers ignore it.)
  let result = html.replace(/<[a-zA-Z][^<>]*>/g, (tag) => {
    const styleMatch = / style="([^"]+)"/.exec(tag);
    const cls = styleMatch ? toExtract.get(styleMatch[1]) : undefined;
    if (!cls) return tag;

    const head = tag.slice(0, styleMatch.index);
    const tail = tag.slice(styleMatch.index + styleMatch[0].length);
    const withoutStyle = head + tail;

    if (/ class="[^"]*"/.test(withoutStyle)) {
      return withoutStyle.replace(
        / class="([^"]*)"/,
        (_, existing) => ` class="${existing ? `${existing} ` : ''}${cls}"`,
      );
    }
    return `${head} class="${cls}"${tail}`;
  });

  // Inject the <style> block. Replacer functions are used so that `$`
  // sequences inside CSS values are inserted literally rather than being
  // interpreted as replacement patterns.
  if (result.includes('</head>')) {
    result = result.replace('</head>', () => `${styleBlock}</head>`);
  } else {
    result = styleBlock + result;
  }

  return result;
}
|
package/src/router.js
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
import { readFile, readdir, stat } from 'node:fs/promises';
|
|
2
|
+
import { basename, extname, join } from 'node:path';
|
|
3
|
+
import { htmlFromMd } from './transforms/toHtml.js';
|
|
4
|
+
import { mdFromHtml } from './transforms/toMd.js';
|
|
5
|
+
|
|
6
|
+
// Agent-loop filenames — always stay as MD
const AGENT_NAMES = new Set([
  'claude.md', 'agents.md', 'skill.md', 'gemini.md',
  'readme.md', 'changelog.md', 'contributing.md',
  'license.md', 'security.md', 'codeowners',
]);

// Directory names that signal agent context
const AGENT_PATHS = ['.claude/', 'prompts/', 'context/', 'skills/'];

// Directory names that signal human-facing context.
// Matched as whole path segments, at any depth — so `docs/x.md` and
// `pkg/docs/x.md` match, but `mydocs/x.md` does not.
const HUMAN_PATHS = ['deliverables/', 'docs/', 'reports/', 'output/', 'dist/'];

/**
 * True when `segment` (a directory name with trailing slash, e.g. 'docs/')
 * occurs as a complete path segment of the slash-normalized path.
 * @param {string} normalizedPath
 * @param {string} segment
 * @returns {boolean}
 */
function hasPathSegment(normalizedPath, segment) {
  return normalizedPath.startsWith(segment) || normalizedPath.includes(`/${segment}`);
}

/**
 * Detect the context type of a file.
 * Returns 'human' (→ HTML) or 'agent' (→ MD).
 * Terminal/plain-text output is handled by the explicit `to-text` command,
 * not by auto-detection (runtime context, not a file property).
 * Pure function — no file I/O, no side effects.
 *
 * Signals, in priority order:
 *   1. well-known agent filenames      → 'agent'
 *   2. agent directory in the path     → 'agent'
 *   3. .html / .htm extension          → 'human'
 *   4. human-facing directory in path  → 'human'
 *   5. Markdown containing an HTML tag → 'human'
 *   6. otherwise                       → 'agent'
 * @param {string} filePath
 * @param {string} content File content; null/undefined is treated as empty.
 * @returns {'human'|'agent'}
 */
export function detectContextType(filePath, content) {
  const text = content ?? '';
  // Normalize separators and case once, and derive the filename from the
  // normalized form so Windows-style paths are handled on every platform.
  const normalized = filePath.replace(/\\/g, '/').toLowerCase();
  const name = basename(normalized);
  const ext = extname(filePath).toLowerCase();

  // Agent-loop filenames
  if (AGENT_NAMES.has(name)) return 'agent';

  // Agent path signals
  if (AGENT_PATHS.some((segment) => hasPathSegment(normalized, segment))) return 'agent';

  // HTML extension → human-facing by definition
  if (ext === '.html' || ext === '.htm') return 'human';

  // Human-facing path signals
  if (HUMAN_PATHS.some((segment) => hasPathSegment(normalized, segment))) return 'human';

  // Content signal: MD file containing HTML tags → human-facing.
  // The tag name must be followed by whitespace, '/' or '>', so Markdown
  // autolinks such as <https://example.com> or <user@example.com> are not
  // mistaken for HTML.
  const isMarkdown = ext === '.md' || ext === '.markdown';
  if (isMarkdown && /<[a-z][a-z0-9-]*(\s[^>]*)?\/?>/i.test(text)) {
    return 'human';
  }

  // Default: MD stays as MD (agent context)
  return 'agent';
}
|
|
57
|
+
|
|
58
|
+
/**
 * Detect the context type for a file and write it to stdout in the best
 * format for that context.
 *
 * - 'human' context: HTML input is passed through unchanged, Markdown input
 *   is converted to HTML (written without a trailing newline).
 * - 'agent' context: Markdown input is passed through, HTML input is
 *   converted to Markdown (written with a trailing newline).
 *
 * Files whose extension is not .md/.markdown/.html/.htm are skipped with a
 * message on stderr; the extension is checked before the file is read, so
 * unsupported files are never opened.
 *
 * NOTE(review): a read failure terminates the whole process with exit code 1,
 * so the try/catch around route() in directory and watch callers never sees
 * it — confirm that is intended for long-running watch mode.
 * @param {string} filePath
 */
export async function route(filePath) {
  const ext = extname(filePath).toLowerCase();
  const isMarkdown = ext === '.md' || ext === '.markdown';
  const isHtml = ext === '.html' || ext === '.htm';

  if (!isMarkdown && !isHtml) {
    process.stderr.write(`[markshift] route: unsupported file type "${ext}", skipping ${filePath}\n`);
    return;
  }

  let content;
  try {
    content = await readFile(filePath, 'utf8');
  } catch (err) {
    process.stderr.write(`[markshift] error: ${err.message}\n`);
    process.exit(1);
  }

  if (detectContextType(filePath, content) === 'human') {
    // HTML input stays HTML; MD input converts to HTML
    process.stdout.write(isHtml ? content : htmlFromMd(content));
    return;
  }

  // agent → MD
  const md = isMarkdown ? content : mdFromHtml(content);
  process.stdout.write(md + '\n');
}
|
|
96
|
+
|
|
97
|
+
/**
 * Route every Markdown/HTML file found directly inside a directory, and —
 * when `opts.recursive` is set — inside its sub-directories too.
 * Dot-entries and `node_modules` are skipped. Failures on individual entries
 * are reported on stderr and do not stop the walk.
 * @param {string} dir
 * @param {{ recursive?: boolean }} opts
 */
export async function routeDir(dir, opts = {}) {
  let names;
  try {
    names = await readdir(dir);
  } catch (err) {
    process.stderr.write(`[markshift] error reading directory ${dir}: ${err.message}\n`);
    return;
  }

  const routable = new Set(['.md', '.markdown', '.html', '.htm']);

  for (const name of names) {
    const skipped = name.startsWith('.') || name === 'node_modules';
    if (skipped) continue;

    const fullPath = join(dir, name);

    let info;
    try {
      info = await stat(fullPath);
    } catch (err) {
      process.stderr.write(`[markshift] error: ${err.message}\n`);
      continue;
    }

    if (info.isDirectory()) {
      // Descend only when explicitly asked to.
      if (opts.recursive) await routeDir(fullPath, opts);
      continue;
    }

    if (!routable.has(extname(name).toLowerCase())) continue;

    try {
      await route(fullPath);
    } catch (err) {
      process.stderr.write(`[markshift] error routing ${fullPath}: ${err.message}\n`);
    }
  }
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
// Markdown → structured HTML transform
|
|
2
|
+
//
|
|
3
|
+
// --optimize-tokens mode:
|
|
4
|
+
// - Strips HTML comments and script/style blocks
|
|
5
|
+
// - Collapses inter-tag whitespace
|
|
6
|
+
// - Extracts repeated inline styles into a <style> block
|
|
7
|
+
//
|
|
8
|
+
// Output written to stdout by default.
|
|
9
|
+
|
|
10
|
+
import { readFile } from 'node:fs/promises';
|
|
11
|
+
import { marked } from 'marked';
|
|
12
|
+
import { optimizeHtml } from '../optimizer.js';
|
|
13
|
+
|
|
14
|
+
/**
 * Render a Markdown string to HTML with `marked`.
 * When `opts.optimizeTokens` is truthy the rendered HTML is additionally run
 * through `optimizeHtml`.
 * @param {string} content
 * @param {{ optimizeTokens?: boolean }} opts
 * @returns {string}
 */
export function htmlFromMd(content, opts = {}) {
  const rendered = marked(content);
  return opts.optimizeTokens ? optimizeHtml(rendered) : rendered;
}
|
|
26
|
+
|
|
27
|
+
/**
 * CLI entry for `to-html`: read the file, convert it, write the HTML to stdout.
 * Nothing is appended to the converted output.
 * On a read error the message goes to stderr and the process exits with code 1.
 * @param {string} filePath
 * @param {{ optimizeTokens?: boolean }} opts
 */
export async function toHtml(filePath, opts = {}) {
  const source = await readFile(filePath, 'utf8').catch((err) => {
    process.stderr.write(`[markshift] error: ${err.message}\n`);
    process.exit(1);
  });
  process.stdout.write(htmlFromMd(source, opts));
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
// HTML → clean Markdown transform
|
|
2
|
+
//
|
|
3
|
+
// Uses `turndown` with these options:
|
|
4
|
+
// - headingStyle: 'atx' (# H1, ## H2, ...)
|
|
5
|
+
// - codeBlockStyle: 'fenced' (``` fences)
|
|
6
|
+
// - bulletListMarker: '-'
|
|
7
|
+
//
|
|
8
|
+
// Post-processing:
|
|
9
|
+
// - Collapse runs of 3+ newlines → 2 (at most one blank line in a row)
|
|
10
|
+
// - Trim trailing whitespace per line
|
|
11
|
+
// - Strip HTML comments
|
|
12
|
+
//
|
|
13
|
+
// Output written to stdout.
|
|
14
|
+
|
|
15
|
+
import { readFile } from 'node:fs/promises';
|
|
16
|
+
import TurndownService from 'turndown';
|
|
17
|
+
|
|
18
|
+
// Shared converter instance: ATX headings, fenced code blocks, '-' bullets.
const td = new TurndownService({
  headingStyle: 'atx',
  codeBlockStyle: 'fenced',
  bulletListMarker: '-',
});

/**
 * Pure transform: HTML string → clean Markdown string.
 *
 * HTML comments are removed before conversion. The converted Markdown is then
 * tidied: trailing spaces/tabs are trimmed from every line, and runs of three
 * or more newlines are collapsed to two (at most one blank line in a row).
 * @param {string} html
 * @returns {string}
 */
export function mdFromHtml(html) {
  // Strip HTML comments before turndown sees them
  const withoutComments = html.replace(/<!--[\s\S]*?-->/g, '');
  return td
    .turndown(withoutComments)
    // Trim first: a line holding only spaces becomes a blank line, and must be
    // counted by the blank-line collapse that follows. Collapsing first would
    // leave runs of 3+ newlines behind.
    .replace(/[ \t]+$/gm, '')
    .replace(/\n{3,}/g, '\n\n');
}
|
|
37
|
+
|
|
38
|
+
/**
 * CLI entry for `to-md`: read the file, convert it to Markdown and write the
 * result to stdout followed by a single newline.
 * On a read error the message goes to stderr and the process exits with code 1.
 * @param {string} filePath
 */
export async function toMd(filePath) {
  const source = await readFile(filePath, 'utf8').catch((err) => {
    process.stderr.write(`[markshift] error: ${err.message}\n`);
    process.exit(1);
  });
  const markdown = mdFromHtml(source);
  process.stdout.write(markdown + '\n');
}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import { readFile } from 'node:fs/promises';
|
|
2
|
+
import { extname } from 'node:path';
|
|
3
|
+
import { marked } from 'marked';
|
|
4
|
+
|
|
5
|
+
// Named (and one numeric) entities decoded by direct lookup.
// Keys are the entity as it appears in the markup, including '&' and ';'.
// &nbsp; maps to a regular space because the output is meant for terminals.
const ENTITIES = {
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&#39;': "'",
  '&nbsp;': ' ',
};

/**
 * Decode HTML entities to their character equivalents.
 *
 * Handles the entities listed in ENTITIES plus decimal (`&#8212;`) and
 * hexadecimal (`&#x2014;`) character references. Anything else — unknown
 * names, malformed or out-of-range references — is left exactly as written.
 * Decoding is a single pass, so `&amp;lt;` becomes `&lt;`, not `<`.
 * @param {string} str
 * @returns {string}
 */
function decodeEntities(str) {
  return str.replace(/&[a-z#0-9]+;/gi, (entity) => {
    if (Object.hasOwn(ENTITIES, entity)) return ENTITIES[entity];

    const numeric = /^&#(?:x([0-9a-f]+)|([0-9]+));$/i.exec(entity);
    if (!numeric) return entity;

    const codePoint = numeric[1] !== undefined
      ? Number.parseInt(numeric[1], 16)
      : Number.parseInt(numeric[2], 10);
    // Reject NUL and values beyond the Unicode range (fromCodePoint would throw).
    if (!(codePoint > 0 && codePoint <= 0x10ffff)) return entity;
    // Keep numeric no-break spaces consistent with &nbsp;.
    if (codePoint === 0xa0) return ' ';
    return String.fromCodePoint(codePoint);
  });
}
|
|
22
|
+
|
|
23
|
+
/**
 * Wrap text so that no line is longer than `width` characters.
 * Existing line breaks are kept; lines that already fit are returned as-is.
 * Longer lines are split on single spaces, and any word that is itself longer
 * than `width` is cut into `width`-sized pieces.
 * @param {string} text
 * @param {number} width
 * @returns {string}
 */
function wordWrap(text, width = 80) {
  const wrapLine = (line) => {
    if (line.length <= width) return line;

    const wrapped = [];
    let current = '';
    for (const word of line.split(' ')) {
      // Cut over-long words into pieces of at most `width` characters.
      const pieces = word.match(new RegExp(`.{1,${width}}`, 'g')) ?? [word];
      for (const piece of pieces) {
        const candidate = current ? `${current} ${piece}` : piece;
        if (candidate.length <= width) {
          current = candidate;
          continue;
        }
        // The piece does not fit: flush the current line and start a new one.
        if (current) wrapped.push(current);
        current = piece;
      }
    }
    if (current) wrapped.push(current);
    return wrapped.join('\n');
  };

  return text.split('\n').map(wrapLine).join('\n');
}
|
|
49
|
+
|
|
50
|
+
/**
 * Pure transform: HTML or Markdown string → terminal-safe plain text.
 *
 * Markdown input ('.md' / '.markdown') is first rendered to HTML. Then
 * script/style blocks and all tags are replaced by a space, entities are
 * decoded, whitespace is normalized and the result is wrapped to 80 columns.
 *
 * Whitespace normalization: runs of spaces/tabs become one space, every line
 * is trimmed at both ends (CRLF line endings become LF), runs of 3+ newlines
 * collapse to 2, and the whole text is trimmed.
 * @param {string} content
 * @param {string} ext File extension including dot, e.g. '.md' or '.html'
 * @returns {string}
 */
export function textFrom(content, ext) {
  // Convert Markdown to HTML if needed
  const isMarkdown = ext === '.md' || ext === '.markdown';
  const html = isMarkdown ? marked(content) : content;

  const withoutTags = html
    // Strip script/style blocks (content would otherwise appear as plain text)
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, ' ')
    // Strip the remaining tags; a space keeps neighbouring words apart
    .replace(/<[^>]+>/g, ' ');

  const text = decodeEntities(withoutTags)
    .replace(/[ \t]+/g, ' ')
    // Trim each line. Tags were replaced by spaces, so without this step lines
    // begin/end with a stray space and "blank" lines holding a single space
    // escape the newline collapse below.
    .replace(/ ?\r?\n ?/g, '\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();

  // Word-wrap to 80 columns
  return wordWrap(text);
}
|
|
81
|
+
|
|
82
|
+
/**
 * CLI entry for `to-text`: read the file, convert it to plain text according
 * to its (lower-cased) extension, and write the result to stdout followed by
 * a newline.
 * On a read error the message goes to stderr and the process exits with code 1.
 * @param {string} filePath
 */
export async function toText(filePath) {
  const source = await readFile(filePath, 'utf8').catch((err) => {
    process.stderr.write(`[markshift] error: ${err.message}\n`);
    process.exit(1);
  });
  const plain = textFrom(source, extname(filePath).toLowerCase());
  process.stdout.write(plain + '\n');
}
|
package/src/watcher.js
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
// Directory watcher — auto-routes files on save
|
|
2
|
+
//
|
|
3
|
+
// Uses `chokidar` (cross-platform fs watch with debounce).
|
|
4
|
+
//
|
|
5
|
+
// Behavior:
|
|
6
|
+
// - Watches *.md and *.html files in the target directory (recursive)
|
|
7
|
+
// - On change: calls router.route(filePath)
|
|
8
|
+
// - Debounce: 300ms (avoid double-fire on editor atomic saves)
|
|
9
|
+
// - Prints a log line per event: [HH:MM:SS] routed path/to/file
|
|
10
|
+
// - Ignores: node_modules, .git, hidden files
|
|
11
|
+
//
|
|
12
|
+
// Runs until SIGINT (Ctrl+C).
|
|
13
|
+
|
|
14
|
+
import chokidar from 'chokidar';
|
|
15
|
+
import { route } from './router.js';
|
|
16
|
+
|
|
17
|
+
/**
 * Watch a directory and auto-route files on save.
 * Logs events to stderr (not stdout) so transform output stays clean for piping.
 *
 * Change events are debounced per file (300ms) so editors that save in
 * several steps trigger a single route() call. The watcher runs until SIGINT,
 * at which point pending timers are cancelled and the watcher is closed.
 *
 * Ignored paths: `node_modules` and `.git` directories, and any path segment
 * that starts with a dot followed by a name character. The patterns are
 * anchored to whole path segments so that `..` (as in `watch ../project`) and
 * names that merely contain ".git" (e.g. `user.github.io`) are not ignored.
 * @param {string} dir
 * @param {{ ext?: string }} opts ext = comma-separated extensions, e.g. 'md,html'
 */
export async function watch(dir, opts = {}) {
  const extensions = opts.ext
    ? opts.ext
      .split(',')
      .map((e) => e.trim().replace(/^\./, ''))
      // Drop empty entries from input such as "md," or "md,,html".
      .filter(Boolean)
    : ['md', 'html'];

  const normalizedDir = dir.replace(/\\/g, '/');
  const patterns = extensions.map((e) => `${normalizedDir}/**/*.${e}`);

  const watcher = chokidar.watch(patterns, {
    ignored: [
      /(^|[/\\])node_modules([/\\]|$)/,
      /(^|[/\\])\.git([/\\]|$)/,
      // Hidden files/directories: a segment starting with '.' followed by a
      // character that is not '.', '/' or '\'. This excludes '.' and '..'.
      /(^|[/\\])\.[^./\\]/,
    ],
    persistent: true,
    ignoreInitial: true,
  });

  // Pending debounce timers, keyed by file path.
  const debounceMap = new Map();

  watcher.on('change', (filePath) => {
    if (debounceMap.has(filePath)) clearTimeout(debounceMap.get(filePath));
    debounceMap.set(filePath, setTimeout(async () => {
      debounceMap.delete(filePath);
      const time = new Date().toTimeString().slice(0, 8);
      try {
        await route(filePath);
        process.stderr.write(`[${time}] routed ${filePath}\n`);
      } catch (err) {
        process.stderr.write(`[${time}] error ${filePath} — ${err.message}\n`);
      }
    }, 300));
  });

  process.once('SIGINT', () => {
    for (const id of debounceMap.values()) clearTimeout(id);
    debounceMap.clear();
    watcher.close().then(() => process.exit(0));
  });

  process.stderr.write(`watching ${normalizedDir} for ${extensions.map(e => `*.${e}`).join(', ')} changes...\n`);
}
|