into-md 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/CLAUDE.md +251 -0
- package/.claude/settings.json +15 -0
- package/.claude/settings.local.json +9 -0
- package/.cursor/hooks.json +10 -0
- package/.vscode/settings.json +53 -0
- package/AGENTS.md +284 -0
- package/CLAUDE.md +111 -0
- package/GEMINI.md +123 -0
- package/README.md +133 -0
- package/biome.jsonc +4 -0
- package/bun.lock +413 -0
- package/dist/index.d.mts +3 -0
- package/dist/index.mjs +446 -0
- package/dist/index.mjs.map +1 -0
- package/docs/SPEC.md +201 -0
- package/package.json +39 -0
- package/src/cache.ts +79 -0
- package/src/converter.ts +96 -0
- package/src/extractor.ts +85 -0
- package/src/fetcher.ts +236 -0
- package/src/images.ts +27 -0
- package/src/index.ts +143 -0
- package/src/metadata.ts +30 -0
- package/src/tables.ts +80 -0
- package/src/types/jsdom.d.ts +10 -0
- package/src/utils.ts +28 -0
- package/tsconfig.json +29 -0
- package/tsdown.config.ts +14 -0
package/CLAUDE.md
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
---
|
|
2
|
+
description: Use Bun instead of Node.js, npm, pnpm, or vite.
|
|
3
|
+
globs: "*.ts, *.tsx, *.html, *.css, *.js, *.jsx, package.json"
|
|
4
|
+
alwaysApply: false
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
Default to using Bun instead of Node.js.
|
|
8
|
+
|
|
9
|
+
- Use `bun <file>` instead of `node <file>` or `ts-node <file>`
|
|
10
|
+
- Use `bun test` instead of `jest` or `vitest`
|
|
11
|
+
- Use `bun build <file.html|file.ts|file.css>` instead of `webpack` or `esbuild`
|
|
12
|
+
- Use `bun install` instead of `npm install` or `yarn install` or `pnpm install`
|
|
13
|
+
- Use `bun run <script>` instead of `npm run <script>` or `yarn run <script>` or `pnpm run <script>`
|
|
14
|
+
- Use `bunx <package> <command>` instead of `npx <package> <command>`
|
|
15
|
+
- Bun automatically loads .env, so don't use dotenv.
|
|
16
|
+
|
|
17
|
+
## APIs
|
|
18
|
+
|
|
19
|
+
- `Bun.serve()` supports WebSockets, HTTPS, and routes. Don't use `express`.
|
|
20
|
+
- `bun:sqlite` for SQLite. Don't use `better-sqlite3`.
|
|
21
|
+
- `Bun.redis` for Redis. Don't use `ioredis`.
|
|
22
|
+
- `Bun.sql` for Postgres. Don't use `pg` or `postgres.js`.
|
|
23
|
+
- `WebSocket` is built-in. Don't use `ws`.
|
|
24
|
+
- Prefer `Bun.file` over `node:fs`'s readFile/writeFile
|
|
25
|
+
- Use `` Bun.$`ls` `` instead of `execa`.
|
|
26
|
+
|
|
27
|
+
## Testing
|
|
28
|
+
|
|
29
|
+
Use `bun test` to run tests.
|
|
30
|
+
|
|
31
|
+
```ts#index.test.ts
|
|
32
|
+
import { test, expect } from "bun:test";
|
|
33
|
+
|
|
34
|
+
test("hello world", () => {
|
|
35
|
+
expect(1).toBe(1);
|
|
36
|
+
});
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Frontend
|
|
40
|
+
|
|
41
|
+
Use HTML imports with `Bun.serve()`. Don't use `vite`. HTML imports fully support React, CSS, Tailwind.
|
|
42
|
+
|
|
43
|
+
Server:
|
|
44
|
+
|
|
45
|
+
```ts#index.ts
|
|
46
|
+
import index from "./index.html"
|
|
47
|
+
|
|
48
|
+
Bun.serve({
|
|
49
|
+
routes: {
|
|
50
|
+
"/": index,
|
|
51
|
+
"/api/users/:id": {
|
|
52
|
+
GET: (req) => {
|
|
53
|
+
return new Response(JSON.stringify({ id: req.params.id }));
|
|
54
|
+
},
|
|
55
|
+
},
|
|
56
|
+
},
|
|
57
|
+
// optional websocket support
|
|
58
|
+
websocket: {
|
|
59
|
+
open: (ws) => {
|
|
60
|
+
ws.send("Hello, world!");
|
|
61
|
+
},
|
|
62
|
+
message: (ws, message) => {
|
|
63
|
+
ws.send(message);
|
|
64
|
+
},
|
|
65
|
+
close: (ws) => {
|
|
66
|
+
// handle close
|
|
67
|
+
}
|
|
68
|
+
},
|
|
69
|
+
development: {
|
|
70
|
+
hmr: true,
|
|
71
|
+
console: true,
|
|
72
|
+
}
|
|
73
|
+
})
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
HTML files can import .tsx, .jsx or .js files directly and Bun's bundler will transpile & bundle automatically. `<link>` tags can point to stylesheets and Bun's CSS bundler will bundle.
|
|
77
|
+
|
|
78
|
+
```html#index.html
|
|
79
|
+
<html>
|
|
80
|
+
<body>
|
|
81
|
+
<h1>Hello, world!</h1>
|
|
82
|
+
<script type="module" src="./frontend.tsx"></script>
|
|
83
|
+
</body>
|
|
84
|
+
</html>
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
With the following `frontend.tsx`:
|
|
88
|
+
|
|
89
|
+
```tsx#frontend.tsx
|
|
90
|
+
import React from "react";
|
|
91
|
+
import { createRoot } from "react-dom/client";
|
|
92
|
+
|
|
93
|
+
// import .css files directly and it works
|
|
94
|
+
import './index.css';
|
|
95
|
+
|
|
96
|
+
const root = createRoot(document.body);
|
|
97
|
+
|
|
98
|
+
export default function Frontend() {
|
|
99
|
+
return <h1>Hello, world!</h1>;
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
root.render(<Frontend />);
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Then, run index.ts
|
|
106
|
+
|
|
107
|
+
```sh
|
|
108
|
+
bun --hot ./index.ts
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
For more information, read the Bun API docs in `node_modules/bun-types/docs/**/*.mdx`.
|
package/GEMINI.md
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# Ultracite Code Standards
|
|
2
|
+
|
|
3
|
+
This project uses **Ultracite**, a zero-config preset that enforces strict code quality standards through automated formatting and linting.
|
|
4
|
+
|
|
5
|
+
## Quick Reference
|
|
6
|
+
|
|
7
|
+
- **Format code**: `bunx ultracite fix`
|
|
8
|
+
- **Check for issues**: `bunx ultracite check`
|
|
9
|
+
- **Diagnose setup**: `bunx ultracite doctor`
|
|
10
|
+
|
|
11
|
+
Biome (the underlying engine) provides robust linting and formatting. Most issues are automatically fixable.
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
## Core Principles
|
|
16
|
+
|
|
17
|
+
Write code that is **accessible, performant, type-safe, and maintainable**. Focus on clarity and explicit intent over brevity.
|
|
18
|
+
|
|
19
|
+
### Type Safety & Explicitness
|
|
20
|
+
|
|
21
|
+
- Use explicit types for function parameters and return values when they enhance clarity
|
|
22
|
+
- Prefer `unknown` over `any` when the type is genuinely unknown
|
|
23
|
+
- Use const assertions (`as const`) for immutable values and literal types
|
|
24
|
+
- Leverage TypeScript's type narrowing instead of type assertions
|
|
25
|
+
- Use meaningful variable names instead of magic numbers - extract constants with descriptive names
|
|
26
|
+
|
|
27
|
+
### Modern JavaScript/TypeScript
|
|
28
|
+
|
|
29
|
+
- Use arrow functions for callbacks and short functions
|
|
30
|
+
- Prefer `for...of` loops over `.forEach()` and indexed `for` loops
|
|
31
|
+
- Use optional chaining (`?.`) and nullish coalescing (`??`) for safer property access
|
|
32
|
+
- Prefer template literals over string concatenation
|
|
33
|
+
- Use destructuring for object and array assignments
|
|
34
|
+
- Use `const` by default, `let` only when reassignment is needed, never `var`
|
|
35
|
+
|
|
36
|
+
### Async & Promises
|
|
37
|
+
|
|
38
|
+
- Always `await` promises in async functions - don't forget to use the return value
|
|
39
|
+
- Use `async/await` syntax instead of promise chains for better readability
|
|
40
|
+
- Handle errors appropriately in async code with try-catch blocks
|
|
41
|
+
- Don't use async functions as Promise executors
|
|
42
|
+
|
|
43
|
+
### React & JSX
|
|
44
|
+
|
|
45
|
+
- Use function components over class components
|
|
46
|
+
- Call hooks at the top level only, never conditionally
|
|
47
|
+
- Specify all dependencies in hook dependency arrays correctly
|
|
48
|
+
- Use the `key` prop for elements in iterables (prefer unique IDs over array indices)
|
|
49
|
+
- Nest children between opening and closing tags instead of passing as props
|
|
50
|
+
- Don't define components inside other components
|
|
51
|
+
- Use semantic HTML and ARIA attributes for accessibility:
|
|
52
|
+
- Provide meaningful alt text for images
|
|
53
|
+
- Use proper heading hierarchy
|
|
54
|
+
- Add labels for form inputs
|
|
55
|
+
- Include keyboard event handlers alongside mouse events
|
|
56
|
+
- Use semantic elements (`<button>`, `<nav>`, etc.) instead of divs with roles
|
|
57
|
+
|
|
58
|
+
### Error Handling & Debugging
|
|
59
|
+
|
|
60
|
+
- Remove `console.log`, `debugger`, and `alert` statements from production code
|
|
61
|
+
- Throw `Error` objects with descriptive messages, not strings or other values
|
|
62
|
+
- Use `try-catch` blocks meaningfully - don't catch errors just to rethrow them
|
|
63
|
+
- Prefer early returns over nested conditionals for error cases
|
|
64
|
+
|
|
65
|
+
### Code Organization
|
|
66
|
+
|
|
67
|
+
- Keep functions focused and under reasonable cognitive complexity limits
|
|
68
|
+
- Extract complex conditions into well-named boolean variables
|
|
69
|
+
- Use early returns to reduce nesting
|
|
70
|
+
- Prefer simple conditionals over nested ternary operators
|
|
71
|
+
- Group related code together and separate concerns
|
|
72
|
+
|
|
73
|
+
### Security
|
|
74
|
+
|
|
75
|
+
- Add `rel="noopener"` when using `target="_blank"` on links
|
|
76
|
+
- Avoid `dangerouslySetInnerHTML` unless absolutely necessary
|
|
77
|
+
- Don't use `eval()` or assign directly to `document.cookie`
|
|
78
|
+
- Validate and sanitize user input
|
|
79
|
+
|
|
80
|
+
### Performance
|
|
81
|
+
|
|
82
|
+
- Avoid spread syntax in accumulators within loops
|
|
83
|
+
- Use top-level regex literals instead of creating them in loops
|
|
84
|
+
- Prefer specific imports over namespace imports
|
|
85
|
+
- Avoid barrel files (index files that re-export everything)
|
|
86
|
+
- Use proper image components (e.g., Next.js `<Image>`) over `<img>` tags
|
|
87
|
+
|
|
88
|
+
### Framework-Specific Guidance
|
|
89
|
+
|
|
90
|
+
**Next.js:**
|
|
91
|
+
- Use Next.js `<Image>` component for images
|
|
92
|
+
- Use `next/head` or App Router metadata API for head elements
|
|
93
|
+
- Use Server Components for async data fetching instead of async Client Components
|
|
94
|
+
|
|
95
|
+
**React 19+:**
|
|
96
|
+
- Use ref as a prop instead of `React.forwardRef`
|
|
97
|
+
|
|
98
|
+
**Solid/Svelte/Vue/Qwik:**
|
|
99
|
+
- Use `class` and `for` attributes (not `className` or `htmlFor`)
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## Testing
|
|
104
|
+
|
|
105
|
+
- Write assertions inside `it()` or `test()` blocks
|
|
106
|
+
- Avoid done callbacks in async tests - use async/await instead
|
|
107
|
+
- Don't use `.only` or `.skip` in committed code
|
|
108
|
+
- Keep test suites reasonably flat - avoid excessive `describe` nesting
|
|
109
|
+
|
|
110
|
+
## When Biome Can't Help
|
|
111
|
+
|
|
112
|
+
Biome's linter will catch most issues automatically. Focus your attention on:
|
|
113
|
+
|
|
114
|
+
1. **Business logic correctness** - Biome can't validate your algorithms
|
|
115
|
+
2. **Meaningful naming** - Use descriptive names for functions, variables, and types
|
|
116
|
+
3. **Architecture decisions** - Component structure, data flow, and API design
|
|
117
|
+
4. **Edge cases** - Handle boundary conditions and error states
|
|
118
|
+
5. **User experience** - Accessibility, performance, and usability considerations
|
|
119
|
+
6. **Documentation** - Add comments for complex logic, but prefer self-documenting code
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
Most formatting and common issues are automatically fixed by Biome. Run `bunx ultracite fix` before committing to ensure compliance.
|
package/README.md
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
# into-md
|
|
2
|
+
|
|
3
|
+
A CLI tool that fetches web pages and converts them to clean markdown, optimized for providing context to LLMs.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
# Local development
|
|
9
|
+
bun install
|
|
10
|
+
|
|
11
|
+
# Global install (from npm registry)
|
|
12
|
+
bun add -g into-md
|
|
13
|
+
# or
|
|
14
|
+
npm install -g into-md
|
|
15
|
+
# or
|
|
16
|
+
yarn global add into-md
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Usage
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
# If installed globally
|
|
23
|
+
into-md <url>
|
|
24
|
+
|
|
25
|
+
# If installed locally
|
|
26
|
+
bun run into-md <url>
|
|
27
|
+
```
|
|
28
|
+
### Examples
|
|
29
|
+
```bash
|
|
30
|
+
# Use headless browser for JS-rendered content
|
|
31
|
+
into-md https://spa-site.com/page --js
|
|
32
|
+
|
|
33
|
+
# Skip content extraction, convert full page
|
|
34
|
+
into-md https://example.com --raw
|
|
35
|
+
|
|
36
|
+
# With authentication cookies
|
|
37
|
+
into-md https://private-site.com/page --cookies cookies.txt
|
|
38
|
+
|
|
39
|
+
# Verbose output
|
|
40
|
+
into-md https://example.com/article -v
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Options
|
|
44
|
+
|
|
45
|
+
| Flag | Description | Default |
|
|
46
|
+
| ----------------------- | --------------------------------------------------------- | --------------- |
|
|
47
|
+
| `-o, --output <file>` | Write output to file instead of stdout | stdout |
|
|
48
|
+
| `--js` | Use headless browser (Playwright) for JS-rendered content | disabled |
|
|
49
|
+
| `--raw` | Skip content extraction, convert entire HTML | disabled |
|
|
50
|
+
| `--cookies <file>` | Path to cookies file for authenticated requests | none |
|
|
51
|
+
| `--user-agent <string>` | Custom User-Agent header | browser-like UA |
|
|
52
|
+
| `--encoding <encoding>` | Force character encoding (auto-detected by default) | auto |
|
|
53
|
+
| `--strip-links` | Remove hyperlinks, keep only anchor text | disabled |
|
|
54
|
+
| `--exclude <selectors>` | CSS selectors to exclude (comma-separated) | none |
|
|
55
|
+
| `--timeout <ms>` | Request timeout in milliseconds | 30000 |
|
|
56
|
+
| `--no-cache` | Bypass response cache | cache enabled |
|
|
57
|
+
| `-v, --verbose` | Show detailed progress information | minimal |
|
|
58
|
+
| `-h, --help` | Show help | - |
|
|
59
|
+
| `--version` | Show version | - |
|
|
60
|
+
|
|
61
|
+
## Output Format
|
|
62
|
+
|
|
63
|
+
### Frontmatter
|
|
64
|
+
|
|
65
|
+
Standard metadata is included as YAML frontmatter:
|
|
66
|
+
|
|
67
|
+
```yaml
|
|
68
|
+
---
|
|
69
|
+
title: "Article Title"
|
|
70
|
+
description: "Meta description from the page"
|
|
71
|
+
author: "Author Name"
|
|
72
|
+
date: "2024-01-15"
|
|
73
|
+
source: "https://example.com/article"
|
|
74
|
+
---
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Tables
|
|
78
|
+
|
|
79
|
+
Tables are converted to fenced JSON blocks for reliable LLM parsing:
|
|
80
|
+
|
|
81
|
+
```json
|
|
82
|
+
{
|
|
83
|
+
"caption": "Quarterly Revenue",
|
|
84
|
+
"headers": ["Quarter", "Revenue", "Growth"],
|
|
85
|
+
"rows": [
|
|
86
|
+
{ "Quarter": "Q1", "Revenue": "$1.2M", "Growth": "12%" },
|
|
87
|
+
{ "Quarter": "Q2", "Revenue": "$1.5M", "Growth": "25%" }
|
|
88
|
+
]
|
|
89
|
+
}
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Development
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
bun run build # Build the CLI
|
|
96
|
+
bun run build:watch # Build with watch mode
|
|
97
|
+
bun run test # Run tests
|
|
98
|
+
bun run lint # Check for lint errors
|
|
99
|
+
bun run fix # Auto-fix lint errors
|
|
100
|
+
bun run typecheck # Type check
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Technical Stack
|
|
104
|
+
|
|
105
|
+
- **Runtime**: Bun
|
|
106
|
+
- **Language**: TypeScript
|
|
107
|
+
- **HTML Parsing**: cheerio
|
|
108
|
+
- **Markdown Conversion**: turndown
|
|
109
|
+
- **Content Extraction**: @mozilla/readability
|
|
110
|
+
- **Headless Browser**: playwright (for `--js` mode)
|
|
111
|
+
- **CLI Framework**: commander
|
|
112
|
+
|
|
113
|
+
## Project Structure
|
|
114
|
+
|
|
115
|
+
```
|
|
116
|
+
into-md/
|
|
117
|
+
├── src/
|
|
118
|
+
│ ├── index.ts # CLI entry point
|
|
119
|
+
│ ├── fetcher.ts # URL fetching (static + headless)
|
|
120
|
+
│ ├── extractor.ts # Content extraction with readability
|
|
121
|
+
│ ├── converter.ts # HTML to markdown conversion
|
|
122
|
+
│ ├── tables.ts # Table to JSON conversion
|
|
123
|
+
│ ├── images.ts # Image context extraction
|
|
124
|
+
│ ├── metadata.ts # Frontmatter generation
|
|
125
|
+
│ └── cache.ts # Response caching
|
|
126
|
+
├── docs/
|
|
127
|
+
│ └── SPEC.md # Full specification
|
|
128
|
+
└── package.json
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## License
|
|
132
|
+
|
|
133
|
+
MIT
|
package/biome.jsonc
ADDED