@mz1999/defuddle 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +371 -0
  3. package/dist/cli.d.ts +2 -0
  4. package/dist/cli.js +145 -0
  5. package/dist/cli.js.map +1 -0
  6. package/dist/constants.d.ts +24 -0
  7. package/dist/constants.js +950 -0
  8. package/dist/constants.js.map +1 -0
  9. package/dist/defuddle.d.ts +136 -0
  10. package/dist/defuddle.js +1816 -0
  11. package/dist/defuddle.js.map +1 -0
  12. package/dist/elements/callouts.d.ts +6 -0
  13. package/dist/elements/callouts.js +74 -0
  14. package/dist/elements/callouts.js.map +1 -0
  15. package/dist/elements/code.d.ts +5 -0
  16. package/dist/elements/code.js +346 -0
  17. package/dist/elements/code.js.map +1 -0
  18. package/dist/elements/footnotes.d.ts +5 -0
  19. package/dist/elements/footnotes.js +619 -0
  20. package/dist/elements/footnotes.js.map +1 -0
  21. package/dist/elements/headings.d.ts +11 -0
  22. package/dist/elements/headings.js +100 -0
  23. package/dist/elements/headings.js.map +1 -0
  24. package/dist/elements/images.d.ts +8 -0
  25. package/dist/elements/images.js +877 -0
  26. package/dist/elements/images.js.map +1 -0
  27. package/dist/elements/math.base.d.ts +9 -0
  28. package/dist/elements/math.base.js +195 -0
  29. package/dist/elements/math.base.js.map +1 -0
  30. package/dist/elements/math.core.d.ts +7 -0
  31. package/dist/elements/math.core.js +52 -0
  32. package/dist/elements/math.core.js.map +1 -0
  33. package/dist/elements/math.d.ts +2 -0
  34. package/dist/elements/math.full.d.ts +8 -0
  35. package/dist/elements/math.js +7 -0
  36. package/dist/elements/math.js.map +1 -0
  37. package/dist/extractor-registry.d.ts +16 -0
  38. package/dist/extractor-registry.js +140 -0
  39. package/dist/extractor-registry.js.map +1 -0
  40. package/dist/extractors/_base.d.ts +22 -0
  41. package/dist/extractors/_base.js +27 -0
  42. package/dist/extractors/_base.js.map +1 -0
  43. package/dist/extractors/_conversation.d.ts +9 -0
  44. package/dist/extractors/_conversation.js +78 -0
  45. package/dist/extractors/_conversation.js.map +1 -0
  46. package/dist/extractors/chatgpt.d.ts +14 -0
  47. package/dist/extractors/chatgpt.js +138 -0
  48. package/dist/extractors/chatgpt.js.map +1 -0
  49. package/dist/extractors/claude.d.ts +10 -0
  50. package/dist/extractors/claude.js +91 -0
  51. package/dist/extractors/claude.js.map +1 -0
  52. package/dist/extractors/gemini.d.ts +14 -0
  53. package/dist/extractors/gemini.js +111 -0
  54. package/dist/extractors/gemini.js.map +1 -0
  55. package/dist/extractors/github.d.ts +20 -0
  56. package/dist/extractors/github.js +251 -0
  57. package/dist/extractors/github.js.map +1 -0
  58. package/dist/extractors/grok.d.ts +15 -0
  59. package/dist/extractors/grok.js +142 -0
  60. package/dist/extractors/grok.js.map +1 -0
  61. package/dist/extractors/hackernews.d.ts +21 -0
  62. package/dist/extractors/hackernews.js +155 -0
  63. package/dist/extractors/hackernews.js.map +1 -0
  64. package/dist/extractors/reddit.d.ts +22 -0
  65. package/dist/extractors/reddit.js +197 -0
  66. package/dist/extractors/reddit.js.map +1 -0
  67. package/dist/extractors/twitter.d.ts +16 -0
  68. package/dist/extractors/twitter.js +204 -0
  69. package/dist/extractors/twitter.js.map +1 -0
  70. package/dist/extractors/x-article.d.ts +24 -0
  71. package/dist/extractors/x-article.js +267 -0
  72. package/dist/extractors/x-article.js.map +1 -0
  73. package/dist/extractors/x-oembed.d.ts +20 -0
  74. package/dist/extractors/x-oembed.js +350 -0
  75. package/dist/extractors/x-oembed.js.map +1 -0
  76. package/dist/extractors/youtube.d.ts +87 -0
  77. package/dist/extractors/youtube.js +869 -0
  78. package/dist/extractors/youtube.js.map +1 -0
  79. package/dist/fetch.d.ts +18 -0
  80. package/dist/fetch.js +265 -0
  81. package/dist/fetch.js.map +1 -0
  82. package/dist/index.d.ts +3 -0
  83. package/dist/index.full.d.ts +12 -0
  84. package/dist/index.full.js +1 -0
  85. package/dist/index.js +1 -0
  86. package/dist/index.js.map +1 -0
  87. package/dist/markdown.d.ts +30 -0
  88. package/dist/markdown.js +661 -0
  89. package/dist/markdown.js.map +1 -0
  90. package/dist/metadata.d.ts +25 -0
  91. package/dist/metadata.js +426 -0
  92. package/dist/metadata.js.map +1 -0
  93. package/dist/node.d.ts +19 -0
  94. package/dist/node.js +78 -0
  95. package/dist/node.js.map +1 -0
  96. package/dist/scoring.d.ts +31 -0
  97. package/dist/scoring.js +472 -0
  98. package/dist/scoring.js.map +1 -0
  99. package/dist/standardize.d.ts +2 -0
  100. package/dist/standardize.js +1101 -0
  101. package/dist/standardize.js.map +1 -0
  102. package/dist/types/extractors.d.ts +41 -0
  103. package/dist/types/extractors.js +3 -0
  104. package/dist/types/extractors.js.map +1 -0
  105. package/dist/types.d.ts +135 -0
  106. package/dist/types.js +3 -0
  107. package/dist/types.js.map +1 -0
  108. package/dist/utils/comments.d.ts +44 -0
  109. package/dist/utils/comments.js +103 -0
  110. package/dist/utils/comments.js.map +1 -0
  111. package/dist/utils/dom.d.ts +42 -0
  112. package/dist/utils/dom.js +104 -0
  113. package/dist/utils/dom.js.map +1 -0
  114. package/dist/utils/linkedom-compat.d.ts +5 -0
  115. package/dist/utils/linkedom-compat.js +23 -0
  116. package/dist/utils/linkedom-compat.js.map +1 -0
  117. package/dist/utils/transcript.d.ts +37 -0
  118. package/dist/utils/transcript.js +61 -0
  119. package/dist/utils/transcript.js.map +1 -0
  120. package/dist/utils.d.ts +13 -0
  121. package/dist/utils.js +98 -0
  122. package/dist/utils.js.map +1 -0
  123. package/package.json +107 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Steph Ango (@kepano)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,371 @@
1
+ > de·​fud·dle /diˈfʌdl/ *transitive verb*
2
+ > to remove unnecessary elements from a web page, and make it easily readable.
3
+
4
+ **Beware! Defuddle is very much a work in progress!**
5
+
6
+ Defuddle extracts the main content from web pages. It cleans up web pages by removing clutter like comments, sidebars, headers, footers, and other non-essential elements, leaving only the primary content.
7
+
8
+ ## Overview
9
+
10
+ Defuddle takes a URL or HTML, finds the main content, and returns cleaned HTML or Markdown. Defuddle was created for the browser extension [Obsidian Web Clipper](https://github.com/obsidianmd/obsidian-clipper), but it is designed to run in any environment.
11
+
12
+ Defuddle can be used as a replacement for [Mozilla Readability](https://github.com/mozilla/readability) with a few differences:
13
+
14
+ - More forgiving, removes fewer uncertain elements.
15
+ - Provides a consistent output for footnotes, math, code blocks, etc.
16
+ - Uses a page's mobile styles to guess at unnecessary elements.
17
+ - Extracts more metadata from the page, including schema.org data.
18
+
19
+ ## Usage
20
+
21
+ ### Browser
22
+
23
+ ```javascript
24
+ import Defuddle from 'defuddle';
25
+
26
+ // Parse the current document
27
+ const defuddle = new Defuddle(document);
28
+ const result = defuddle.parse();
29
+
30
+ // Access the content and metadata
31
+ console.log(result.content);
32
+ console.log(result.title);
33
+ console.log(result.author);
34
+ ```
35
+
36
+ ### Node.js
37
+
38
+ `defuddle/node` accepts a DOM `Document` from any implementation (JSDOM, linkedom, happy-dom, etc.).
39
+
40
+ ```javascript
41
+ import { parseHTML } from 'linkedom';
42
+ import { Defuddle } from 'defuddle/node';
43
+
44
+ const { document } = parseHTML(html);
45
+ const result = await Defuddle(document, 'https://example.com/article', {
46
+ markdown: true
47
+ });
48
+
49
+ console.log(result.content);
50
+ console.log(result.title);
51
+ console.log(result.author);
52
+ ```
53
+
54
+ Or with JSDOM:
55
+
56
+ ```javascript
57
+ import { JSDOM } from 'jsdom';
58
+ import { Defuddle } from 'defuddle/node';
59
+
60
+ const dom = new JSDOM(html, { url: 'https://example.com/article' });
61
+ const result = await Defuddle(dom.window.document, 'https://example.com/article');
62
+ ```
63
+
64
+ _Note: for `defuddle/node` to import properly, the module format in your `package.json` has to be set to `{ "type": "module" }`_
65
+
66
+ ### CLI
67
+
68
+ Defuddle includes a command-line interface for parsing web pages directly from the terminal. You can run it with `npx` or [install it globally](#cli-installation).
69
+
70
+ ```bash
71
+ # Parse a local HTML file
72
+ npx defuddle parse page.html
73
+
74
+ # Parse a URL
75
+ npx defuddle parse https://example.com/article
76
+
77
+ # Output as markdown
78
+ npx defuddle parse page.html --markdown
79
+
80
+ # Output as JSON with metadata
81
+ npx defuddle parse page.html --json
82
+
83
+ # Extract a specific property
84
+ npx defuddle parse page.html --property title
85
+
86
+ # Save output to a file
87
+ npx defuddle parse page.html --output result.html
88
+
89
+ # Enable debug mode
90
+ npx defuddle parse page.html --debug
91
+ ```
92
+
93
+ #### CLI Options
94
+
95
+ | Option | Alias | Description |
96
+ |--------|-------|-------------|
97
+ | `--output <file>` | `-o` | Write output to a file instead of stdout |
98
+ | `--markdown` | `-m` | Convert content to markdown format |
99
+ | `--md` | | Alias for `--markdown` |
100
+ | `--json` | `-j` | Output as JSON with metadata and content |
101
+ | `--property <name>` | `-p` | Extract a specific property (e.g., title, description, domain) |
102
+ | `--debug` | | Enable debug mode |
103
+ | `--lang <code>` | `-l` | Preferred language (BCP 47, e.g. `en`, `fr`, `ja`) |
104
+
105
+ #### Proxy support
106
+
107
+ Defuddle CLI supports HTTP/HTTPS proxies via the `DEFUDDLE_PROXY` environment variable:
108
+
109
+ ```bash
110
+ # Set proxy via environment variable
111
+ export DEFUDDLE_PROXY=http://proxy.example.com:8080
112
+
113
+ # Or use with authentication
114
+ export DEFUDDLE_PROXY=http://username:password@proxy.example.com:8080
115
+
116
+ # Parse URL through proxy
117
+ npx defuddle parse https://example.com/article
118
+ ```
119
+
120
+ If the proxy fails, Defuddle falls back to a direct connection. The proxy error is silently ignored unless `--debug` is enabled, in which case the fallback is logged.
121
+
122
+ ## Installation
123
+
124
+ ```bash
125
+ npm install defuddle
126
+ ```
127
+
128
+ For Node.js usage, install a DOM implementation:
129
+
130
+ ```bash
131
+ npm install linkedom
132
+ ```
133
+
134
+ Or use JSDOM:
135
+
136
+ ```bash
137
+ npm install jsdom
138
+ ```
139
+
140
+ ### CLI installation
141
+
142
+ To use the `defuddle` command globally, install it with the `-g` flag:
143
+
144
+ ```bash
145
+ npm install -g defuddle
146
+ ```
147
+
148
+ Or use `npx` to run the CLI without installing globally:
149
+
150
+ ```bash
151
+ npx defuddle parse https://example.com/article
152
+ ```
153
+
154
+ ## Response
155
+
156
+ Defuddle returns an object with the following properties:
157
+
158
+ | Property | Type | Description |
159
+ |----------|------|-------------|
160
+ | `author` | string | Author of the article |
161
+ | `content` | string | Cleaned up string of the extracted content |
162
+ | `description` | string | Description or summary of the article |
163
+ | `domain` | string | Domain name of the website |
164
+ | `favicon` | string | URL of the website's favicon |
165
+ | `image` | string | URL of the article's main image |
166
+ | `language` | string | Language of the page in [BCP 47](https://www.rfc-editor.org/info/bcp47) format (e.g. `en`, `en-US`) |
167
+ | `metaTags` | object | Meta tags |
168
+ | `parseTime` | number | Time taken to parse the page in milliseconds |
169
+ | `published` | string | Publication date of the article |
170
+ | `site` | string | Name of the website |
171
+ | `schemaOrgData` | object | Raw schema.org data extracted from the page |
172
+ | `title` | string | Title of the article |
173
+ | `wordCount` | number | Total number of words in the extracted content |
174
+ | `debug` | object | Debug info including content selector and removals (when `debug: true`) |
175
+
176
+ ## Bundles
177
+
178
+ Defuddle is available in three different bundles:
179
+
180
+ 1. Core bundle (`defuddle`): The main bundle for browser usage. No dependencies.
181
+ 2. Full bundle (`defuddle/full`): Includes additional features for math equation parsing and Markdown conversion.
182
+ 3. Node.js bundle (`defuddle/node`): For Node.js environments. Accepts any DOM `Document` (e.g. from linkedom, JSDOM, or happy-dom). Includes full capabilities for math and Markdown conversion.
183
+
184
+ The core bundle is recommended for most use cases. It still handles math content, but doesn't include fallbacks for converting between MathML and LaTeX formats. The full bundle adds the ability to create reliable `<math>` elements using `mathml-to-latex` and `temml` libraries.
185
+
186
+ ## Options
187
+
188
+ | Option | Type | Default | Description |
189
+ | ------------------------ | ------- | ------- | ------------------------------------------------------------------------- |
190
+ | `debug` | boolean | false | Enable debug logging and return debug info in the response |
191
+ | `url` | string | | URL of the page being parsed |
192
+ | `markdown` | boolean | false | Convert `content` to Markdown |
193
+ | `separateMarkdown` | boolean | false | Keep `content` as HTML and return `contentMarkdown` as Markdown |
194
+ | `removeExactSelectors` | boolean | true | Remove elements matching exact selectors like ads, social buttons, etc. |
195
+ | `removePartialSelectors` | boolean | true | Remove elements matching partial selectors like ads, social buttons, etc. |
196
+ | `removeHiddenElements` | boolean | true | Remove elements hidden via CSS (display:none, visibility:hidden, etc.) |
197
+ | `removeLowScoring` | boolean | true | Remove non-content blocks by scoring (navigation, link lists, etc.) |
198
+ | `removeSmallImages` | boolean | true | Remove small images (icons, tracking pixels, etc.) |
199
+ | `removeImages` | boolean | false | Remove images. |
200
+ | `standardize` | boolean | true | Standardize HTML (footnotes, headings, code blocks, etc.) |
201
+ | `contentSelector` | string | | CSS selector to use as the main content element, bypassing auto-detection |
202
+ | `useAsync` | boolean | true | Allow async extractors to fetch from third-party APIs when no local content is available. |
203
+ | `language` | string | | Preferred language (BCP 47 tag, e.g. `en`, `fr`). Sets `Accept-Language` header and selects transcript language. |
204
+ | `includeReplies` | boolean \| 'extractors' | 'extractors' | Include replies: `'extractors'` for site-specific extractors only, `true` for all, `false` for none. |
205
+
206
+ ## HTML standardization
207
+
208
+ Defuddle attempts to standardize HTML elements to provide a consistent input for subsequent manipulation such as conversion to Markdown.
209
+
210
+ ### Headings
211
+
212
+ - The first H1 or H2 heading is removed if it matches the title.
213
+ - H1s are converted to H2s.
214
+ - Anchor links inside H1 to H6 elements are removed, leaving plain headings.
215
+
216
+ ### Code blocks
217
+
218
+ Code blocks are standardized. If present, line numbers and syntax highlighting are removed, but the language is retained and added as a data attribute and class.
219
+
220
+ ```html
221
+ <pre>
222
+ <code data-lang="js" class="language-js">
223
+ // code
224
+ </code>
225
+ </pre>
226
+ ```
227
+
228
+ ### Footnotes
229
+
230
+ Inline references and footnotes are converted to a standard format:
231
+
232
+ ```html
233
+ Inline reference<sup id="fnref:1"><a href="#fn:1">1</a></sup>.
234
+
235
+ <div id="footnotes">
236
+ <ol>
237
+ <li class="footnote" id="fn:1">
238
+ <p>
239
+ Footnote content.&nbsp;<a href="#fnref:1" class="footnote-backref">↩</a>
240
+ </p>
241
+ </li>
242
+ </ol>
243
+ </div>
244
+ ```
245
+
246
+ ### Math
247
+
248
+ Math elements, including MathJax and KaTeX, are converted to standard MathML:
249
+
250
+ ```html
251
+ <math xmlns="http://www.w3.org/1998/Math/MathML" display="inline" data-latex="a \neq 0">
252
+ <mi>a</mi>
253
+ <mo>≠</mo>
254
+ <mn>0</mn>
255
+ </math>
256
+ ```
257
+
258
+ ### Callouts
259
+
260
+ Callout and alert elements from various sources are standardized to blockquotes with a `data-callout` attribute. When converting to Markdown, these become [Obsidian-style callouts](https://help.obsidian.md/Editing+and+formatting/Callouts).
261
+
262
+ Supported sources:
263
+ - GitHub markdown alerts (`div.markdown-alert`)
264
+ - Obsidian Publish callouts (`div.callout[data-callout]`)
265
+ - Callout asides (`aside.callout-*`)
266
+ - Bootstrap alerts (`div.alert.alert-*`)
267
+
268
+ The standardized HTML follows the [Obsidian Publish](https://help.obsidian.md/Editing+and+formatting/Callouts) format:
269
+
270
+ ```html
271
+ <div data-callout="info" class="callout">
272
+ <div class="callout-title">
273
+ <div class="callout-title-inner">Info</div>
274
+ </div>
275
+ <div class="callout-content">
276
+ <p>This is an informational callout.</p>
277
+ </div>
278
+ </div>
279
+ ```
280
+
281
+ In Markdown:
282
+
283
+ ```markdown
284
+ > [!info] Info
285
+ > This is an informational callout.
286
+ ```
287
+
288
+ ## Development
289
+
290
+ ### Build
291
+
292
+ To build the package, you'll need Node.js and npm installed. Then run:
293
+
294
+ ```bash
295
+ # Install dependencies
296
+ npm install
297
+
298
+ # Clean and build
299
+ npm run build
300
+ ```
301
+
302
+ ## Third-party services
303
+
304
+ When using `parseAsync()`, if no content can be extracted from the local HTML, Defuddle may fetch content from third-party APIs as a fallback. This only happens when the page HTML contains no usable content (e.g. client-side rendered SPAs). You can disable this by setting `useAsync: false` in options.
305
+
306
+ - [FxTwitter API](https://github.com/FixTweet/FxTwitter) — Used to extract X (Twitter) article content, which is not available in server-rendered HTML.
307
+
308
+ ## Debugging
309
+
310
+ ### Debug mode
311
+
312
+ You can enable debug mode by passing an options object when creating a new Defuddle instance:
313
+
314
+ ```typescript
315
+ const result = new Defuddle(document, { debug: true }).parse();
316
+
317
+ // Access debug info
318
+ console.log(result.debug.contentSelector); // CSS selector path of chosen main content element
319
+ console.log(result.debug.removals); // Array of removed elements with reasons
320
+ ```
321
+
322
+ When debug mode is enabled:
323
+
324
+ - Returns a `debug` field in the response with detailed information about content extraction
325
+ - More verbose console logging about the parsing process
326
+ - Preserves HTML class and id attributes that are normally stripped
327
+ - Retains all data-* attributes
328
+ - Skips div flattening to preserve document structure
329
+
330
+ The `debug` field contains:
331
+
332
+ | Property | Type | Description |
333
+ |----------|------|-------------|
334
+ | `contentSelector` | string | CSS selector path of the chosen main content element |
335
+ | `removals` | array | List of elements removed during processing |
336
+
337
+ Each removal entry contains:
338
+
339
+ | Property | Type | Description |
340
+ |----------|------|-------------|
341
+ | `step` | string | Pipeline step that removed the element (e.g. `removeLowScoring`, `removeBySelector`, `removeHiddenElements`) |
342
+ | `selector` | string | CSS selector or pattern that matched (for selector-based removal) |
343
+ | `reason` | string | Why the element was removed (e.g. `score: -20`, `display:none`) |
344
+ | `text` | string | First 200 characters of the removed element's text content |
345
+
346
+ ### Pipeline toggles
347
+
348
+ You can disable individual pipeline steps to diagnose content extraction issues:
349
+
350
+ ```typescript
351
+ // Skip content scoring to see if it's removing content incorrectly
352
+ const result = new Defuddle(document, { removeLowScoring: false }).parse();
353
+
354
+ // Skip hidden element removal (useful for CSS sidenote layouts)
355
+ const result = new Defuddle(document, { removeHiddenElements: false }).parse();
356
+
357
+ // Skip small image removal
358
+ const result = new Defuddle(document, { removeSmallImages: false }).parse();
359
+ ```
360
+
361
+ ### Content selector
362
+
363
+ Use `contentSelector` to bypass Defuddle's auto-detection and specify the main content element directly:
364
+
365
+ ```typescript
366
+ const result = new Defuddle(document, {
367
+ contentSelector: 'article.post-content'
368
+ }).parse();
369
+ ```
370
+
371
+ If the selector doesn't match any element, Defuddle falls back to auto-detection.
package/dist/cli.d.ts ADDED
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env node
2
+ export {};
package/dist/cli.js ADDED
@@ -0,0 +1,145 @@
1
+ #!/usr/bin/env node
2
+ "use strict";
3
+ Object.defineProperty(exports, "__esModule", { value: true });
4
+ const commander_1 = require("commander");
5
+ const node_1 = require("./node");
6
+ const promises_1 = require("fs/promises");
7
+ const path_1 = require("path");
8
+ const linkedom_compat_1 = require("./utils/linkedom-compat");
9
+ const utils_1 = require("./utils");
10
+ const fetch_1 = require("./fetch");
11
+ // ANSI color helpers (avoids chalk dependency which is ESM-only)
12
+ const useColor = process.stdout.isTTY ?? false;
13
+ const ansi = {
14
+ red: (s) => useColor ? `\x1b[31m${s}\x1b[39m` : s,
15
+ green: (s) => useColor ? `\x1b[32m${s}\x1b[39m` : s,
16
+ };
17
+ // Read version from package.json
18
+ const version = require('../package.json').version;
19
+ const program = new commander_1.Command();
20
+ program
21
+ .name('defuddle')
22
+ .description('Extract article content from web pages')
23
+ .version(version);
24
+ program
25
+ .command('parse')
26
+ .description('Parse HTML content from a file or URL')
27
+ .argument('<source>', 'HTML file path or URL to parse')
28
+ .option('-o, --output <file>', 'Output file path (default: stdout)')
29
+ .option('-m, --markdown', 'Convert content to markdown format')
30
+ .option('--md', 'Alias for --markdown')
31
+ .option('-j, --json', 'Output as JSON with metadata and content')
32
+ .option('-p, --property <name>', 'Extract a specific property (e.g., title, description, domain)')
33
+ .option('--debug', 'Enable debug mode')
34
+ .option('-l, --lang <code>', 'Preferred language (BCP 47, e.g. en, fr, ja)')
35
+ .action(async (source, options) => {
36
+ try {
37
+ // Handle --md alias
38
+ if (options.md) {
39
+ options.markdown = true;
40
+ }
41
+ const defuddleOpts = {
42
+ debug: options.debug,
43
+ markdown: options.markdown,
44
+ separateMarkdown: options.markdown || options.json,
45
+ language: options.lang,
46
+ };
47
+ let html;
48
+ let url;
49
+ // Determine if source is a URL or file path
50
+ const isUrl = source.startsWith('http://') || source.startsWith('https://');
51
+ if (isUrl) {
52
+ url = source;
53
+ const initialUA = (0, fetch_1.getInitialUA)(source);
54
+ html = await (0, fetch_1.fetchPage)(source, initialUA, { language: options.lang, debug: options.debug, useProxy: true });
55
+ }
56
+ else {
57
+ const filePath = (0, path_1.resolve)(process.cwd(), source);
58
+ html = await (0, promises_1.readFile)(filePath, 'utf-8');
59
+ }
60
+ const doc = (0, linkedom_compat_1.parseLinkedomHTML)(html);
61
+ let result = await (0, node_1.Defuddle)(doc, url, defuddleOpts);
62
+ // If no content was extracted from a URL, retry with bot UA.
63
+ // Some sites (e.g. Obsidian Publish) serve pre-rendered content to bots.
64
+ if (isUrl && result.wordCount === 0) {
65
+ try {
66
+ const botHtml = await (0, fetch_1.fetchPage)(source, fetch_1.BOT_UA, { language: options.lang, debug: options.debug, useProxy: true });
67
+ // Check for raw markdown before DOM parsing destroys whitespace
68
+ const rawMarkdown = (0, fetch_1.extractRawMarkdown)(botHtml);
69
+ if (rawMarkdown) {
70
+ const botDoc = (0, linkedom_compat_1.parseLinkedomHTML)(botHtml);
71
+ const botResult = await (0, node_1.Defuddle)(botDoc, url, defuddleOpts);
72
+ botResult.content = (0, fetch_1.cleanMarkdownContent)(rawMarkdown);
73
+ botResult.wordCount = (0, utils_1.countWords)(botResult.content);
74
+ result = botResult;
75
+ }
76
+ else {
77
+ const botDoc = (0, linkedom_compat_1.parseLinkedomHTML)(botHtml);
78
+ const botResult = await (0, node_1.Defuddle)(botDoc, url, defuddleOpts);
79
+ if (botResult.wordCount > 0) {
80
+ result = botResult;
81
+ }
82
+ }
83
+ }
84
+ catch {
85
+ // Bot UA may be blocked — use original result
86
+ }
87
+ }
88
+ // Check if parsing produced meaningful content
89
+ const textContent = result.content.replace(/<[^>]*>/g, '').trim();
90
+ if (!textContent) {
91
+ console.error(ansi.red(`Error: No content could be extracted from ${source}`));
92
+ process.exit(1);
93
+ }
94
+ // Format output
95
+ let output;
96
+ if (options.property) {
97
+ const property = options.property;
98
+ if (property in result) {
99
+ output = result[property]?.toString() || '';
100
+ }
101
+ else {
102
+ console.error(ansi.red(`Error: Property "${property}" not found in response`));
103
+ process.exit(1);
104
+ }
105
+ }
106
+ else if (options.json) {
107
+ output = JSON.stringify({
108
+ content: result.content,
109
+ title: result.title,
110
+ description: result.description,
111
+ domain: result.domain,
112
+ favicon: result.favicon,
113
+ image: result.image,
114
+ language: result.language,
115
+ metaTags: result.metaTags,
116
+ parseTime: result.parseTime,
117
+ published: result.published,
118
+ author: result.author,
119
+ site: result.site,
120
+ schemaOrgData: result.schemaOrgData,
121
+ wordCount: result.wordCount,
122
+ ...(result.contentMarkdown ? { contentMarkdown: result.contentMarkdown } : {}),
123
+ ...(result.variables ? { variables: result.variables } : {}),
124
+ }, null, 2);
125
+ }
126
+ else {
127
+ output = result.content;
128
+ }
129
+ // Handle output
130
+ if (options.output) {
131
+ const outputPath = (0, path_1.resolve)(process.cwd(), options.output);
132
+ await (0, promises_1.writeFile)(outputPath, output, 'utf-8');
133
+ console.log(ansi.green(`Output written to ${options.output}`));
134
+ }
135
+ else {
136
+ console.log(output);
137
+ }
138
+ }
139
+ catch (error) {
140
+ console.error(ansi.red('Error:'), error instanceof Error ? error.message : 'Unknown error occurred');
141
+ process.exit(1);
142
+ }
143
+ });
144
+ program.parse();
145
+ //# sourceMappingURL=cli.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";;;AAEA,yCAAoC;AACpC,iCAAkC;AAClC,0CAAkD;AAClD,+BAA+B;AAC/B,6DAA4D;AAC5D,mCAAqC;AACrC,mCAAoG;AAYpG,iEAAiE;AACjE,MAAM,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,IAAI,KAAK,CAAC;AAC/C,MAAM,IAAI,GAAG;IACZ,GAAG,EAAE,CAAC,CAAS,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;IACzD,KAAK,EAAE,CAAC,CAAS,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;CAC3D,CAAC;AAEF,iCAAiC;AACjC,MAAM,OAAO,GAAG,OAAO,CAAC,iBAAiB,CAAC,CAAC,OAAO,CAAC;AAEnD,MAAM,OAAO,GAAG,IAAI,mBAAO,EAAE,CAAC;AAE9B,OAAO;KACL,IAAI,CAAC,UAAU,CAAC;KAChB,WAAW,CAAC,wCAAwC,CAAC;KACrD,OAAO,CAAC,OAAO,CAAC,CAAC;AAEnB,OAAO;KACL,OAAO,CAAC,OAAO,CAAC;KAChB,WAAW,CAAC,uCAAuC,CAAC;KACpD,QAAQ,CAAC,UAAU,EAAE,gCAAgC,CAAC;KACtD,MAAM,CAAC,qBAAqB,EAAE,oCAAoC,CAAC;KACnE,MAAM,CAAC,gBAAgB,EAAE,oCAAoC,CAAC;KAC9D,MAAM,CAAC,MAAM,EAAE,sBAAsB,CAAC;KACtC,MAAM,CAAC,YAAY,EAAE,0CAA0C,CAAC;KAChE,MAAM,CAAC,uBAAuB,EAAE,gEAAgE,CAAC;KACjG,MAAM,CAAC,SAAS,EAAE,mBAAmB,CAAC;KACtC,MAAM,CAAC,mBAAmB,EAAE,8CAA8C,CAAC;KAC3E,MAAM,CAAC,KAAK,EAAE,MAAc,EAAE,OAAqB,EAAE,EAAE;IACvD,IAAI,CAAC;QACJ,oBAAoB;QACpB,IAAI,OAAO,CAAC,EAAE,EAAE,CAAC;YAChB,OAAO,CAAC,QAAQ,GAAG,IAAI,CAAC;QACzB,CAAC;QAED,MAAM,YAAY,GAAG;YACpB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,QAAQ,EAAE,OAAO,CAAC,QAAQ;YAC1B,gBAAgB,EAAE,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI;YAClD,QAAQ,EAAE,OAAO,CAAC,IAAI;SACtB,CAAC;QAEF,IAAI,IAAY,CAAC;QACjB,IAAI,GAAuB,CAAC;QAE5B,4CAA4C;QAC5C,MAAM,KAAK,GAAG,MAAM,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,MAAM,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC;QAC5E,IAAI,KAAK,EAAE,CAAC;YACX,GAAG,GAAG,MAAM,CAAC;YACb,MAAM,SAAS,GAAG,IAAA,oBAAY,EAAC,MAAM,CAAC,CAAC;YACvC,IAAI,GAAG,MAAM,IAAA,iBAAS,EAAC,MAAM,EAAE,SAAS,EAAE,EAAE,QAAQ,EAAE,OAAO,CAAC,IAAI,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC;QAC7G,CAAC;aAAM,CAAC;YACP,MAAM,QAAQ,GAAG,IAAA,cAAO,EAAC,OAAO,CAAC,GAAG,EAAE,EAAE,MAAM,CAAC,CAAC;YAChD,IAAI,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAC1C,CAAC;QAED,MAAM,GAAG,GAAG,
IAAA,mCAAiB,EAAC,IAAI,CAAC,CAAC;QACpC,IAAI,MAAM,GAAG,MAAM,IAAA,eAAQ,EAAC,GAAG,EAAE,GAAG,EAAE,YAAY,CAAC,CAAC;QAEpD,6DAA6D;QAC7D,yEAAyE;QACzE,IAAI,KAAK,IAAI,MAAM,CAAC,SAAS,KAAK,CAAC,EAAE,CAAC;YACrC,IAAI,CAAC;gBACJ,MAAM,OAAO,GAAG,MAAM,IAAA,iBAAS,EAAC,MAAM,EAAE,cAAM,EAAE,EAAE,QAAQ,EAAE,OAAO,CAAC,IAAI,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC;gBAElH,gEAAgE;gBAChE,MAAM,WAAW,GAAG,IAAA,0BAAkB,EAAC,OAAO,CAAC,CAAC;gBAChD,IAAI,WAAW,EAAE,CAAC;oBACjB,MAAM,MAAM,GAAG,IAAA,mCAAiB,EAAC,OAAO,CAAC,CAAC;oBAC1C,MAAM,SAAS,GAAG,MAAM,IAAA,eAAQ,EAAC,MAAM,EAAE,GAAG,EAAE,YAAY,CAAC,CAAC;oBAC5D,SAAS,CAAC,OAAO,GAAG,IAAA,4BAAoB,EAAC,WAAW,CAAC,CAAC;oBACtD,SAAS,CAAC,SAAS,GAAG,IAAA,kBAAU,EAAC,SAAS,CAAC,OAAO,CAAC,CAAC;oBACpD,MAAM,GAAG,SAAS,CAAC;gBACpB,CAAC;qBAAM,CAAC;oBACP,MAAM,MAAM,GAAG,IAAA,mCAAiB,EAAC,OAAO,CAAC,CAAC;oBAC1C,MAAM,SAAS,GAAG,MAAM,IAAA,eAAQ,EAAC,MAAM,EAAE,GAAG,EAAE,YAAY,CAAC,CAAC;oBAC5D,IAAI,SAAS,CAAC,SAAS,GAAG,CAAC,EAAE,CAAC;wBAC7B,MAAM,GAAG,SAAS,CAAC;oBACpB,CAAC;gBACF,CAAC;YACF,CAAC;YAAC,MAAM,CAAC;gBACR,8CAA8C;YAC/C,CAAC;QACF,CAAC;QAED,+CAA+C;QAC/C,MAAM,WAAW,GAAG,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QAClE,IAAI,CAAC,WAAW,EAAE,CAAC;YAClB,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,6CAA6C,MAAM,EAAE,CAAC,CAAC,CAAC;YAC/E,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACjB,CAAC;QAED,gBAAgB;QAChB,IAAI,MAAc,CAAC;QAEnB,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;YACtB,MAAM,QAAQ,GAAG,OAAO,CAAC,QAAQ,CAAC;YAClC,IAAI,QAAQ,IAAI,MAAM,EAAE,CAAC;gBACxB,MAAM,GAAG,MAAM,CAAC,QAA+B,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC;YACpE,CAAC;iBAAM,CAAC;gBACP,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,oBAAoB,QAAQ,yBAAyB,CAAC,CAAC,CAAC;gBAC/E,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACjB,CAAC;QACF,CAAC;aAAM,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;YACzB,MAAM,GAAG,IAAI,CAAC,SAAS,CAAC;gBACvB,OAAO,EAAE,MAAM,CAAC,OAAO;gBACvB,KAAK,EAAE,MAAM,CAAC,KAAK;gBACnB,WAAW,EAAE,MAAM,CAAC,WAAW;gBAC/B,MAAM,EAAE,MAAM,CAAC,MAAM;gBACrB,OAAO,EAAE,MAAM,CAAC,OAAO;gBACvB,KAAK,EAAE,MAAM,CAAC,KAAK;gBACnB,QAAQ,EAAE,MAAM,CAAC,QAAQ;gBACzB,QAA
Q,EAAE,MAAM,CAAC,QAAQ;gBACzB,SAAS,EAAE,MAAM,CAAC,SAAS;gBAC3B,SAAS,EAAE,MAAM,CAAC,SAAS;gBAC3B,MAAM,EAAE,MAAM,CAAC,MAAM;gBACrB,IAAI,EAAE,MAAM,CAAC,IAAI;gBACjB,aAAa,EAAE,MAAM,CAAC,aAAa;gBACnC,SAAS,EAAE,MAAM,CAAC,SAAS;gBAC3B,GAAG,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,CAAC,EAAE,eAAe,EAAE,MAAM,CAAC,eAAe,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;gBAC9E,GAAG,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;aAC5D,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;QACb,CAAC;aAAM,CAAC;YACP,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC;QACzB,CAAC;QAED,gBAAgB;QAChB,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;YACpB,MAAM,UAAU,GAAG,IAAA,cAAO,EAAC,OAAO,CAAC,GAAG,EAAE,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;YAC1D,MAAM,IAAA,oBAAS,EAAC,UAAU,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC;YAC7C,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,qBAAqB,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;QAChE,CAAC;aAAM,CAAC;YACP,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QACrB,CAAC;IACF,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAChB,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,wBAAwB,CAAC,CAAC;QACrG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACjB,CAAC;AACF,CAAC,CAAC,CAAC;AAEJ,OAAO,CAAC,KAAK,EAAE,CAAC"}
@@ -0,0 +1,24 @@
1
+ export declare const ENTRY_POINT_ELEMENTS: string[];
2
+ export declare const MOBILE_WIDTH = 600;
3
+ export declare const BLOCK_ELEMENTS: string[];
4
+ export declare const BLOCK_ELEMENTS_SELECTOR: string;
5
+ export declare const BLOCK_ELEMENTS_SET: Set<string>;
6
+ export declare const BLOCK_LEVEL_ELEMENTS: Set<string>;
7
+ export declare const PRESERVE_ELEMENTS: Set<string>;
8
+ export declare const INLINE_ELEMENTS: Set<string>;
9
+ export declare const CONTENT_ELEMENT_SELECTOR: string;
10
+ export declare const HIDDEN_EXACT_SKIP_SELECTORS: string[];
11
+ export declare const HIDDEN_EXACT_SELECTORS: string[];
12
+ export declare const HIDDEN_EXACT_SELECTOR: string;
13
+ export declare const HIDDEN_EXACT_SKIP_SELECTOR: string;
14
+ export declare const EXACT_SELECTORS: string[];
15
+ export declare const EXACT_SELECTORS_JOINED: string;
16
+ export declare const TEST_ATTRIBUTES: string[];
17
+ export declare const PARTIAL_SELECTORS: string[];
18
+ export declare const PARTIAL_SELECTORS_REGEX: RegExp;
19
+ export declare const TEST_ATTRIBUTES_SELECTOR: string;
20
+ export declare const FOOTNOTE_INLINE_REFERENCES: string;
21
+ export declare const FOOTNOTE_LIST_SELECTORS: string;
22
+ export declare const ALLOWED_EMPTY_ELEMENTS: Set<string>;
23
+ export declare const ALLOWED_ATTRIBUTES: Set<string>;
24
+ export declare const ALLOWED_ATTRIBUTES_DEBUG: Set<string>;