defuddle 0.4.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +61 -48
- package/dist/constants.js +744 -0
- package/dist/constants.js.map +1 -0
- package/dist/defuddle.d.ts +3 -2
- package/dist/defuddle.js +1676 -0
- package/dist/defuddle.js.map +1 -0
- package/dist/elements/code.js +287 -0
- package/dist/elements/code.js.map +1 -0
- package/dist/elements/headings.js +95 -0
- package/dist/elements/headings.js.map +1 -0
- package/dist/elements/math.base.js +192 -0
- package/dist/elements/math.base.js.map +1 -0
- package/dist/elements/math.full.d.ts +1 -1
- package/dist/elements/math.full.js +121 -0
- package/dist/elements/math.full.js.map +1 -0
- package/dist/extractor-registry.js +101 -0
- package/dist/extractor-registry.js.map +1 -0
- package/dist/extractors/_base.js +12 -0
- package/dist/extractors/_base.js.map +1 -0
- package/dist/extractors/_conversation.js +77 -0
- package/dist/extractors/_conversation.js.map +1 -0
- package/dist/extractors/chatgpt.js +142 -0
- package/dist/extractors/chatgpt.js.map +1 -0
- package/dist/extractors/claude.js +87 -0
- package/dist/extractors/claude.js.map +1 -0
- package/dist/extractors/hackernews.js +206 -0
- package/dist/extractors/hackernews.js.map +1 -0
- package/dist/extractors/reddit.js +143 -0
- package/dist/extractors/reddit.js.map +1 -0
- package/dist/extractors/twitter.js +199 -0
- package/dist/extractors/twitter.js.map +1 -0
- package/dist/extractors/youtube.js +53 -0
- package/dist/extractors/youtube.js.map +1 -0
- package/dist/index.full.js +1 -1
- package/dist/index.js +1 -1
- package/dist/index.js.map +1 -0
- package/dist/markdown.d.ts +1 -0
- package/dist/markdown.js +545 -0
- package/dist/markdown.js.map +1 -0
- package/dist/metadata.js +268 -0
- package/dist/metadata.js.map +1 -0
- package/dist/node.d.ts +12 -0
- package/dist/node.js +50 -0
- package/dist/node.js.map +1 -0
- package/dist/scoring.d.ts +8 -0
- package/dist/scoring.js +95 -0
- package/dist/scoring.js.map +1 -0
- package/dist/types/extractors.js +3 -0
- package/dist/types/extractors.js.map +1 -0
- package/dist/types.d.ts +1 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/package.json +19 -5
package/README.md
CHANGED
|
@@ -24,72 +24,63 @@ Defuddle can be used as a replacement for [Mozilla Readability](https://github.c
|
|
|
24
24
|
npm install defuddle
|
|
25
25
|
```
|
|
26
26
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
```typescript
|
|
30
|
-
import Defuddle from 'defuddle';
|
|
27
|
+
For Node.js usage, you'll also need to install JSDOM:
|
|
31
28
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
// Use the extracted content and metadata
|
|
35
|
-
console.log(article.content); // HTML string of the main content
|
|
36
|
-
console.log(article.title); // Title of the article
|
|
29
|
+
```bash
|
|
30
|
+
npm install jsdom
|
|
37
31
|
```
|
|
38
32
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
Defuddle comes in two bundles:
|
|
42
|
-
|
|
43
|
-
**Core bundle** (~50kB), no dependencies
|
|
44
|
-
```js
|
|
45
|
-
import Defuddle from 'defuddle';
|
|
46
|
-
```
|
|
47
|
-
**Full bundle** (~432kB), includes advanced math conversion capabilities
|
|
48
|
-
```js
|
|
49
|
-
import Defuddle from 'defuddle/full';
|
|
50
|
-
```
|
|
33
|
+
## Usage
|
|
51
34
|
|
|
52
|
-
|
|
35
|
+
### Browser
|
|
53
36
|
|
|
54
|
-
|
|
37
|
+
```javascript
|
|
38
|
+
import { Defuddle } from 'defuddle';
|
|
55
39
|
|
|
56
|
-
|
|
40
|
+
// Parse the current document
|
|
41
|
+
const defuddle = new Defuddle(document);
|
|
42
|
+
const result = defuddle.parse();
|
|
57
43
|
|
|
58
|
-
|
|
59
|
-
|
|
44
|
+
// Access the content and metadata
|
|
45
|
+
console.log(result.content);
|
|
46
|
+
console.log(result.title);
|
|
47
|
+
console.log(result.author);
|
|
60
48
|
```
|
|
61
49
|
|
|
62
|
-
|
|
63
|
-
- Preserves HTML class and id attributes that are normally stripped
|
|
64
|
-
- Retains all data-* attributes
|
|
65
|
-
- Skips div flattening to preserve document structure
|
|
50
|
+
### Node.js
|
|
66
51
|
|
|
67
|
-
|
|
52
|
+
```javascript
|
|
53
|
+
import { JSDOM } from 'jsdom';
|
|
54
|
+
import { Defuddle } from 'defuddle/node';
|
|
68
55
|
|
|
69
|
-
|
|
56
|
+
// Parse HTML from a string
|
|
57
|
+
const html = '<html><body><article>...</article></body></html>';
|
|
58
|
+
const result = await Defuddle(html);
|
|
70
59
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
60
|
+
// Parse HTML from a URL
|
|
61
|
+
const dom = await JSDOM.fromURL('https://example.com/article');
|
|
62
|
+
const result = await Defuddle(dom);
|
|
74
63
|
|
|
75
|
-
|
|
76
|
-
const
|
|
77
|
-
|
|
64
|
+
// With options
|
|
65
|
+
const result = await Defuddle(dom, {
|
|
66
|
+
debug: true, // Enable debug mode for verbose logging
|
|
67
|
+
markdown: true, // Convert content to markdown
|
|
68
|
+
url: 'https://example.com/article' // Original URL of the page
|
|
78
69
|
});
|
|
79
70
|
|
|
80
|
-
|
|
81
|
-
console.log(
|
|
71
|
+
// Access the content and metadata
|
|
72
|
+
console.log(result.content);
|
|
73
|
+
console.log(result.title);
|
|
74
|
+
console.log(result.author);
|
|
82
75
|
```
|
|
83
76
|
|
|
84
|
-
Providing `url` in the JSDOM constructor helps convert relative URLs (images, links, etc.) to absolute URLs.
|
|
85
|
-
|
|
86
77
|
## Response
|
|
87
78
|
|
|
88
|
-
|
|
79
|
+
Defuddle returns an object with the following properties:
|
|
89
80
|
|
|
90
81
|
| Property | Type | Description |
|
|
91
82
|
|----------|------|-------------|
|
|
92
|
-
| `content` | string |
|
|
83
|
+
| `content` | string | Cleaned-up string of the extracted content |
|
|
93
84
|
| `title` | string | Title of the article |
|
|
94
85
|
| `description` | string | Description or summary of the article |
|
|
95
86
|
| `domain` | string | Domain name of the website |
|
|
@@ -102,6 +93,32 @@ The `parse()` method returns an object with the following properties:
|
|
|
102
93
|
| `schemaOrgData` | object | Raw schema.org data extracted from the page |
|
|
103
94
|
| `wordCount` | number | Total number of words in the extracted content |
|
|
104
95
|
|
|
96
|
+
## Bundles
|
|
97
|
+
|
|
98
|
+
Defuddle is available in three different bundles:
|
|
99
|
+
|
|
100
|
+
1. Core bundle (`defuddle`): The main bundle for browser usage. No dependencies.
|
|
101
|
+
2. Full bundle (`defuddle/full`): Includes additional features for math equation parsing.
|
|
102
|
+
3. Node.js bundle (`defuddle/node`): Optimized for Node.js environments using JSDOM. Includes full capabilities for math and Markdown conversion.
|
|
103
|
+
|
|
104
|
+
The core bundle is recommended for most use cases. It still handles math content, but doesn't include fallbacks for converting between MathML and LaTeX formats. The full bundle adds the ability to create reliable `<math>` elements using the `mathml-to-latex` and `temml` libraries.
|
|
105
|
+
|
|
106
|
+
## Options
|
|
107
|
+
|
|
108
|
+
### Debug mode
|
|
109
|
+
|
|
110
|
+
You can enable debug mode by passing an options object when creating a new Defuddle instance:
|
|
111
|
+
|
|
112
|
+
```typescript
|
|
113
|
+
const article = new Defuddle(document, { debug: true }).parse();
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
- More verbose console logging about the parsing process
|
|
117
|
+
- Preserves HTML class and id attributes that are normally stripped
|
|
118
|
+
- Retains all data-* attributes
|
|
119
|
+
- Skips div flattening to preserve document structure
|
|
120
|
+
|
|
121
|
+
|
|
105
122
|
## HTML standardization
|
|
106
123
|
|
|
107
124
|
Defuddle attempts to standardize HTML elements to provide a consistent input for subsequent manipulation such as conversion to Markdown.
|
|
@@ -167,7 +184,3 @@ npm install
|
|
|
167
184
|
# Clean and build
|
|
168
185
|
npm run build
|
|
169
186
|
```
|
|
170
|
-
|
|
171
|
-
This will generate:
|
|
172
|
-
- `dist/index.js` - UMD build for both Node.js and browsers
|
|
173
|
-
- `dist/index.d.ts` - TypeScript declaration file
|