defuddle 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -0
- package/dist/constants.d.ts +14 -0
- package/dist/defuddle.d.ts +3 -0
- package/dist/index.js +1 -1
- package/dist/math.d.ts +14 -0
- package/dist/metadata.d.ts +1 -0
- package/dist/types.d.ts +2 -0
- package/package.json +6 -2
package/README.md
CHANGED
|
@@ -34,6 +34,19 @@ console.log(article.content); // HTML string of the main content
|
|
|
34
34
|
console.log(article.title); // Title of the article
|
|
35
35
|
```
|
|
36
36
|
|
|
37
|
+
### Debug mode
|
|
38
|
+
|
|
39
|
+
You can enable debug mode by passing an options object when creating a new Defuddle instance:
|
|
40
|
+
|
|
41
|
+
```typescript
|
|
42
|
+
const article = new Defuddle(document, { debug: true }).parse();
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
- More verbose console logging about the parsing process
|
|
46
|
+
- Preserves HTML class and id attributes that are normally stripped
|
|
47
|
+
- Retains all data-* attributes
|
|
48
|
+
- Skips div flattening to preserve document structure
|
|
49
|
+
|
|
37
50
|
### Server-side usage
|
|
38
51
|
|
|
39
52
|
When using Defuddle in a Node.js environment, you can use JSDOM to create a DOM document:
|
|
@@ -65,10 +78,12 @@ The `parse()` method returns an object with the following properties:
|
|
|
65
78
|
| `domain` | string | Domain name of the website |
|
|
66
79
|
| `favicon` | string | URL of the website's favicon |
|
|
67
80
|
| `image` | string | URL of the article's main image |
|
|
81
|
+
| `parseTime` | number | Time taken to parse the page in milliseconds |
|
|
68
82
|
| `published` | string | Publication date of the article |
|
|
69
83
|
| `author` | string | Author of the article |
|
|
70
84
|
| `site` | string | Name of the website |
|
|
71
85
|
| `schemaOrgData` | object | Raw schema.org data extracted from the page |
|
|
86
|
+
| `wordCount` | number | Total number of words in the extracted content |
|
|
72
87
|
|
|
73
88
|
## HTML standardization
|
|
74
89
|
|
package/dist/constants.d.ts
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
export declare const ENTRY_POINT_ELEMENTS: string[];
|
|
2
|
+
export declare const MOBILE_WIDTH = 600;
|
|
3
|
+
export declare const BLOCK_ELEMENTS: string[];
|
|
4
|
+
export declare const PRESERVE_ELEMENTS: Set<string>;
|
|
5
|
+
export declare const INLINE_ELEMENTS: Set<string>;
|
|
6
|
+
export declare const HIDDEN_ELEMENT_SELECTORS: string;
|
|
7
|
+
export declare const EXACT_SELECTORS: string[];
|
|
8
|
+
export declare const PARTIAL_SELECTORS: string[];
|
|
9
|
+
export declare const FOOTNOTE_INLINE_REFERENCES: string;
|
|
10
|
+
export declare const FOOTNOTE_LIST_SELECTORS: string;
|
|
11
|
+
export declare const ALLOWED_EMPTY_ELEMENTS: Set<string>;
|
|
12
|
+
export declare const ALLOWED_ATTRIBUTES: Set<string>;
|
|
13
|
+
export declare const ALLOWED_ATTRIBUTES_DEBUG: Set<string>;
|
|
14
|
+
export declare const SUPPORTED_LANGUAGES: Set<string>;
|
package/dist/defuddle.d.ts
CHANGED
|
@@ -13,17 +13,20 @@ export declare class Defuddle {
|
|
|
13
13
|
* Parse the document and extract its main content
|
|
14
14
|
*/
|
|
15
15
|
parse(): DefuddleResponse;
|
|
16
|
+
private countWords;
|
|
16
17
|
private _log;
|
|
17
18
|
private _evaluateMediaQueries;
|
|
18
19
|
private applyMobileStyles;
|
|
19
20
|
private removeHiddenElements;
|
|
20
21
|
private removeClutter;
|
|
22
|
+
private flattenDivs;
|
|
21
23
|
private cleanContent;
|
|
22
24
|
private removeTrailingHeadings;
|
|
23
25
|
private handleHeadings;
|
|
24
26
|
private removeHtmlComments;
|
|
25
27
|
private stripUnwantedAttributes;
|
|
26
28
|
private removeEmptyElements;
|
|
29
|
+
private removeEmptyLines;
|
|
27
30
|
private createFootnoteItem;
|
|
28
31
|
private collectFootnotes;
|
|
29
32
|
private findOuterFootnoteContainer;
|