defuddle 0.3.7 → 0.4.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -5,6 +5,8 @@
5
5
 
6
6
  Defuddle extracts the main content from web pages. It cleans up web pages by removing clutter like comments, sidebars, headers, footers, and other non-essential elements, leaving only the primary content.
7
7
 
8
+ [Try the Defuddle Playground →](https://kepano.github.io/defuddle/)
9
+
8
10
  ## Features
9
11
 
10
12
  Defuddle aims to output clean and consistent HTML documents. It was written for [Obsidian Web Clipper](https://github.com/obsidianmd/obsidian-clipper) with the goal of creating a more useful input for HTML-to-Markdown converters like [Turndown](https://github.com/mixmark-io/turndown).
@@ -25,7 +27,7 @@ npm install defuddle
25
27
  ## Usage
26
28
 
27
29
  ```typescript
28
- import { Defuddle } from 'defuddle';
30
+ import Defuddle from 'defuddle';
29
31
 
30
32
  const article = new Defuddle(document).parse();
31
33
 
@@ -40,11 +42,11 @@ Defuddle comes in two bundles:
40
42
 
41
43
  **Core bundle** (~50kB), no dependencies
42
44
  ```js
43
- import { Defuddle } from 'defuddle';
45
+ import Defuddle from 'defuddle';
44
46
  ```
45
47
  **Full bundle** (~432kB), includes advanced math conversion capabilities
46
48
  ```js
47
- import { Defuddle } from 'defuddle/full';
49
+ import Defuddle from 'defuddle/full';
48
50
  ```
49
51
 
50
52
  The core bundle is recommended for most use cases. It still handles math content, but doesn't include fallbacks for converting between MathML and LaTeX formats. The full bundle adds the ability to create reliable `<math>` elements using the `mathml-to-latex` and `temml` libraries.
@@ -67,7 +69,7 @@ const article = new Defuddle(document, { debug: true }).parse();
67
69
  When using Defuddle in a Node.js environment, you can use JSDOM to create a DOM document:
68
70
 
69
71
  ```typescript
70
- import { Defuddle } from 'defuddle';
72
+ import Defuddle from 'defuddle';
71
73
  import { JSDOM } from 'jsdom';
72
74
 
73
75
  const html = '...'; // Your HTML string
@@ -0,0 +1,15 @@
1
+ import { BaseExtractor } from './extractors/_base';
2
+ type ExtractorConstructor = new (document: Document, url: string, schemaOrgData?: any) => BaseExtractor;
3
+ interface ExtractorMapping {
4
+ patterns: (string | RegExp)[];
5
+ extractor: ExtractorConstructor;
6
+ }
7
+ export declare class ExtractorRegistry {
8
+ private static mappings;
9
+ private static domainCache;
10
+ static initialize(): void;
11
+ static register(mapping: ExtractorMapping): void;
12
+ static findExtractor(document: Document, url: string, schemaOrgData?: any): BaseExtractor | null;
13
+ static clearCache(): void;
14
+ }
15
+ export {};
@@ -0,0 +1,9 @@
1
+ import { ExtractorResult } from '../types/extractors';
2
+ export declare abstract class BaseExtractor {
3
+ protected document: Document;
4
+ protected url: string;
5
+ protected schemaOrgData?: any;
6
+ constructor(document: Document, url: string, schemaOrgData?: any);
7
+ abstract canExtract(): boolean;
8
+ abstract extract(): ExtractorResult;
9
+ }
@@ -0,0 +1,9 @@
1
+ import { BaseExtractor } from './_base';
2
+ import { ConversationMessage, ConversationMetadata, Footnote, ExtractorResult } from '../types/extractors';
3
+ export declare abstract class ConversationExtractor extends BaseExtractor {
4
+ protected abstract extractMessages(): ConversationMessage[];
5
+ protected abstract getMetadata(): ConversationMetadata;
6
+ protected getFootnotes(): Footnote[];
7
+ extract(): ExtractorResult;
8
+ protected createContentHtml(messages: ConversationMessage[], footnotes: Footnote[]): string;
9
+ }
@@ -0,0 +1,13 @@
1
+ import { ConversationExtractor } from './_conversation';
2
+ import { ConversationMessage, ConversationMetadata, Footnote } from '../types/extractors';
3
+ export declare class ChatGPTExtractor extends ConversationExtractor {
4
+ private articles;
5
+ private footnotes;
6
+ private footnoteCounter;
7
+ constructor(document: Document, url: string);
8
+ canExtract(): boolean;
9
+ protected extractMessages(): ConversationMessage[];
10
+ protected getFootnotes(): Footnote[];
11
+ protected getMetadata(): ConversationMetadata;
12
+ private getTitle;
13
+ }
@@ -0,0 +1,10 @@
1
+ import { ConversationExtractor } from './_conversation';
2
+ import { ConversationMessage, ConversationMetadata } from '../types/extractors';
3
+ export declare class ClaudeExtractor extends ConversationExtractor {
4
+ private articles;
5
+ constructor(document: Document, url: string);
6
+ canExtract(): boolean;
7
+ protected extractMessages(): ConversationMessage[];
8
+ protected getMetadata(): ConversationMetadata;
9
+ private getTitle;
10
+ }
@@ -0,0 +1,21 @@
1
+ import { BaseExtractor } from './_base';
2
+ import { ExtractorResult } from '../types/extractors';
3
+ export declare class HackerNewsExtractor extends BaseExtractor {
4
+ private mainPost;
5
+ private isCommentPage;
6
+ private mainComment;
7
+ constructor(document: Document, url: string);
8
+ private detectCommentPage;
9
+ private findMainComment;
10
+ canExtract(): boolean;
11
+ extract(): ExtractorResult;
12
+ private createContentHtml;
13
+ private getPostContent;
14
+ private extractComments;
15
+ private processComments;
16
+ private getPostId;
17
+ private getPostTitle;
18
+ private getPostAuthor;
19
+ private createDescription;
20
+ private getPostDate;
21
+ }
@@ -0,0 +1,16 @@
1
+ import { BaseExtractor } from './_base';
2
+ import { ExtractorResult } from '../types/extractors';
3
+ export declare class RedditExtractor extends BaseExtractor {
4
+ private shredditPost;
5
+ constructor(document: Document, url: string);
6
+ canExtract(): boolean;
7
+ extract(): ExtractorResult;
8
+ private getPostContent;
9
+ private createContentHtml;
10
+ private extractComments;
11
+ private getPostId;
12
+ private getSubreddit;
13
+ private getPostAuthor;
14
+ private createDescription;
15
+ private processComments;
16
+ }
@@ -0,0 +1,16 @@
1
+ import { BaseExtractor } from './_base';
2
+ import { ExtractorResult } from '../types/extractors';
3
+ export declare class TwitterExtractor extends BaseExtractor {
4
+ private mainTweet;
5
+ private threadTweets;
6
+ constructor(document: Document, url: string);
7
+ canExtract(): boolean;
8
+ extract(): ExtractorResult;
9
+ private formatTweetText;
10
+ private extractTweet;
11
+ private extractUserInfo;
12
+ private extractImages;
13
+ private getTweetId;
14
+ private getTweetAuthor;
15
+ private createDescription;
16
+ }
@@ -0,0 +1,12 @@
1
+ import { BaseExtractor } from './_base';
2
+ import { ExtractorResult } from '../types/extractors';
3
+ export declare class YoutubeExtractor extends BaseExtractor {
4
+ private videoElement;
5
+ protected schemaOrgData: any;
6
+ constructor(document: Document, url: string, schemaOrgData?: any);
7
+ canExtract(): boolean;
8
+ extract(): ExtractorResult;
9
+ private formatDescription;
10
+ private getVideoData;
11
+ private getVideoId;
12
+ }
package/dist/index.d.ts CHANGED
@@ -1,2 +1,3 @@
1
- export { Defuddle } from './defuddle';
1
+ import { Defuddle } from './defuddle';
2
2
  export type { DefuddleOptions, DefuddleResponse, DefuddleMetadata } from './types';
3
+ export default Defuddle;
@@ -1,3 +1,4 @@
1
1
  import { Defuddle } from './defuddle';
2
2
  import { DefuddleOptions, DefuddleResponse } from './types';
3
- export { Defuddle, DefuddleOptions, DefuddleResponse };
3
+ export type { DefuddleOptions, DefuddleResponse };
4
+ export default Defuddle;