@mz1999/defuddle 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +371 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +145 -0
- package/dist/cli.js.map +1 -0
- package/dist/constants.d.ts +24 -0
- package/dist/constants.js +950 -0
- package/dist/constants.js.map +1 -0
- package/dist/defuddle.d.ts +136 -0
- package/dist/defuddle.js +1816 -0
- package/dist/defuddle.js.map +1 -0
- package/dist/elements/callouts.d.ts +6 -0
- package/dist/elements/callouts.js +74 -0
- package/dist/elements/callouts.js.map +1 -0
- package/dist/elements/code.d.ts +5 -0
- package/dist/elements/code.js +346 -0
- package/dist/elements/code.js.map +1 -0
- package/dist/elements/footnotes.d.ts +5 -0
- package/dist/elements/footnotes.js +619 -0
- package/dist/elements/footnotes.js.map +1 -0
- package/dist/elements/headings.d.ts +11 -0
- package/dist/elements/headings.js +100 -0
- package/dist/elements/headings.js.map +1 -0
- package/dist/elements/images.d.ts +8 -0
- package/dist/elements/images.js +877 -0
- package/dist/elements/images.js.map +1 -0
- package/dist/elements/math.base.d.ts +9 -0
- package/dist/elements/math.base.js +195 -0
- package/dist/elements/math.base.js.map +1 -0
- package/dist/elements/math.core.d.ts +7 -0
- package/dist/elements/math.core.js +52 -0
- package/dist/elements/math.core.js.map +1 -0
- package/dist/elements/math.d.ts +2 -0
- package/dist/elements/math.full.d.ts +8 -0
- package/dist/elements/math.js +7 -0
- package/dist/elements/math.js.map +1 -0
- package/dist/extractor-registry.d.ts +16 -0
- package/dist/extractor-registry.js +140 -0
- package/dist/extractor-registry.js.map +1 -0
- package/dist/extractors/_base.d.ts +22 -0
- package/dist/extractors/_base.js +27 -0
- package/dist/extractors/_base.js.map +1 -0
- package/dist/extractors/_conversation.d.ts +9 -0
- package/dist/extractors/_conversation.js +78 -0
- package/dist/extractors/_conversation.js.map +1 -0
- package/dist/extractors/chatgpt.d.ts +14 -0
- package/dist/extractors/chatgpt.js +138 -0
- package/dist/extractors/chatgpt.js.map +1 -0
- package/dist/extractors/claude.d.ts +10 -0
- package/dist/extractors/claude.js +91 -0
- package/dist/extractors/claude.js.map +1 -0
- package/dist/extractors/gemini.d.ts +14 -0
- package/dist/extractors/gemini.js +111 -0
- package/dist/extractors/gemini.js.map +1 -0
- package/dist/extractors/github.d.ts +20 -0
- package/dist/extractors/github.js +251 -0
- package/dist/extractors/github.js.map +1 -0
- package/dist/extractors/grok.d.ts +15 -0
- package/dist/extractors/grok.js +142 -0
- package/dist/extractors/grok.js.map +1 -0
- package/dist/extractors/hackernews.d.ts +21 -0
- package/dist/extractors/hackernews.js +155 -0
- package/dist/extractors/hackernews.js.map +1 -0
- package/dist/extractors/reddit.d.ts +22 -0
- package/dist/extractors/reddit.js +197 -0
- package/dist/extractors/reddit.js.map +1 -0
- package/dist/extractors/twitter.d.ts +16 -0
- package/dist/extractors/twitter.js +204 -0
- package/dist/extractors/twitter.js.map +1 -0
- package/dist/extractors/x-article.d.ts +24 -0
- package/dist/extractors/x-article.js +267 -0
- package/dist/extractors/x-article.js.map +1 -0
- package/dist/extractors/x-oembed.d.ts +20 -0
- package/dist/extractors/x-oembed.js +350 -0
- package/dist/extractors/x-oembed.js.map +1 -0
- package/dist/extractors/youtube.d.ts +87 -0
- package/dist/extractors/youtube.js +869 -0
- package/dist/extractors/youtube.js.map +1 -0
- package/dist/fetch.d.ts +18 -0
- package/dist/fetch.js +265 -0
- package/dist/fetch.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.full.d.ts +12 -0
- package/dist/index.full.js +1 -0
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -0
- package/dist/markdown.d.ts +30 -0
- package/dist/markdown.js +661 -0
- package/dist/markdown.js.map +1 -0
- package/dist/metadata.d.ts +25 -0
- package/dist/metadata.js +426 -0
- package/dist/metadata.js.map +1 -0
- package/dist/node.d.ts +19 -0
- package/dist/node.js +78 -0
- package/dist/node.js.map +1 -0
- package/dist/scoring.d.ts +31 -0
- package/dist/scoring.js +472 -0
- package/dist/scoring.js.map +1 -0
- package/dist/standardize.d.ts +2 -0
- package/dist/standardize.js +1101 -0
- package/dist/standardize.js.map +1 -0
- package/dist/types/extractors.d.ts +41 -0
- package/dist/types/extractors.js +3 -0
- package/dist/types/extractors.js.map +1 -0
- package/dist/types.d.ts +135 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/comments.d.ts +44 -0
- package/dist/utils/comments.js +103 -0
- package/dist/utils/comments.js.map +1 -0
- package/dist/utils/dom.d.ts +42 -0
- package/dist/utils/dom.js +104 -0
- package/dist/utils/dom.js.map +1 -0
- package/dist/utils/linkedom-compat.d.ts +5 -0
- package/dist/utils/linkedom-compat.js +23 -0
- package/dist/utils/linkedom-compat.js.map +1 -0
- package/dist/utils/transcript.d.ts +37 -0
- package/dist/utils/transcript.js +61 -0
- package/dist/utils/transcript.js.map +1 -0
- package/dist/utils.d.ts +13 -0
- package/dist/utils.js +98 -0
- package/dist/utils.js.map +1 -0
- package/package.json +107 -0
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.RedditExtractor = void 0;
|
|
4
|
+
const _base_1 = require("./_base");
|
|
5
|
+
const dom_1 = require("../utils/dom");
|
|
6
|
+
const comments_1 = require("../utils/comments");
|
|
7
|
+
/**
 * Extractor for Reddit post pages.
 *
 * Two markups are handled:
 *  - "new" Reddit, recognised by a `<shreddit-post>` element, with comments
 *    read from `<shreddit-comment>` elements;
 *  - "old" Reddit, recognised by a `.thing.link` element, with comments read
 *    from nested `.thing.comment` elements.
 *
 * When a new-Reddit comments page contains no `<shreddit-comment>` elements,
 * `extract()` returns empty content and `extractAsync()` can be used to fetch
 * and parse the old.reddit.com version of the same URL instead.
 *
 * NOTE(review): `this.document`, `this.url` and `this.options` are assumed to
 * be provided by BaseExtractor, which is not visible here - confirm.
 */
class RedditExtractor extends _base_1.BaseExtractor {
    /**
     * @param document Document the post is read from.
     * @param url URL of the page; used for the comments-page check, the post id,
     *            the subreddit name and the old.reddit.com fallback request.
     */
    constructor(document, url) {
        super(document, url);
        // Post element of the new Reddit markup (null when absent).
        this.shredditPost = document.querySelector('shreddit-post');
        // Old Reddit is detected by the presence of a `.thing.link` element.
        this.isOldReddit = !!document.querySelector('.thing.link');
    }
    /**
     * @returns true when the document contains either a new-Reddit post element
     *          or old-Reddit post markup.
     */
    canExtract() {
        return !!this.shredditPost || this.isOldReddit;
    }
    /**
     * @returns true for comment pages that are not already old Reddit.
     */
    canExtractAsync() {
        // For new reddit comment pages, extract() returns empty content
        // when shreddit-comment elements are missing (server-side fetch),
        // causing parseAsync() to fall through to this async path.
        return this.isCommentsPage() && !this.isOldReddit;
    }
    /**
     * @returns true when the URL contains `/r/<something>/comments/`.
     */
    isCommentsPage() {
        return /\/r\/.+\/comments\//.test(this.url);
    }
    /**
     * Fetches the old.reddit.com version of the current URL, parses it and
     * extracts the post from the resulting document.
     *
     * @returns the result of `extractOldReddit()` on the fetched document.
     * @throws Error when the response status is not OK, or when no DOMParser
     *         constructor can be found on the document's window or globally.
     */
    async extractAsync() {
        // Convert URL to old.reddit.com
        const oldUrl = new URL(this.url);
        oldUrl.hostname = 'old.reddit.com';
        const response = await fetch(oldUrl.toString(), {
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible; Defuddle/1.0)',
            },
        });
        if (!response.ok) {
            throw new Error(`Failed to fetch old.reddit.com: ${response.status}`);
        }
        const html = await response.text();
        // Prefer the DOMParser belonging to the document's own window, then the global one.
        const Parser = this.document.defaultView?.DOMParser ?? (typeof DOMParser !== 'undefined' ? DOMParser : null);
        if (!Parser) {
            throw new Error('DOMParser is not available in this environment');
        }
        const doc = new Parser().parseFromString(html, 'text/html');
        return this.extractOldReddit(doc);
    }
    /**
     * Synchronous extraction from the current document.
     *
     * @returns an object with `content`/`contentHtml` and, unless the empty
     *          fallback result is returned, `extractedContent` (post id,
     *          subreddit, author) and `variables` (title, author, site,
     *          description).
     */
    extract() {
        if (this.isOldReddit) {
            return this.extractOldReddit(this.document);
        }
        // New reddit server-side HTML includes shreddit-post but not
        // shreddit-comment elements (those require JS). Return empty
        // so parseAsync() falls through to extractAsync() which fetches
        // old.reddit.com with full content.
        const hasComments = this.document.querySelectorAll('shreddit-comment').length > 0;
        if (this.isCommentsPage() && !hasComments) {
            return { content: '', contentHtml: '' };
        }
        const postContent = this.getPostContent();
        // Comments are included unless the caller explicitly set includeReplies to false.
        const comments = this.options.includeReplies !== false ? this.extractComments() : '';
        const contentHtml = this.createContentHtml(postContent, comments);
        const postTitle = this.document.querySelector('h1')?.textContent?.trim() || '';
        const subreddit = this.getSubreddit();
        const postAuthor = this.getPostAuthor();
        const description = this.createDescription(postContent);
        return {
            content: contentHtml,
            contentHtml: contentHtml,
            extractedContent: {
                postId: this.getPostId(),
                subreddit,
                postAuthor,
            },
            variables: {
                title: postTitle,
                author: postAuthor,
                site: `r/${subreddit}`,
                description,
            }
        };
    }
    /**
     * Extracts a post and its comments from old-Reddit markup.
     *
     * @param root Document (or other node supporting querySelector) holding the
     *             old-Reddit page.
     * @returns the same result shape as `extract()`. Title, author and subreddit
     *          come from the `.thing.link` element; the post id still comes
     *          from `this.url`.
     */
    extractOldReddit(root) {
        const thingLink = root.querySelector('.thing.link');
        const postTitle = thingLink?.querySelector('a.title')?.textContent?.trim() || '';
        const postAuthor = thingLink?.getAttribute('data-author') || '';
        const subreddit = thingLink?.getAttribute('data-subreddit') || '';
        const postBodyEl = thingLink?.querySelector('.usertext-body .md');
        const postBody = postBodyEl ? (0, dom_1.serializeHTML)(postBodyEl) : '';
        let comments = '';
        if (this.options.includeReplies !== false) {
            const commentArea = root.querySelector('.commentarea .sitetable');
            const commentData = commentArea ? this.collectOldRedditComments(commentArea) : [];
            comments = commentData.length > 0 ? (0, comments_1.buildCommentTree)(commentData) : '';
        }
        const contentHtml = this.createContentHtml(postBody, comments);
        const description = this.createDescription(postBody);
        return {
            content: contentHtml,
            contentHtml: contentHtml,
            extractedContent: {
                postId: this.getPostId(),
                subreddit,
                postAuthor,
            },
            variables: {
                title: postTitle,
                author: postAuthor,
                site: `r/${subreddit}`,
                description,
            }
        };
    }
    /**
     * @returns the serialized text body of the new-Reddit post followed by the
     *          outer HTML of its `#post-image` element; either part may be ''.
     */
    getPostContent() {
        const textBodyEl = this.shredditPost?.querySelector('[slot="text-body"]');
        const textBody = textBodyEl ? (0, dom_1.serializeHTML)(textBodyEl) : '';
        const mediaBody = this.shredditPost?.querySelector('#post-image')?.outerHTML || '';
        return textBody + mediaBody;
    }
    /**
     * Combines post HTML and comment HTML by delegating to `buildContentHtml`
     * with the site key 'reddit'.
     *
     * @param postContent HTML of the post body.
     * @param comments HTML of the comments ('' when there are none).
     */
    createContentHtml(postContent, comments) {
        return (0, comments_1.buildContentHtml)('reddit', postContent, comments);
    }
    /**
     * @returns the comment HTML built from every `<shreddit-comment>` element
     *          in the document.
     */
    extractComments() {
        const comments = Array.from(this.document.querySelectorAll('shreddit-comment'));
        return this.processComments(comments);
    }
    /**
     * @returns the alphanumeric id following `comments/` in the URL, or ''.
     */
    getPostId() {
        const match = this.url.match(/comments\/([a-zA-Z0-9]+)/);
        return match?.[1] || '';
    }
    /**
     * @returns the path segment following `/r/` in the URL, or ''.
     */
    getSubreddit() {
        const match = this.url.match(/\/r\/([^/]+)/);
        return match?.[1] || '';
    }
    /**
     * @returns the `author` attribute of the new-Reddit post element, or ''.
     */
    getPostAuthor() {
        return this.shredditPost?.getAttribute('author') || '';
    }
    /**
     * Builds a plain-text description from post HTML.
     *
     * Whitespace is collapsed before truncating so that indentation and line
     * breaks in the serialized HTML do not consume the 140-character budget.
     *
     * @param postContent HTML of the post body.
     * @returns at most 140 characters of whitespace-normalised text, or ''.
     */
    createDescription(postContent) {
        if (!postContent)
            return '';
        const tempDiv = this.document.createElement('div');
        tempDiv.appendChild((0, dom_1.parseHTML)(this.document, postContent));
        const text = tempDiv.textContent || '';
        return text.replace(/\s+/g, ' ').trim().slice(0, 140).trimEnd();
    }
    /**
     * Converts a timestamp string to a `YYYY-MM-DD` date.
     *
     * `Date.prototype.toISOString()` throws a RangeError for an invalid date,
     * so unparsable input is mapped to '' instead of aborting the extraction.
     *
     * @param timestamp Timestamp string taken from the page; may be empty.
     * @returns the date part of the ISO representation, or '' when the value is
     *          empty or cannot be parsed.
     */
    toIsoDate(timestamp) {
        if (!timestamp)
            return '';
        const parsed = new Date(timestamp);
        return Number.isNaN(parsed.getTime()) ? '' : parsed.toISOString().split('T')[0];
    }
    /**
     * Recursively collects old-Reddit comments in document order.
     *
     * @param container Element whose direct `.thing.comment` children are read.
     * @param depth Nesting depth assigned to those direct children.
     * @returns a flat array of comment records (author, date, content, depth,
     *          score, url); each comment is followed by its replies at depth + 1.
     */
    collectOldRedditComments(container, depth = 0) {
        const result = [];
        // `:scope >` restricts the match to direct children so replies are only
        // picked up by the recursive call below.
        const comments = Array.from(container.querySelectorAll(':scope > .thing.comment'));
        for (const comment of comments) {
            const author = comment.getAttribute('data-author') || '';
            const permalink = comment.getAttribute('data-permalink') || '';
            const score = comment.querySelector('.entry .tagline .score.unvoted')?.textContent?.trim() || '';
            const timeEl = comment.querySelector('.entry .tagline time[datetime]');
            const date = this.toIsoDate(timeEl?.getAttribute('datetime') || '');
            const bodyEl = comment.querySelector('.entry .usertext-body .md');
            const body = bodyEl ? (0, dom_1.serializeHTML)(bodyEl) : '';
            result.push({
                author,
                date,
                content: body,
                depth,
                score: score || undefined,
                url: permalink ? `https://reddit.com${permalink}` : undefined,
            });
            const childContainer = comment.querySelector('.child > .sitetable');
            if (childContainer) {
                result.push(...this.collectOldRedditComments(childContainer, depth + 1));
            }
        }
        return result;
    }
    /**
     * Converts new-Reddit `<shreddit-comment>` elements to comment HTML.
     *
     * @param comments Comment elements, in the order they should appear.
     * @returns the value produced by `buildCommentTree` for the collected records.
     */
    processComments(comments) {
        const commentData = [];
        for (const comment of comments) {
            // Explicit radix, and fall back to 0 when the attribute is not numeric.
            const parsedDepth = Number.parseInt(comment.getAttribute('depth') || '0', 10);
            const depth = Number.isNaN(parsedDepth) ? 0 : parsedDepth;
            const author = comment.getAttribute('author') || '';
            const score = comment.getAttribute('score') || '0';
            const permalink = comment.getAttribute('permalink') || '';
            const commentEl = comment.querySelector('[slot="comment"]');
            const content = commentEl ? (0, dom_1.serializeHTML)(commentEl) : '';
            // Prefer the element's `created` attribute, then a nested <time datetime>.
            const timestamp = comment.getAttribute('created')
                || comment.querySelector('time')?.getAttribute('datetime')
                || '';
            const date = this.toIsoDate(timestamp);
            commentData.push({
                author,
                date,
                content,
                depth,
                score: `${score} points`,
                url: permalink ? `https://reddit.com${permalink}` : undefined,
            });
        }
        return (0, comments_1.buildCommentTree)(commentData);
    }
}
|
|
196
|
+
exports.RedditExtractor = RedditExtractor;
|
|
197
|
+
//# sourceMappingURL=reddit.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"reddit.js","sourceRoot":"","sources":["../../src/extractors/reddit.ts"],"names":[],"mappings":";;;AAAA,mCAAwC;AAExC,sCAAwD;AACxD,gDAAyF;AAEzF,MAAa,eAAgB,SAAQ,qBAAa;IAIjD,YAAY,QAAkB,EAAE,GAAW;QAC1C,KAAK,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;QACrB,IAAI,CAAC,YAAY,GAAG,QAAQ,CAAC,aAAa,CAAC,eAAe,CAAC,CAAC;QAC5D,IAAI,CAAC,WAAW,GAAG,CAAC,CAAC,QAAQ,CAAC,aAAa,CAAC,aAAa,CAAC,CAAC;IAC5D,CAAC;IAED,UAAU;QACT,OAAO,CAAC,CAAC,IAAI,CAAC,YAAY,IAAI,IAAI,CAAC,WAAW,CAAC;IAChD,CAAC;IAED,eAAe;QACd,gEAAgE;QAChE,kEAAkE;QAClE,2DAA2D;QAC3D,OAAO,IAAI,CAAC,cAAc,EAAE,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC;IACnD,CAAC;IAEO,cAAc;QACrB,OAAO,qBAAqB,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC7C,CAAC;IAED,KAAK,CAAC,YAAY;QACjB,gCAAgC;QAChC,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACjC,MAAM,CAAC,QAAQ,GAAG,gBAAgB,CAAC;QAEnC,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,MAAM,CAAC,QAAQ,EAAE,EAAE;YAC/C,OAAO,EAAE;gBACR,YAAY,EAAE,wCAAwC;aACtD;SACD,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YAClB,MAAM,IAAI,KAAK,CAAC,mCAAmC,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;QACvE,CAAC;QAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QACnC,MAAM,MAAM,GAAG,IAAI,CAAC,QAAQ,CAAC,WAAW,EAAE,SAAS,IAAI,CAAC,OAAO,SAAS,KAAK,WAAW,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QAC7G,IAAI,CAAC,MAAM,EAAE,CAAC;YACb,MAAM,IAAI,KAAK,CAAC,gDAAgD,CAAC,CAAC;QACnE,CAAC;QACD,MAAM,GAAG,GAAG,IAAI,MAAM,EAAE,CAAC,eAAe,CAAC,IAAI,EAAE,WAAW,CAAC,CAAC;QAE5D,OAAO,IAAI,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC;IACnC,CAAC;IAED,OAAO;QACN,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YACtB,OAAO,IAAI,CAAC,gBAAgB,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC7C,CAAC;QAED,6DAA6D;QAC7D,6DAA6D;QAC7D,gEAAgE;QAChE,oCAAoC;QACpC,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,kBAAkB,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;QAClF,IAAI,IAAI,CAAC,cAAc,EAAE,IAAI,CAAC,WAAW,EAAE,CAAC;YAC3C,OAAO,EAAE,OAAO,EAAE,EAAE,EAAE,WAAW,EAAE,EAAE,EAAE,CAAC;QACzC,CAAC;QAED,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,EAAE,CAAC;QAC1C,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,KAAK,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,eAAe,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAErF,MAA
M,WAAW,GAAG,IAAI,CAAC,iBAAiB,CAAC,WAAW,EAAE,QAAQ,CAAC,CAAC;QAClE,MAAM,SAAS,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,IAAI,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAC/E,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC;QACtC,MAAM,UAAU,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;QACxC,MAAM,WAAW,GAAG,IAAI,CAAC,iBAAiB,CAAC,WAAW,CAAC,CAAC;QAExD,OAAO;YACN,OAAO,EAAE,WAAW;YACpB,WAAW,EAAE,WAAW;YACxB,gBAAgB,EAAE;gBACjB,MAAM,EAAE,IAAI,CAAC,SAAS,EAAE;gBACxB,SAAS;gBACT,UAAU;aACV;YACD,SAAS,EAAE;gBACV,KAAK,EAAE,SAAS;gBAChB,MAAM,EAAE,UAAU;gBAClB,IAAI,EAAE,KAAK,SAAS,EAAE;gBACtB,WAAW;aACX;SACD,CAAC;IACH,CAAC;IAEO,gBAAgB,CAAC,IAAwB;QAChD,MAAM,SAAS,GAAG,IAAI,CAAC,aAAa,CAAC,aAAa,CAAC,CAAC;QACpD,MAAM,SAAS,GAAG,SAAS,EAAE,aAAa,CAAC,SAAS,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QACjF,MAAM,UAAU,GAAG,SAAS,EAAE,YAAY,CAAC,aAAa,CAAC,IAAI,EAAE,CAAC;QAChE,MAAM,SAAS,GAAG,SAAS,EAAE,YAAY,CAAC,gBAAgB,CAAC,IAAI,EAAE,CAAC;QAClE,MAAM,UAAU,GAAG,SAAS,EAAE,aAAa,CAAC,oBAAoB,CAAC,CAAC;QAClE,MAAM,QAAQ,GAAG,UAAU,CAAC,CAAC,CAAC,IAAA,mBAAa,EAAC,UAAU,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAE7D,IAAI,QAAQ,GAAG,EAAE,CAAC;QAClB,IAAI,IAAI,CAAC,OAAO,CAAC,cAAc,KAAK,KAAK,EAAE,CAAC;YAC3C,MAAM,WAAW,GAAG,IAAI,CAAC,aAAa,CAAC,yBAAyB,CAAC,CAAC;YAClE,MAAM,WAAW,GAAG,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,wBAAwB,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAClF,QAAQ,GAAG,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAA,2BAAgB,EAAC,WAAW,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QACxE,CAAC;QAED,MAAM,WAAW,GAAG,IAAI,CAAC,iBAAiB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;QAC/D,MAAM,WAAW,GAAG,IAAI,CAAC,iBAAiB,CAAC,QAAQ,CAAC,CAAC;QAErD,OAAO;YACN,OAAO,EAAE,WAAW;YACpB,WAAW,EAAE,WAAW;YACxB,gBAAgB,EAAE;gBACjB,MAAM,EAAE,IAAI,CAAC,SAAS,EAAE;gBACxB,SAAS;gBACT,UAAU;aACV;YACD,SAAS,EAAE;gBACV,KAAK,EAAE,SAAS;gBAChB,MAAM,EAAE,UAAU;gBAClB,IAAI,EAAE,KAAK,SAAS,EAAE;gBACtB,WAAW;aACX;SACD,CAAC;IACH,CAAC;IAEO,cAAc;QACrB,MAAM,UAAU,GAAG,IAAI,CAAC,YAAY,EAAE,aAAa,CAAC,oBAAoB,CAAC,CAAC;QAC1E,MAAM,QAAQ,GAAG,UAAU,CAAC,CAAC,CAAC,IAAA,mBAAa,EAAC,UAAU,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC7D,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,EAAE,aAAa,CAAC,aAAa
,CAAC,EAAE,SAAS,IAAI,EAAE,CAAC;QAEnF,OAAO,QAAQ,GAAG,SAAS,CAAC;IAC7B,CAAC;IAEO,iBAAiB,CAAC,WAAmB,EAAE,QAAgB;QAC9D,OAAO,IAAA,2BAAgB,EAAC,QAAQ,EAAE,WAAW,EAAE,QAAQ,CAAC,CAAC;IAC1D,CAAC;IAEO,eAAe;QACtB,MAAM,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,kBAAkB,CAAC,CAAC,CAAC;QAChF,OAAO,IAAI,CAAC,eAAe,CAAC,QAAQ,CAAC,CAAC;IACvC,CAAC;IAEO,SAAS;QAChB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,0BAA0B,CAAC,CAAC;QACzD,OAAO,KAAK,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACzB,CAAC;IAEO,YAAY;QACnB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,cAAc,CAAC,CAAC;QAC7C,OAAO,KAAK,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACzB,CAAC;IAEO,aAAa;QACpB,OAAO,IAAI,CAAC,YAAY,EAAE,YAAY,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;IACxD,CAAC;IAEO,iBAAiB,CAAC,WAAmB;QAC5C,IAAI,CAAC,WAAW;YAAE,OAAO,EAAE,CAAC;QAE5B,MAAM,OAAO,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;QACnD,OAAO,CAAC,WAAW,CAAC,IAAA,eAAS,EAAC,IAAI,CAAC,QAAQ,EAAE,WAAW,CAAC,CAAC,CAAC;QAC3D,OAAO,OAAO,CAAC,WAAW,EAAE,IAAI,EAAE;aAChC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC;aACb,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,IAAI,EAAE,CAAC;IAC9B,CAAC;IAEO,wBAAwB,CAAC,SAAkB,EAAE,QAAgB,CAAC;QACrE,MAAM,MAAM,GAAkB,EAAE,CAAC;QACjC,MAAM,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,gBAAgB,CAAC,yBAAyB,CAAC,CAAC,CAAC;QAEnF,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAChC,MAAM,MAAM,GAAG,OAAO,CAAC,YAAY,CAAC,aAAa,CAAC,IAAI,EAAE,CAAC;YACzD,MAAM,SAAS,GAAG,OAAO,CAAC,YAAY,CAAC,gBAAgB,CAAC,IAAI,EAAE,CAAC;YAC/D,MAAM,KAAK,GAAG,OAAO,CAAC,aAAa,CAAC,gCAAgC,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YACjG,MAAM,MAAM,GAAG,OAAO,CAAC,aAAa,CAAC,gCAAgC,CAAC,CAAC;YACvE,MAAM,QAAQ,GAAG,MAAM,EAAE,YAAY,CAAC,UAAU,CAAC,IAAI,EAAE,CAAC;YACxD,MAAM,IAAI,GAAG,QAAQ,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAC5E,MAAM,MAAM,GAAG,OAAO,CAAC,aAAa,CAAC,2BAA2B,CAAC,CAAC;YAClE,MAAM,IAAI,GAAG,MAAM,CAAC,CAAC,CAAC,IAAA,mBAAa,EAAC,MAAM,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAEjD,MAAM,CAAC,IAAI,CAAC;gBACX,MAAM;gBACN,IAAI;gBACJ,OAAO,EAAE,IAAI;gBACb,KAAK;gBACL,KAAK,EAAE,KAA
K,IAAI,SAAS;gBACzB,GAAG,EAAE,SAAS,CAAC,CAAC,CAAC,qBAAqB,SAAS,EAAE,CAAC,CAAC,CAAC,SAAS;aAC7D,CAAC,CAAC;YAEH,MAAM,cAAc,GAAG,OAAO,CAAC,aAAa,CAAC,qBAAqB,CAAC,CAAC;YACpE,IAAI,cAAc,EAAE,CAAC;gBACpB,MAAM,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,wBAAwB,CAAC,cAAc,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC;YAC1E,CAAC;QACF,CAAC;QAED,OAAO,MAAM,CAAC;IACf,CAAC;IAEO,eAAe,CAAC,QAAmB;QAC1C,MAAM,WAAW,GAAkB,EAAE,CAAC;QAEtC,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAChC,MAAM,KAAK,GAAG,QAAQ,CAAC,OAAO,CAAC,YAAY,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC,CAAC;YAC7D,MAAM,MAAM,GAAG,OAAO,CAAC,YAAY,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;YACpD,MAAM,KAAK,GAAG,OAAO,CAAC,YAAY,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC;YACnD,MAAM,SAAS,GAAG,OAAO,CAAC,YAAY,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC;YAC1D,MAAM,SAAS,GAAG,OAAO,CAAC,aAAa,CAAC,kBAAkB,CAAC,CAAC;YAC5D,MAAM,OAAO,GAAG,SAAS,CAAC,CAAC,CAAC,IAAA,mBAAa,EAAC,SAAS,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAE1D,MAAM,SAAS,GAAG,OAAO,CAAC,YAAY,CAAC,SAAS,CAAC;mBAC7C,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,EAAE,YAAY,CAAC,UAAU,CAAC;mBACvD,EAAE,CAAC;YACP,MAAM,IAAI,GAAG,SAAS,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC,SAAS,CAAC,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAE9E,WAAW,CAAC,IAAI,CAAC;gBAChB,MAAM;gBACN,IAAI;gBACJ,OAAO;gBACP,KAAK;gBACL,KAAK,EAAE,GAAG,KAAK,SAAS;gBACxB,GAAG,EAAE,SAAS,CAAC,CAAC,CAAC,qBAAqB,SAAS,EAAE,CAAC,CAAC,CAAC,SAAS;aAC7D,CAAC,CAAC;QACJ,CAAC;QAED,OAAO,IAAA,2BAAgB,EAAC,WAAW,CAAC,CAAC;IACtC,CAAC;CACD;AAlOD,0CAkOC"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { BaseExtractor } from './_base';
|
|
2
|
+
import { ExtractorResult } from '../types/extractors';
|
|
3
|
+
/**
 * Extractor for X (Twitter) conversation pages.
 *
 * The first tweet found becomes the main tweet; the remaining tweets of the
 * conversation timeline are treated as the thread.
 */
export declare class TwitterExtractor extends BaseExtractor {
    /** First tweet article found in the document, or null when there is none. */
    private mainTweet;
    /** Tweet articles following the main tweet in the conversation timeline. */
    private threadTweets;
    /**
     * @param document Document the tweets are located in.
     * @param url URL of the page; the tweet id is read from its `status/<digits>` part.
     */
    constructor(document: Document, url: string);
    /** @returns true when a main tweet was found in the document. */
    canExtract(): boolean;
    /** @returns the thread as HTML together with tweet id, author and description. */
    extract(): ExtractorResult;
    /** Turns serialized tweet-text HTML into a sequence of `<p>` paragraphs. */
    private formatTweetText;
    /** Renders one tweet (header, text, media, quoted tweet) as HTML. */
    private extractTweet;
    /** Reads display name, handle, date and permalink from a tweet. */
    private extractUserInfo;
    /** Collects `<img>` markup for a tweet's media, skipping quoted-tweet images. */
    private extractImages;
    /** Reads the numeric tweet id from the URL. */
    private getTweetId;
    /** Reads the main tweet's handle, prefixed with `@`. */
    private getTweetAuthor;
    /** Builds a short plain-text description from a tweet's text. */
    private createDescription;
}
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.TwitterExtractor = void 0;
|
|
4
|
+
const _base_1 = require("./_base");
|
|
5
|
+
const dom_1 = require("../utils/dom");
|
|
6
|
+
/**
 * Extractor for X (Twitter) conversation pages.
 *
 * The constructor locates the tweets up front: the first tweet becomes the
 * main tweet and the following tweets of the conversation timeline become the
 * thread. `extract()` renders them as HTML.
 *
 * NOTE(review): `this.document`, `this.url` and `this.options` are assumed to
 * be provided by BaseExtractor, which is not visible here - confirm.
 */
class TwitterExtractor extends _base_1.BaseExtractor {
    /**
     * @param document Document the tweets are located in.
     * @param url URL of the page; the tweet id is read from its `status/<digits>` part.
     */
    constructor(document, url) {
        super(document, url);
        this.mainTweet = null;
        this.threadTweets = [];
        // Get all tweets from the timeline
        const timeline = document.querySelector('[aria-label="Timeline: Conversation"]');
        if (!timeline) {
            // Try to find a single tweet if not in timeline view
            const singleTweet = document.querySelector('article[data-testid="tweet"]');
            if (singleTweet) {
                this.mainTweet = singleTweet;
            }
            return;
        }
        // Get all tweets before any section with "Discover more" or similar headings
        let allTweets = Array.from(timeline.querySelectorAll('article[data-testid="tweet"]'));
        const firstSection = timeline.querySelector('section, h2')?.parentElement;
        if (firstSection) {
            // Read the constant from the node itself and fall back to its DOM-defined
            // value (0x04): the global `Node` does not exist outside a browser, so
            // referencing it would throw a ReferenceError under Node.js-hosted DOMs.
            const following = firstSection.DOCUMENT_POSITION_FOLLOWING ?? 4;
            // Filter out tweets that appear after the first section
            const cutoffIndex = allTweets.findIndex(tweet => (firstSection.compareDocumentPosition(tweet) & following) !== 0);
            if (cutoffIndex !== -1) {
                allTweets = allTweets.slice(0, cutoffIndex);
            }
        }
        // Set main tweet and thread tweets
        this.mainTweet = allTweets[0] || null;
        this.threadTweets = allTweets.slice(1);
    }
    /**
     * @returns true when a main tweet was found in the document.
     */
    canExtract() {
        return !!this.mainTweet;
    }
    /**
     * Renders the main tweet and, unless `options.includeReplies` is false, the
     * thread tweets.
     *
     * @returns an object with `content`/`contentHtml`, `extractedContent`
     *          (tweet id, author) and `variables` (title, author, site,
     *          description).
     */
    extract() {
        const mainContent = this.extractTweet(this.mainTweet);
        const threadContent = this.options.includeReplies !== false
            ? this.threadTweets.map(tweet => this.extractTweet(tweet)).join('\n<hr>\n')
            : '';
        const contentHtml = `
			<div class="tweet-thread">
				<div class="main-tweet">
					${mainContent}
				</div>
				${threadContent ? `
					<hr>
					<div class="thread-tweets">
						${threadContent}
					</div>
				` : ''}
			</div>
		`.trim();
        const tweetId = this.getTweetId();
        const tweetAuthor = this.getTweetAuthor();
        const description = this.createDescription(this.mainTweet);
        return {
            content: contentHtml,
            contentHtml: contentHtml,
            extractedContent: {
                tweetId,
                tweetAuthor,
            },
            variables: {
                title: `Thread by ${tweetAuthor}`,
                author: tweetAuthor,
                site: 'X (Twitter)',
                description,
            }
        };
    }
    /**
     * Escapes text for safe interpolation into HTML element content or a
     * double-quoted attribute value.
     *
     * @param value Raw text, e.g. from `textContent` or `getAttribute`.
     * @returns the text with `&`, `<`, `>` and `"` replaced by entities.
     */
    escapeHtml(value) {
        return String(value)
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;')
            .replace(/"/g, '&quot;');
    }
    /**
     * Turns serialized tweet-text HTML into paragraphs.
     *
     * @param text Serialized HTML of the tweet text element.
     * @returns one `<p>` element per non-empty line, joined by newlines, or ''
     *          for empty input.
     */
    formatTweetText(text) {
        if (!text)
            return '';
        // Create a temporary div to parse and clean the HTML
        const tempDiv = this.document.createElement('div');
        tempDiv.appendChild((0, dom_1.parseHTML)(this.document, text));
        // Convert links to plain text with @ handles
        tempDiv.querySelectorAll('a').forEach(link => {
            const handle = link.textContent?.trim() || '';
            link.replaceWith(handle);
        });
        // Remove unnecessary spans and divs but keep their content
        tempDiv.querySelectorAll('span, div').forEach(element => {
            element.replaceWith(...Array.from(element.childNodes));
        });
        // Get cleaned text and split into paragraphs
        const cleanText = (0, dom_1.serializeHTML)(tempDiv);
        const paragraphs = cleanText.split('\n')
            .map(line => line.trim())
            .filter(line => line);
        // Wrap each paragraph in <p> tags
        return paragraphs.map(p => `<p>${p}</p>`).join('\n');
    }
    /**
     * Renders one tweet as HTML: header (author, handle, dated permalink),
     * text, media and - recursively - a quoted tweet when present.
     *
     * Values read from the page as plain text (names, handles, URLs) are
     * HTML-escaped before interpolation so they cannot break or inject markup.
     *
     * @param tweet Tweet element, or null/undefined.
     * @returns the tweet HTML, or '' when no element is given.
     */
    extractTweet(tweet) {
        if (!tweet)
            return '';
        // Clone the tweet element to modify it
        const tweetClone = tweet.cloneNode(true);
        // Convert emoji images to text
        tweetClone.querySelectorAll('img[src*="/emoji/"]').forEach(img => {
            const altText = img.getAttribute('alt');
            if (img.tagName.toLowerCase() === 'img' && altText) {
                img.replaceWith(altText);
            }
        });
        const tweetTextEl = tweetClone.querySelector('[data-testid="tweetText"]');
        const tweetText = tweetTextEl ? (0, dom_1.serializeHTML)(tweetTextEl) : '';
        const formattedText = this.formatTweetText(tweetText);
        const images = this.extractImages(tweet);
        // Get author info and date
        const userInfo = this.extractUserInfo(tweet);
        const fullName = this.escapeHtml(userInfo.fullName);
        const handle = this.escapeHtml(userInfo.handle);
        const permalink = this.escapeHtml(userInfo.permalink);
        const date = this.escapeHtml(userInfo.date);
        // Extract quoted tweet if present
        const quotedTweet = tweet.querySelector('[aria-labelledby*="id__"]')?.querySelector('[data-testid="User-Name"]')?.closest('[aria-labelledby*="id__"]');
        const quotedContent = quotedTweet ? this.extractTweet(quotedTweet) : '';
        return `
			<div class="tweet">
				<div class="tweet-header">
					<span class="tweet-author"><strong>${fullName}</strong> <span class="tweet-handle">${handle}</span></span>
					${date ? `<a href="${permalink}" class="tweet-date">${date}</a>` : ''}
				</div>
				${formattedText ? `<div class="tweet-text">${formattedText}</div>` : ''}
				${images.length ? `
					<div class="tweet-media">
						${images.join('\n')}
					</div>
				` : ''}
				${quotedContent ? `
					<blockquote class="quoted-tweet">
						${quotedContent}
					</blockquote>
				` : ''}
			</div>
		`.trim();
    }
    /**
     * Reads author and timestamp information from a tweet.
     *
     * @param tweet Tweet element.
     * @returns `{ fullName, handle, date, permalink }` as raw (unescaped) text;
     *          every field is '' when it cannot be determined. `date` is
     *          `YYYY-MM-DD`.
     */
    extractUserInfo(tweet) {
        const nameElement = tweet.querySelector('[data-testid="User-Name"]');
        if (!nameElement)
            return { fullName: '', handle: '', date: '', permalink: '' };
        // Try to get name and handle from links first (main tweet structure)
        const links = nameElement.querySelectorAll('a');
        let fullName = links?.[0]?.textContent?.trim() || '';
        let handle = links?.[1]?.textContent?.trim() || '';
        // If links don't have the info, try to get from spans (quoted tweet structure)
        if (!fullName || !handle) {
            fullName = nameElement.querySelector('span[style*="color: rgb(15, 20, 25)"] span')?.textContent?.trim() || '';
            handle = nameElement.querySelector('span[style*="color: rgb(83, 100, 113)"]')?.textContent?.trim() || '';
        }
        const timestamp = tweet.querySelector('time');
        const datetime = timestamp?.getAttribute('datetime') || '';
        // toISOString() throws a RangeError on an invalid date, so an
        // unparsable `datetime` attribute is mapped to '' instead.
        const parsed = datetime ? new Date(datetime) : null;
        const date = parsed && !Number.isNaN(parsed.getTime()) ? parsed.toISOString().split('T')[0] : '';
        const permalink = timestamp?.closest('a')?.href || '';
        return { fullName, handle, date, permalink };
    }
    /**
     * Collects the images of a tweet as `<img>` markup.
     *
     * Only `<img>` elements with a non-empty `alt` are emitted, images inside a
     * quoted tweet are skipped, and an element matched by more than one
     * selector is emitted once. `src` and `alt` are HTML-escaped.
     *
     * @param tweet Tweet element.
     * @returns an array of `<img ... />` strings.
     */
    extractImages(tweet) {
        // Look for images in different containers
        const imageContainers = [
            '[data-testid="tweetPhoto"]',
            '[data-testid="tweet-image"]',
            'img[src*="media"]'
        ];
        const images = [];
        // Elements already emitted, so overlapping selectors do not duplicate an image.
        const seen = new Set();
        // Skip images that are inside quoted tweets
        const quotedTweet = tweet.querySelector('[aria-labelledby*="id__"]')?.querySelector('[data-testid="User-Name"]')?.closest('[aria-labelledby*="id__"]');
        for (const selector of imageContainers) {
            const elements = tweet.querySelectorAll(selector);
            elements.forEach(img => {
                // Skip if the image is inside a quoted tweet
                if (quotedTweet?.contains(img)) {
                    return;
                }
                // Check if element is an image by checking tag name and required properties
                if (img.tagName.toLowerCase() === 'img' && img.getAttribute('alt')) {
                    if (seen.has(img)) {
                        return;
                    }
                    seen.add(img);
                    // Rewrite a trailing `&name=<size>` query parameter to `&name=large`.
                    const highQualitySrc = img.getAttribute('src')?.replace(/&name=\w+$/, '&name=large') || '';
                    const cleanAlt = img.getAttribute('alt')?.replace(/\s+/g, ' ').trim() || '';
                    images.push(`<img src="${this.escapeHtml(highQualitySrc)}" alt="${this.escapeHtml(cleanAlt)}" />`);
                }
            });
        }
        return images;
    }
    /**
     * @returns the digits following `status/` in the URL, or ''.
     */
    getTweetId() {
        const match = this.url.match(/status\/(\d+)/);
        return match?.[1] || '';
    }
    /**
     * @returns the text of the second link inside the main tweet's user-name
     *          element, prefixed with `@` when it does not already start with one.
     */
    getTweetAuthor() {
        const nameElement = this.mainTweet?.querySelector('[data-testid="User-Name"]');
        const links = nameElement?.querySelectorAll('a');
        const handle = links?.[1]?.textContent?.trim() || '';
        return handle.startsWith('@') ? handle : `@${handle}`;
    }
    /**
     * Builds a plain-text description from a tweet's text.
     *
     * Whitespace is collapsed before truncating so that line breaks do not
     * consume the 140-character budget.
     *
     * @param tweet Tweet element, or null/undefined.
     * @returns at most 140 characters of whitespace-normalised text, or ''.
     */
    createDescription(tweet) {
        if (!tweet)
            return '';
        const tweetText = tweet.querySelector('[data-testid="tweetText"]')?.textContent || '';
        return tweetText.replace(/\s+/g, ' ').trim().slice(0, 140).trimEnd();
    }
}
|
|
203
|
+
exports.TwitterExtractor = TwitterExtractor;
|
|
204
|
+
//# sourceMappingURL=twitter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"twitter.js","sourceRoot":"","sources":["../../src/extractors/twitter.ts"],"names":[],"mappings":";;;AAAA,mCAAwC;AAExC,sCAAwD;AAExD,MAAa,gBAAiB,SAAQ,qBAAa;IAIlD,YAAY,QAAkB,EAAE,GAAW;QAC1C,KAAK,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;QAJd,cAAS,GAAmB,IAAI,CAAC;QACjC,iBAAY,GAAc,EAAE,CAAC;QAKpC,mCAAmC;QACnC,MAAM,QAAQ,GAAG,QAAQ,CAAC,aAAa,CAAC,uCAAuC,CAAC,CAAC;QACjF,IAAI,CAAC,QAAQ,EAAE,CAAC;YACf,qDAAqD;YACrD,MAAM,WAAW,GAAG,QAAQ,CAAC,aAAa,CAAC,8BAA8B,CAAC,CAAC;YAC3E,IAAI,WAAW,EAAE,CAAC;gBACjB,IAAI,CAAC,SAAS,GAAG,WAAW,CAAC;YAC9B,CAAC;YACD,OAAO;QACR,CAAC;QAED,6EAA6E;QAC7E,IAAI,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,8BAA8B,CAAC,CAAC,CAAC;QACtF,MAAM,YAAY,GAAG,QAAQ,CAAC,aAAa,CAAC,aAAa,CAAC,EAAE,aAAa,CAAC;QAE1E,IAAI,YAAY,EAAE,CAAC;YAClB,wDAAwD;YACxD,MAAM,WAAW,GAAG,SAAS,CAAC,SAAS,CAAC,KAAK,CAAC,EAAE,CAC/C,YAAY,CAAC,uBAAuB,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,2BAA2B,CAC9E,CAAC;YACF,IAAI,WAAW,KAAK,CAAC,CAAC,EAAE,CAAC;gBACxB,SAAS,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,WAAW,CAAC,CAAC;YAC7C,CAAC;QACF,CAAC;QAED,mCAAmC;QACnC,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC;QACtC,IAAI,CAAC,YAAY,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IACxC,CAAC;IAED,UAAU;QACT,OAAO,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;IACzB,CAAC;IAED,OAAO;QACN,MAAM,WAAW,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QACtD,MAAM,aAAa,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,KAAK,KAAK;YAC1D,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,IAAI,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC;YAC3E,CAAC,CAAC,EAAE,CAAC;QAEN,MAAM,WAAW,GAAG;;;OAGf,WAAW;;MAEZ,aAAa,CAAC,CAAC,CAAC;;;QAGd,aAAa;;KAEhB,CAAC,CAAC,CAAC,EAAE;;GAEP,CAAC,IAAI,EAAE,CAAC;QAET,MAAM,OAAO,GAAG,IAAI,CAAC,UAAU,EAAE,CAAC;QAClC,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,EAAE,CAAC;QAC1C,MAAM,WAAW,GAAG,IAAI,CAAC,iBAAiB,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QAE3D,OAAO;YACN,OAAO,EAAE,WAAW;YACpB,WAAW,EAAE,WAAW;YACxB,gBAAgB,EAAE;gBACjB,OAAO;gBACP,WAAW;aACX;YACD,SAAS,EAAE;gBACV,KAAK,EAAE,aAAa,WAAW,EAAE;gBACjC,MAAM,EAAE,WAAW;gBACnB,IAAI,EAAE,aAAa;gBACnB,WAAW;aACX;SACD,CAAC;
IACH,CAAC;IAEO,eAAe,CAAC,IAAY;QACnC,IAAI,CAAC,IAAI;YAAE,OAAO,EAAE,CAAC;QAErB,qDAAqD;QACrD,MAAM,OAAO,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;QACnD,OAAO,CAAC,WAAW,CAAC,IAAA,eAAS,EAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC,CAAC;QAEpD,6CAA6C;QAC7C,OAAO,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE;YAC5C,MAAM,MAAM,GAAG,IAAI,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAC9C,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;QAC1B,CAAC,CAAC,CAAC;QAEH,2DAA2D;QAC3D,OAAO,CAAC,gBAAgB,CAAC,WAAW,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,EAAE;YACvD,OAAO,CAAC,WAAW,CAAC,GAAG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC;QACxD,CAAC,CAAC,CAAC;QAEH,6CAA6C;QAC7C,MAAM,SAAS,GAAG,IAAA,mBAAa,EAAC,OAAO,CAAC,CAAC;QACzC,MAAM,UAAU,GAAG,SAAS,CAAC,KAAK,CAAC,IAAI,CAAC;aACtC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;aACxB,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC;QAEvB,kCAAkC;QAClC,OAAO,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACtD,CAAC;IAEO,YAAY,CAAC,KAAqB;QACzC,IAAI,CAAC,KAAK;YAAE,OAAO,EAAE,CAAC;QAEtB,uCAAuC;QACvC,MAAM,UAAU,GAAG,KAAK,CAAC,SAAS,CAAC,IAAI,CAAY,CAAC;QAEpD,+BAA+B;QAC/B,UAAU,CAAC,gBAAgB,CAAC,qBAAqB,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE;YAChE,IAAI,GAAG,CAAC,OAAO,CAAC,WAAW,EAAE,KAAK,KAAK,IAAI,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,EAAE,CAAC;gBACpE,MAAM,OAAO,GAAG,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC;gBACxC,IAAI,OAAO,EAAE,CAAC;oBACb,GAAG,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;gBAC1B,CAAC;YACF,CAAC;QACF,CAAC,CAAC,CAAC;QAEH,MAAM,WAAW,GAAG,UAAU,CAAC,aAAa,CAAC,2BAA2B,CAAC,CAAC;QAC1E,MAAM,SAAS,GAAG,WAAW,CAAC,CAAC,CAAC,IAAA,mBAAa,EAAC,WAAW,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAChE,MAAM,aAAa,GAAG,IAAI,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC;QACtD,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;QAEzC,2BAA2B;QAC3B,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC;QAE7C,kCAAkC;QAClC,MAAM,WAAW,GAAG,KAAK,CAAC,aAAa,CAAC,2BAA2B,CAAC,EAAE,aAAa,CAAC,2BAA2B,CAAC,EAAE,OAAO,CAAC,2BAA2B,CAAC,CAAC;QACvJ,MAAM,aAAa,GAAG,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,EAAE,CAA
C;QAExE,OAAO;;;0CAGiC,QAAQ,CAAC,QAAQ,wCAAwC,QAAQ,CAAC,MAAM;OAC3G,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,YAAY,QAAQ,CAAC,SAAS,wBAAwB,QAAQ,CAAC,IAAI,MAAM,CAAC,CAAC,CAAC,EAAE;;MAE/F,aAAa,CAAC,CAAC,CAAC,2BAA2B,aAAa,QAAQ,CAAC,CAAC,CAAC,EAAE;MACrE,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC;;QAEd,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC;;KAEpB,CAAC,CAAC,CAAC,EAAE;MACJ,aAAa,CAAC,CAAC,CAAC;;QAEd,aAAa;;KAEhB,CAAC,CAAC,CAAC,EAAE;;GAEP,CAAC,IAAI,EAAE,CAAC;IACV,CAAC;IAEO,eAAe,CAAC,KAAc;QACrC,MAAM,WAAW,GAAG,KAAK,CAAC,aAAa,CAAC,2BAA2B,CAAC,CAAC;QACrE,IAAI,CAAC,WAAW;YAAE,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,IAAI,EAAE,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,CAAC;QAE/E,qEAAqE;QACrE,MAAM,KAAK,GAAG,WAAW,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC;QAChD,IAAI,QAAQ,GAAG,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QACrD,IAAI,MAAM,GAAG,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAEnD,+EAA+E;QAC/E,IAAI,CAAC,QAAQ,IAAI,CAAC,MAAM,EAAE,CAAC;YAC1B,QAAQ,GAAG,WAAW,CAAC,aAAa,CAAC,4CAA4C,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAC9G,MAAM,GAAG,WAAW,CAAC,aAAa,CAAC,yCAAyC,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAC1G,CAAC;QAED,MAAM,SAAS,GAAG,KAAK,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;QAC9C,MAAM,QAAQ,GAAG,SAAS,EAAE,YAAY,CAAC,UAAU,CAAC,IAAI,EAAE,CAAC;QAC3D,MAAM,IAAI,GAAG,QAAQ,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC5E,MAAM,SAAS,GAAG,SAAS,EAAE,OAAO,CAAC,GAAG,CAAC,EAAE,IAAI,IAAI,EAAE,CAAC;QAEtD,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,CAAC;IAC9C,CAAC;IAEO,aAAa,CAAC,KAAc;QACnC,0CAA0C;QAC1C,MAAM,eAAe,GAAG;YACvB,4BAA4B;YAC5B,6BAA6B;YAC7B,mBAAmB;SACnB,CAAC;QAEF,MAAM,MAAM,GAAa,EAAE,CAAC;QAE5B,4CAA4C;QAC5C,MAAM,WAAW,GAAG,KAAK,CAAC,aAAa,CAAC,2BAA2B,CAAC,EAAE,aAAa,CAAC,2BAA2B,CAAC,EAAE,OAAO,CAAC,2BAA2B,CAAC,CAAC;QAEvJ,KAAK,MAAM,QAAQ,IAAI,eAAe,EAAE,CAAC;YACxC,MAAM,QAAQ,GAAG,KAAK,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC;YAElD,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE;gBACtB,6CAA6C;gBAC7C,IAAI,WAAW,EAAE,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;oBAChC,O
AAO;gBACR,CAAC;gBAED,4EAA4E;gBAC5E,IAAI,GAAG,CAAC,OAAO,CAAC,WAAW,EAAE,KAAK,KAAK,IAAI,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,EAAE,CAAC;oBACpE,MAAM,cAAc,GAAG,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC,YAAY,EAAE,aAAa,CAAC,IAAI,EAAE,CAAC;oBAC3F,MAAM,QAAQ,GAAG,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC;oBAC5E,MAAM,CAAC,IAAI,CAAC,aAAa,cAAc,UAAU,QAAQ,MAAM,CAAC,CAAC;gBAClE,CAAC;YACF,CAAC,CAAC,CAAC;QACJ,CAAC;QAED,OAAO,MAAM,CAAC;IACf,CAAC;IAEO,UAAU;QACjB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC;QAC9C,OAAO,KAAK,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACzB,CAAC;IAEO,cAAc;QACrB,MAAM,WAAW,GAAG,IAAI,CAAC,SAAS,EAAE,aAAa,CAAC,2BAA2B,CAAC,CAAC;QAC/E,MAAM,KAAK,GAAG,WAAW,EAAE,gBAAgB,CAAC,GAAG,CAAC,CAAC;QACjD,MAAM,MAAM,GAAG,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QACrD,OAAO,MAAM,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,MAAM,EAAE,CAAC;IACvD,CAAC;IAEO,iBAAiB,CAAC,KAAqB;QAC9C,IAAI,CAAC,KAAK;YAAE,OAAO,EAAE,CAAC;QAEtB,MAAM,SAAS,GAAG,KAAK,CAAC,aAAa,CAAC,2BAA2B,CAAC,EAAE,WAAW,IAAI,EAAE,CAAC;QACtF,OAAO,SAAS,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IAC5D,CAAC;CACD;AAzOD,4CAyOC"}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { BaseExtractor } from './_base';
|
|
2
|
+
import { ExtractorResult } from '../types/extractors';
|
|
3
|
+
/**
 * Declaration of the `XArticleExtractor` class, which extends `BaseExtractor`.
 *
 * The public surface declared here is the constructor plus `canExtract()` and
 * `extract()`. Every other member is private and listed by name only, with no
 * type information.
 *
 * NOTE(review): judging by the member names this targets long-form article
 * pages on X — confirm against the implementation.
 */
export declare class XArticleExtractor extends BaseExtractor {
|
|
4
|
+
// Private member; its type is not exposed in this declaration.
private articleContainer;
|
|
5
|
+
/**
 * @param document - a DOM `Document`
 * @param url - a URL string
 * @param schemaOrgData - optional and untyped (`any`); presumably schema.org
 *                        metadata for the page — confirm with callers
 */
constructor(document: Document, url: string, schemaOrgData?: any);
|
|
6
|
+
/** Returns a boolean; presumably whether this extractor applies to the document — TODO confirm. */
canExtract(): boolean;
|
|
7
|
+
/** Returns an `ExtractorResult` (type imported from '../types/extractors'). */
extract(): ExtractorResult;
|
|
8
|
+
// The private members below are declared by name only; their signatures are
// not visible in this declaration file.
private extractTitle;
|
|
9
|
+
private extractAuthor;
|
|
10
|
+
private getAuthorFromUrl;
|
|
11
|
+
private getAuthorFromOgTitle;
|
|
12
|
+
private getArticleId;
|
|
13
|
+
private extractContent;
|
|
14
|
+
private cleanContent;
|
|
15
|
+
private convertEmbeddedTweets;
|
|
16
|
+
private convertCodeBlocks;
|
|
17
|
+
private convertHeaders;
|
|
18
|
+
private unwrapLinkedImages;
|
|
19
|
+
private upgradeImageQuality;
|
|
20
|
+
private convertDraftParagraphs;
|
|
21
|
+
private convertBoldSpans;
|
|
22
|
+
private removeDraftAttributes;
|
|
23
|
+
private createDescription;
|
|
24
|
+
}
|