@mz1999/defuddle 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +371 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +145 -0
- package/dist/cli.js.map +1 -0
- package/dist/constants.d.ts +24 -0
- package/dist/constants.js +950 -0
- package/dist/constants.js.map +1 -0
- package/dist/defuddle.d.ts +136 -0
- package/dist/defuddle.js +1816 -0
- package/dist/defuddle.js.map +1 -0
- package/dist/elements/callouts.d.ts +6 -0
- package/dist/elements/callouts.js +74 -0
- package/dist/elements/callouts.js.map +1 -0
- package/dist/elements/code.d.ts +5 -0
- package/dist/elements/code.js +346 -0
- package/dist/elements/code.js.map +1 -0
- package/dist/elements/footnotes.d.ts +5 -0
- package/dist/elements/footnotes.js +619 -0
- package/dist/elements/footnotes.js.map +1 -0
- package/dist/elements/headings.d.ts +11 -0
- package/dist/elements/headings.js +100 -0
- package/dist/elements/headings.js.map +1 -0
- package/dist/elements/images.d.ts +8 -0
- package/dist/elements/images.js +877 -0
- package/dist/elements/images.js.map +1 -0
- package/dist/elements/math.base.d.ts +9 -0
- package/dist/elements/math.base.js +195 -0
- package/dist/elements/math.base.js.map +1 -0
- package/dist/elements/math.core.d.ts +7 -0
- package/dist/elements/math.core.js +52 -0
- package/dist/elements/math.core.js.map +1 -0
- package/dist/elements/math.d.ts +2 -0
- package/dist/elements/math.full.d.ts +8 -0
- package/dist/elements/math.js +7 -0
- package/dist/elements/math.js.map +1 -0
- package/dist/extractor-registry.d.ts +16 -0
- package/dist/extractor-registry.js +140 -0
- package/dist/extractor-registry.js.map +1 -0
- package/dist/extractors/_base.d.ts +22 -0
- package/dist/extractors/_base.js +27 -0
- package/dist/extractors/_base.js.map +1 -0
- package/dist/extractors/_conversation.d.ts +9 -0
- package/dist/extractors/_conversation.js +78 -0
- package/dist/extractors/_conversation.js.map +1 -0
- package/dist/extractors/chatgpt.d.ts +14 -0
- package/dist/extractors/chatgpt.js +138 -0
- package/dist/extractors/chatgpt.js.map +1 -0
- package/dist/extractors/claude.d.ts +10 -0
- package/dist/extractors/claude.js +91 -0
- package/dist/extractors/claude.js.map +1 -0
- package/dist/extractors/gemini.d.ts +14 -0
- package/dist/extractors/gemini.js +111 -0
- package/dist/extractors/gemini.js.map +1 -0
- package/dist/extractors/github.d.ts +20 -0
- package/dist/extractors/github.js +251 -0
- package/dist/extractors/github.js.map +1 -0
- package/dist/extractors/grok.d.ts +15 -0
- package/dist/extractors/grok.js +142 -0
- package/dist/extractors/grok.js.map +1 -0
- package/dist/extractors/hackernews.d.ts +21 -0
- package/dist/extractors/hackernews.js +155 -0
- package/dist/extractors/hackernews.js.map +1 -0
- package/dist/extractors/reddit.d.ts +22 -0
- package/dist/extractors/reddit.js +197 -0
- package/dist/extractors/reddit.js.map +1 -0
- package/dist/extractors/twitter.d.ts +16 -0
- package/dist/extractors/twitter.js +204 -0
- package/dist/extractors/twitter.js.map +1 -0
- package/dist/extractors/x-article.d.ts +24 -0
- package/dist/extractors/x-article.js +267 -0
- package/dist/extractors/x-article.js.map +1 -0
- package/dist/extractors/x-oembed.d.ts +20 -0
- package/dist/extractors/x-oembed.js +350 -0
- package/dist/extractors/x-oembed.js.map +1 -0
- package/dist/extractors/youtube.d.ts +87 -0
- package/dist/extractors/youtube.js +869 -0
- package/dist/extractors/youtube.js.map +1 -0
- package/dist/fetch.d.ts +18 -0
- package/dist/fetch.js +265 -0
- package/dist/fetch.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.full.d.ts +12 -0
- package/dist/index.full.js +1 -0
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -0
- package/dist/markdown.d.ts +30 -0
- package/dist/markdown.js +661 -0
- package/dist/markdown.js.map +1 -0
- package/dist/metadata.d.ts +25 -0
- package/dist/metadata.js +426 -0
- package/dist/metadata.js.map +1 -0
- package/dist/node.d.ts +19 -0
- package/dist/node.js +78 -0
- package/dist/node.js.map +1 -0
- package/dist/scoring.d.ts +31 -0
- package/dist/scoring.js +472 -0
- package/dist/scoring.js.map +1 -0
- package/dist/standardize.d.ts +2 -0
- package/dist/standardize.js +1101 -0
- package/dist/standardize.js.map +1 -0
- package/dist/types/extractors.d.ts +41 -0
- package/dist/types/extractors.js +3 -0
- package/dist/types/extractors.js.map +1 -0
- package/dist/types.d.ts +135 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/comments.d.ts +44 -0
- package/dist/utils/comments.js +103 -0
- package/dist/utils/comments.js.map +1 -0
- package/dist/utils/dom.d.ts +42 -0
- package/dist/utils/dom.js +104 -0
- package/dist/utils/dom.js.map +1 -0
- package/dist/utils/linkedom-compat.d.ts +5 -0
- package/dist/utils/linkedom-compat.js +23 -0
- package/dist/utils/linkedom-compat.js.map +1 -0
- package/dist/utils/transcript.d.ts +37 -0
- package/dist/utils/transcript.js +61 -0
- package/dist/utils/transcript.js.map +1 -0
- package/dist/utils.d.ts +13 -0
- package/dist/utils.js +98 -0
- package/dist/utils.js.map +1 -0
- package/package.json +107 -0
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.XArticleExtractor = void 0;
|
|
4
|
+
const _base_1 = require("./_base");
|
|
5
|
+
const dom_1 = require("../utils/dom");
|
|
6
|
+
// CSS selectors for the parts of an X article page that the extractor reads or rewrites.
const SELECTORS = (() => {
    // most hooks on the page are data-testid attributes
    const byTestId = (id) => `[data-testid="${id}"]`;
    return {
        ARTICLE_CONTAINER: byTestId('twitterArticleRichTextView'),
        TITLE: byTestId('twitter-article-title'),
        AUTHOR: '[itemprop="author"]',
        AUTHOR_NAME: 'meta[itemprop="name"]',
        AUTHOR_HANDLE: 'meta[itemprop="additionalName"]',
        IMAGES: `${byTestId('tweetPhoto')} img`,
        DRAFT_PARAGRAPHS: '.longform-unstyled, .public-DraftStyleDefault-block',
        BOLD_SPANS: 'span[style*="font-weight: bold"]',
        DRAFT_ATTRIBUTES: '[data-offset-key]',
        EMBEDDED_TWEET: byTestId('simpleTweet'),
        TWEET_TEXT: byTestId('tweetText'),
        USER_NAME: byTestId('User-Name'),
        CODE_BLOCK: byTestId('markdown-code-block'),
        HEADER_BLOCK: byTestId('longform-header'),
    };
})();
|
|
22
|
+
/**
 * Extractor for long-form X (Twitter) articles that are already rendered in the page DOM.
 *
 * The article container is looked up once in the constructor. All clean-up work runs on a
 * deep clone of that container, so the elements of the live document are left untouched.
 */
class XArticleExtractor extends _base_1.BaseExtractor {
    /**
     * @param document - page document to read from
     * @param url - page URL (used by the author and article-id fallbacks)
     * @param schemaOrgData - forwarded unchanged to BaseExtractor
     */
    constructor(document, url, schemaOrgData) {
        super(document, url, schemaOrgData);
        // NOTE(review): this.document / this.url are presumably assigned by BaseExtractor — confirm
        this.articleContainer = document.querySelector(SELECTORS.ARTICLE_CONTAINER);
    }
    /** @returns {boolean} true when the page contains an article container */
    canExtract() {
        return !!this.articleContainer;
    }
    /**
     * Builds the extraction result: cleaned article HTML plus title, author, site and
     * description variables, and the article id parsed from the URL.
     */
    extract() {
        const title = this.extractTitle();
        const author = this.extractAuthor();
        const contentHtml = this.extractContent();
        const description = this.createDescription();
        return {
            content: contentHtml,
            contentHtml,
            extractedContent: {
                articleId: this.getArticleId(),
            },
            variables: {
                title,
                author,
                site: 'X (Twitter)',
                description,
            }
        };
    }
    /** @returns {string} trimmed title text, or 'Untitled X Article' when missing or empty */
    extractTitle() {
        const titleEl = this.document.querySelector(SELECTORS.TITLE);
        return titleEl?.textContent?.trim() || 'Untitled X Article';
    }
    /**
     * Reads the author from the itemprop="author" microdata.
     * Returns "Name (@handle)" when both are present, otherwise whichever exists,
     * otherwise falls back to the URL and then to og:title.
     */
    extractAuthor() {
        const authorContainer = this.document.querySelector(SELECTORS.AUTHOR);
        if (!authorContainer)
            return this.getAuthorFromUrl();
        const name = authorContainer.querySelector(SELECTORS.AUTHOR_NAME)?.getAttribute('content');
        const handle = authorContainer.querySelector(SELECTORS.AUTHOR_HANDLE)?.getAttribute('content');
        if (name && handle)
            return `${name} (@${handle})`;
        return name || handle || this.getAuthorFromUrl();
    }
    /**
     * Derives "@handle" from a ".../<handle>/article/<id>" URL.
     * Falls back to og:title when the URL carries no handle.
     */
    getAuthorFromUrl() {
        // match username before /article/; the lookahead rejects the system path /i/,
        // which would otherwise be accepted as a one-letter handle and reported as "@i"
        const match = this.url.match(/\/(?!i\/)([a-zA-Z][a-zA-Z0-9_]{0,14})\/article\/\d+/);
        return match ? `@${match[1]}` : this.getAuthorFromOgTitle();
    }
    /** @returns {string} author name parsed from og:title, or 'Unknown' */
    getAuthorFromOgTitle() {
        const ogTitle = this.document.querySelector('meta[property="og:title"]')?.getAttribute('content') || '';
        // Match patterns like "(4) Heinrich on X: ..." or "Heinrich on X: ..."
        const match = ogTitle.match(/^(?:\(\d+\)\s+)?(.+?)\s+on\s+X\s*:/);
        return match ? match[1].trim() : 'Unknown';
    }
    /** @returns {string} numeric id following "article/" in the URL, or '' */
    getArticleId() {
        const match = this.url.match(/article\/(\d+)/);
        return match ? match[1] : '';
    }
    /**
     * Clones the article container, cleans the clone and wraps the serialized
     * result in <article class="x-article">. Returns '' when there is no container.
     */
    extractContent() {
        if (!this.articleContainer)
            return '';
        const clone = this.articleContainer.cloneNode(true);
        this.cleanContent(clone);
        return `<article class="x-article">${(0, dom_1.serializeHTML)(clone)}</article>`;
    }
    /**
     * Runs every clean-up pass on `container` in place. Order matters: complex
     * elements are rewritten first, and bold spans are converted before paragraphs
     * so that the paragraph pass sees <strong> elements.
     */
    cleanContent(container) {
        const ownerDoc = container.ownerDocument || this.document;
        // convert complex elements first (before other transformations)
        this.convertEmbeddedTweets(container, ownerDoc);
        this.convertCodeBlocks(container, ownerDoc);
        this.convertHeaders(container, ownerDoc);
        this.unwrapLinkedImages(container, ownerDoc);
        this.upgradeImageQuality(container);
        // convert bold spans BEFORE paragraphs so formatting is preserved
        this.convertBoldSpans(container, ownerDoc);
        this.convertDraftParagraphs(container, ownerDoc);
        this.removeDraftAttributes(container);
    }
    /**
     * Replaces each embedded tweet with
     * <blockquote class="embedded-tweet"><cite>name handle</cite><p>text</p></blockquote>.
     * Only plain text is kept; the cite and p are omitted when their text is empty.
     */
    convertEmbeddedTweets(container, ownerDoc) {
        container.querySelectorAll(SELECTORS.EMBEDDED_TWEET).forEach(tweet => {
            const blockquote = ownerDoc.createElement('blockquote');
            blockquote.className = 'embedded-tweet';
            // extract author info: first link text is used as the name, second as the handle
            const userNameEl = tweet.querySelector(SELECTORS.USER_NAME);
            const authorLinks = userNameEl?.querySelectorAll('a');
            const fullName = authorLinks?.[0]?.textContent?.trim() || '';
            const handle = authorLinks?.[1]?.textContent?.trim() || '';
            // extract tweet text
            const tweetTextEl = tweet.querySelector(SELECTORS.TWEET_TEXT);
            const tweetText = tweetTextEl?.textContent?.trim() || '';
            // build clean blockquote content
            if (fullName || handle) {
                const cite = ownerDoc.createElement('cite');
                cite.textContent = handle ? `${fullName} ${handle}` : fullName;
                blockquote.appendChild(cite);
            }
            if (tweetText) {
                const p = ownerDoc.createElement('p');
                p.textContent = tweetText;
                blockquote.appendChild(p);
            }
            tweet.replaceWith(blockquote);
        });
    }
    /**
     * Replaces each code-block wrapper with a bare <pre><code> pair holding the code's
     * text. Blocks lacking a <pre> or a <code> are left as they are.
     */
    convertCodeBlocks(container, ownerDoc) {
        container.querySelectorAll(SELECTORS.CODE_BLOCK).forEach(block => {
            const pre = block.querySelector('pre');
            const code = block.querySelector('code');
            if (!pre || !code)
                return;
            // extract language from class (e.g., "language-bash") or from span
            let language = '';
            const langClass = code.className.match(/language-(\w+)/);
            if (langClass) {
                language = langClass[1];
            }
            else {
                // fallback: look for language label in the block header
                // NOTE(review): this takes the first <span> anywhere in the block; verify it
                // cannot be a span inside the code itself
                const langSpan = block.querySelector('span');
                language = langSpan?.textContent?.trim() || '';
            }
            // create clean pre/code structure
            const newPre = ownerDoc.createElement('pre');
            const newCode = ownerDoc.createElement('code');
            if (language) {
                newCode.setAttribute('data-lang', language);
                newCode.className = `language-${language}`;
            }
            newCode.textContent = code.textContent || '';
            newPre.appendChild(newCode);
            // replace the entire block container
            block.replaceWith(newPre);
        });
    }
    /**
     * Rebuilds every non-empty h1-h6 as a plain heading of the same level that contains
     * only the trimmed text. Empty headings are left unchanged.
     */
    convertHeaders(container, ownerDoc) {
        // X articles use h2/h3 elements but content may be nested in spans/divs
        container.querySelectorAll('h1, h2, h3, h4, h5, h6').forEach(header => {
            const level = header.tagName.toLowerCase();
            const text = header.textContent?.trim() || '';
            if (!text)
                return;
            const newHeader = ownerDoc.createElement(level);
            newHeader.textContent = text;
            header.replaceWith(newHeader);
        });
    }
    /**
     * Returns `src` with its `name` query parameter set to "large".
     * An existing `name` value is replaced wherever it sits in the query string
     * (also as the first parameter); otherwise the parameter is appended.
     * An empty `src` is returned unchanged so no bogus "?name=large" URL is produced.
     */
    upgradeImageUrl(src) {
        if (!src)
            return src;
        const nameParam = /([?&])name=\w*/;
        if (nameParam.test(src)) {
            return src.replace(nameParam, '$1name=large');
        }
        return `${src}${src.includes('?') ? '&' : '?'}name=large`;
    }
    /**
     * Replaces the closest <a> ancestor of each tweet photo with a clean <img>
     * carrying only `src` (upgraded to the large variant) and a normalized `alt`.
     */
    unwrapLinkedImages(container, ownerDoc) {
        // find all tweetPhoto images and extract them from any ancestor anchors
        container.querySelectorAll(SELECTORS.IMAGES).forEach(img => {
            // find closest anchor ancestor; skip anchors that are not inside the container
            const anchor = img.closest('a');
            if (!anchor || !container.contains(anchor))
                return;
            // create clean img tag with upgraded quality (like TwitterExtractor does)
            const src = this.upgradeImageUrl(img.getAttribute('src') || '');
            const alt = img.getAttribute('alt')?.replace(/\s+/g, ' ').trim() || 'Image';
            const cleanImg = ownerDoc.createElement('img');
            cleanImg.setAttribute('src', src);
            cleanImg.setAttribute('alt', alt);
            // replace anchor with clean image
            anchor.replaceWith(cleanImg);
        });
    }
    /** Rewrites the `src` of every remaining tweet photo to the large variant. */
    upgradeImageQuality(container) {
        container.querySelectorAll(SELECTORS.IMAGES).forEach(img => {
            const src = img.getAttribute('src');
            if (!src)
                return;
            img.setAttribute('src', this.upgradeImageUrl(src));
        });
    }
    /**
     * Replaces each draft paragraph block with a <p>. Text nodes are copied;
     * <strong>, <a> (href only) and <code> are recreated with their text content;
     * any other element is flattened by descending into its children.
     */
    convertDraftParagraphs(container, ownerDoc) {
        // node type constants (avoid using Node global which isn't available in all environments)
        const TEXT_NODE = 3;
        const ELEMENT_NODE = 1;
        container.querySelectorAll(SELECTORS.DRAFT_PARAGRAPHS).forEach(div => {
            const p = ownerDoc.createElement('p');
            // preserve formatting (strong, links, code) by processing children
            const processNode = (node) => {
                if (node.nodeType === TEXT_NODE) {
                    p.appendChild(ownerDoc.createTextNode(node.textContent || ''));
                }
                else if (node.nodeType === ELEMENT_NODE) {
                    const el = node;
                    const tag = el.tagName.toLowerCase();
                    if (tag === 'strong') {
                        const strong = ownerDoc.createElement('strong');
                        strong.textContent = el.textContent || '';
                        p.appendChild(strong);
                    }
                    else if (tag === 'a') {
                        const link = ownerDoc.createElement('a');
                        link.setAttribute('href', el.getAttribute('href') || '');
                        link.textContent = el.textContent || '';
                        p.appendChild(link);
                    }
                    else if (tag === 'code') {
                        const code = ownerDoc.createElement('code');
                        code.textContent = el.textContent || '';
                        p.appendChild(code);
                    }
                    else {
                        // recurse into other elements (spans, divs, etc.)
                        el.childNodes.forEach(child => processNode(child));
                    }
                }
            };
            div.childNodes.forEach(child => processNode(child));
            div.replaceWith(p);
        });
    }
    /** Replaces inline-styled bold spans with <strong> elements holding the same text. */
    convertBoldSpans(container, ownerDoc) {
        container.querySelectorAll(SELECTORS.BOLD_SPANS).forEach(span => {
            const strong = ownerDoc.createElement('strong');
            strong.textContent = span.textContent || '';
            span.replaceWith(strong);
        });
    }
    /** Strips the data-offset-key attribute from every descendant that has it. */
    removeDraftAttributes(container) {
        container.querySelectorAll(SELECTORS.DRAFT_ATTRIBUTES).forEach(el => {
            el.removeAttribute('data-offset-key');
        });
    }
    /**
     * @returns {string} first 140 characters of the article's trimmed text,
     * followed by '...' when the text is longer
     */
    createDescription() {
        const text = this.articleContainer?.textContent?.trim() || '';
        return text.slice(0, 140) + (text.length > 140 ? '...' : '');
    }
}
|
|
266
|
+
exports.XArticleExtractor = XArticleExtractor;
|
|
267
|
+
//# sourceMappingURL=x-article.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"x-article.js","sourceRoot":"","sources":["../../src/extractors/x-article.ts"],"names":[],"mappings":";;;AAAA,mCAAwC;AAExC,sCAA6C;AAE7C,MAAM,SAAS,GAAG;IACjB,iBAAiB,EAAE,4CAA4C;IAC/D,KAAK,EAAE,uCAAuC;IAC9C,MAAM,EAAE,qBAAqB;IAC7B,WAAW,EAAE,uBAAuB;IACpC,aAAa,EAAE,iCAAiC;IAChD,MAAM,EAAE,gCAAgC;IACxC,gBAAgB,EAAE,qDAAqD;IACvE,UAAU,EAAE,kCAAkC;IAC9C,gBAAgB,EAAE,mBAAmB;IACrC,cAAc,EAAE,6BAA6B;IAC7C,UAAU,EAAE,2BAA2B;IACvC,SAAS,EAAE,2BAA2B;IACtC,UAAU,EAAE,qCAAqC;IACjD,YAAY,EAAE,iCAAiC;CACtC,CAAC;AAEX,MAAa,iBAAkB,SAAQ,qBAAa;IAGnD,YAAY,QAAkB,EAAE,GAAW,EAAE,aAAmB;QAC/D,KAAK,CAAC,QAAQ,EAAE,GAAG,EAAE,aAAa,CAAC,CAAC;QACpC,IAAI,CAAC,gBAAgB,GAAG,QAAQ,CAAC,aAAa,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;IAC7E,CAAC;IAED,UAAU;QACT,OAAO,CAAC,CAAC,IAAI,CAAC,gBAAgB,CAAC;IAChC,CAAC;IAED,OAAO;QACN,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC;QAClC,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;QACpC,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,EAAE,CAAC;QAC1C,MAAM,WAAW,GAAG,IAAI,CAAC,iBAAiB,EAAE,CAAC;QAE7C,OAAO;YACN,OAAO,EAAE,WAAW;YACpB,WAAW;YACX,gBAAgB,EAAE;gBACjB,SAAS,EAAE,IAAI,CAAC,YAAY,EAAE;aAC9B;YACD,SAAS,EAAE;gBACV,KAAK;gBACL,MAAM;gBACN,IAAI,EAAE,aAAa;gBACnB,WAAW;aACX;SACD,CAAC;IACH,CAAC;IAEO,YAAY;QACnB,MAAM,OAAO,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;QAC7D,OAAO,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,oBAAoB,CAAC;IAC7D,CAAC;IAEO,aAAa;QACpB,MAAM,eAAe,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QACtE,IAAI,CAAC,eAAe;YAAE,OAAO,IAAI,CAAC,gBAAgB,EAAE,CAAC;QAErD,MAAM,IAAI,GAAG,eAAe,CAAC,aAAa,CAAC,SAAS,CAAC,WAAW,CAAC,EAAE,YAAY,CAAC,SAAS,CAAC,CAAC;QAC3F,MAAM,MAAM,GAAG,eAAe,CAAC,aAAa,CAAC,SAAS,CAAC,aAAa,CAAC,EAAE,YAAY,CAAC,SAAS,CAAC,CAAC;QAE/F,IAAI,IAAI,IAAI,MAAM;YAAE,OAAO,GAAG,IAAI,MAAM,MAAM,GAAG,CAAC;QAClD,OAAO,IAAI,IAAI,MAAM,IAAI,IAAI,CAAC,gBAAgB,EAAE,CAAC;IAClD,CAAC;IAEO,gBAAgB;QACvB,mEAAmE;QACnE,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,8CAA8C,CAAC,CAAC;QAC7E,OAAO,KAAK,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,oBAAoB,EAAE,CAAC;IAC7D,CAAC;IAEO,oBAA
oB;QAC3B,MAAM,OAAO,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,2BAA2B,CAAC,EAAE,YAAY,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC;QACxG,uEAAuE;QACvE,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,oCAAoC,CAAC,CAAC;QAClE,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;IAC5C,CAAC;IAEO,YAAY;QACnB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC;QAC/C,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IAC9B,CAAC;IAEO,cAAc;QACrB,IAAI,CAAC,IAAI,CAAC,gBAAgB;YAAE,OAAO,EAAE,CAAC;QAEtC,MAAM,KAAK,GAAG,IAAI,CAAC,gBAAgB,CAAC,SAAS,CAAC,IAAI,CAAgB,CAAC;QACnE,IAAI,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC;QAEzB,OAAO,8BAA8B,IAAA,mBAAa,EAAC,KAAK,CAAC,YAAY,CAAC;IACvE,CAAC;IAEO,YAAY,CAAC,SAAsB;QAC1C,MAAM,QAAQ,GAAG,SAAS,CAAC,aAAa,IAAI,IAAI,CAAC,QAAQ,CAAC;QAE1D,gEAAgE;QAChE,IAAI,CAAC,qBAAqB,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;QAChD,IAAI,CAAC,iBAAiB,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;QAC5C,IAAI,CAAC,cAAc,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;QACzC,IAAI,CAAC,kBAAkB,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;QAC7C,IAAI,CAAC,mBAAmB,CAAC,SAAS,CAAC,CAAC;QACpC,kEAAkE;QAClE,IAAI,CAAC,gBAAgB,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;QAC3C,IAAI,CAAC,sBAAsB,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;QACjD,IAAI,CAAC,qBAAqB,CAAC,SAAS,CAAC,CAAC;IACvC,CAAC;IAEO,qBAAqB,CAAC,SAAsB,EAAE,QAAkB;QACvE,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,cAAc,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE;YACpE,MAAM,UAAU,GAAG,QAAQ,CAAC,aAAa,CAAC,YAAY,CAAC,CAAC;YACxD,UAAU,CAAC,SAAS,GAAG,gBAAgB,CAAC;YAExC,sBAAsB;YACtB,MAAM,UAAU,GAAG,KAAK,CAAC,aAAa,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC;YAC5D,MAAM,WAAW,GAAG,UAAU,EAAE,gBAAgB,CAAC,GAAG,CAAC,CAAC;YACtD,MAAM,QAAQ,GAAG,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAC7D,MAAM,MAAM,GAAG,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAE3D,qBAAqB;YACrB,MAAM,WAAW,GAAG,KAAK,CAAC,aAAa,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC;YAC9D,MAAM,SAAS,GAAG,WAAW,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAEzD,iCAAiC;YACjC,IAAI,QAAQ,IAAI,MAAM,EAAE,CAAC;gBACxB,MAAM,IAAI,GAAG,QAAQ,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;gBAC5C,IAAI,CAAC,WAAW,GAAG,MAAM
,CAAC,CAAC,CAAC,GAAG,QAAQ,IAAI,MAAM,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC;gBAC/D,UAAU,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;YAC9B,CAAC;YAED,IAAI,SAAS,EAAE,CAAC;gBACf,MAAM,CAAC,GAAG,QAAQ,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC;gBACtC,CAAC,CAAC,WAAW,GAAG,SAAS,CAAC;gBAC1B,UAAU,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC;YAC3B,CAAC;YAED,KAAK,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC;QAC/B,CAAC,CAAC,CAAC;IACJ,CAAC;IAEO,iBAAiB,CAAC,SAAsB,EAAE,QAAkB;QACnE,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE;YAChE,MAAM,GAAG,GAAG,KAAK,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;YACvC,MAAM,IAAI,GAAG,KAAK,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;YACzC,IAAI,CAAC,GAAG,IAAI,CAAC,IAAI;gBAAE,OAAO;YAE1B,mEAAmE;YACnE,IAAI,QAAQ,GAAG,EAAE,CAAC;YAClB,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC;YACzD,IAAI,SAAS,EAAE,CAAC;gBACf,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;YACzB,CAAC;iBAAM,CAAC;gBACP,wDAAwD;gBACxD,MAAM,QAAQ,GAAG,KAAK,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;gBAC7C,QAAQ,GAAG,QAAQ,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAChD,CAAC;YAED,kCAAkC;YAClC,MAAM,MAAM,GAAG,QAAQ,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;YAC7C,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;YAC/C,IAAI,QAAQ,EAAE,CAAC;gBACd,OAAO,CAAC,YAAY,CAAC,WAAW,EAAE,QAAQ,CAAC,CAAC;gBAC5C,OAAO,CAAC,SAAS,GAAG,YAAY,QAAQ,EAAE,CAAC;YAC5C,CAAC;YACD,OAAO,CAAC,WAAW,GAAG,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC;YAC7C,MAAM,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;YAE5B,qCAAqC;YACrC,KAAK,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;QAC3B,CAAC,CAAC,CAAC;IACJ,CAAC;IAEO,cAAc,CAAC,SAAsB,EAAE,QAAkB;QAChE,wEAAwE;QACxE,SAAS,CAAC,gBAAgB,CAAC,wBAAwB,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE;YACrE,MAAM,KAAK,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;YAC3C,MAAM,IAAI,GAAG,MAAM,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAC9C,IAAI,CAAC,IAAI;gBAAE,OAAO;YAElB,MAAM,SAAS,GAAG,QAAQ,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;YAChD,SAAS,CAAC,WAAW,GAAG,IAAI,CAAC;YAC7B,MAAM,CAAC,WAAW,CAAC,SAAS,CAAC,CAAC;QAC/B,CAAC,CAAC,CAAC;IACJ,CAAC;IAEO,kBAAkB,CAAC,SAAsB,EAAE,QAAkB;QACpE,wEAAwE;QACxE,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE;YAC1D,+BAA+B;YAC/B,MAAM,MA
AM,GAAG,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;YAChC,IAAI,CAAC,MAAM,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,MAAM,CAAC;gBAAE,OAAO;YAEnD,0EAA0E;YAC1E,IAAI,GAAG,GAAG,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;YACxC,MAAM,GAAG,GAAG,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,IAAI,OAAO,CAAC;YAE5E,wBAAwB;YACxB,IAAI,GAAG,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAC5B,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,WAAW,EAAE,aAAa,CAAC,CAAC;YAC/C,CAAC;iBAAM,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC9B,GAAG,GAAG,GAAG,GAAG,aAAa,CAAC;YAC3B,CAAC;iBAAM,CAAC;gBACP,GAAG,GAAG,GAAG,GAAG,aAAa,CAAC;YAC3B,CAAC;YAED,MAAM,QAAQ,GAAG,QAAQ,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;YAC/C,QAAQ,CAAC,YAAY,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;YAClC,QAAQ,CAAC,YAAY,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;YAElC,kCAAkC;YAClC,MAAM,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC;QAC9B,CAAC,CAAC,CAAC;IACJ,CAAC;IAEO,mBAAmB,CAAC,SAAsB;QACjD,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE;YAC1D,MAAM,GAAG,GAAG,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC;YACpC,IAAI,CAAC,GAAG;gBAAE,OAAO;YAEjB,IAAI,GAAG,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAC5B,GAAG,CAAC,YAAY,CAAC,KAAK,EAAE,GAAG,CAAC,OAAO,CAAC,WAAW,EAAE,aAAa,CAAC,CAAC,CAAC;YAClE,CAAC;iBAAM,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC9B,GAAG,CAAC,YAAY,CAAC,KAAK,EAAE,GAAG,GAAG,aAAa,CAAC,CAAC;YAC9C,CAAC;iBAAM,CAAC;gBACP,GAAG,CAAC,YAAY,CAAC,KAAK,EAAE,GAAG,GAAG,aAAa,CAAC,CAAC;YAC9C,CAAC;QACF,CAAC,CAAC,CAAC;IACJ,CAAC;IAEO,sBAAsB,CAAC,SAAsB,EAAE,QAAkB;QACxE,0FAA0F;QAC1F,MAAM,SAAS,GAAG,CAAC,CAAC;QACpB,MAAM,YAAY,GAAG,CAAC,CAAC;QAEvB,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,gBAAgB,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE;YACpE,MAAM,CAAC,GAAG,QAAQ,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC;YAEtC,mEAAmE;YACnE,MAAM,WAAW,GAAG,CAAC,IAAU,EAAQ,EAAE;gBACxC,IAAI,IAAI,CAAC,QAAQ,KAAK,SAAS,EAAE,CAAC;oBACjC,CAAC,CAAC,WAAW,CAAC,QAAQ,CAAC,cAAc,CAAC,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,CAAC;gBAChE,CAAC;qBAAM,IAAI,IAAI,CAAC,QAAQ,KAAK,YAAY,EAAE,CAAC;oBAC3C,MAAM,EAAE,GAAG,IAAe,CAAC;oBAC3B,MAAM,GAAG,GAAG,EAAE,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC
;oBAErC,IAAI,GAAG,KAAK,QAAQ,EAAE,CAAC;wBACtB,MAAM,MAAM,GAAG,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;wBAChD,MAAM,CAAC,WAAW,GAAG,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC;wBAC1C,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;oBACvB,CAAC;yBAAM,IAAI,GAAG,KAAK,GAAG,EAAE,CAAC;wBACxB,MAAM,IAAI,GAAG,QAAQ,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC;wBACzC,IAAI,CAAC,YAAY,CAAC,MAAM,EAAE,EAAE,CAAC,YAAY,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC;wBACzD,IAAI,CAAC,WAAW,GAAG,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC;wBACxC,CAAC,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;oBACrB,CAAC;yBAAM,IAAI,GAAG,KAAK,MAAM,EAAE,CAAC;wBAC3B,MAAM,IAAI,GAAG,QAAQ,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;wBAC5C,IAAI,CAAC,WAAW,GAAG,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC;wBACxC,CAAC,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;oBACrB,CAAC;yBAAM,CAAC;wBACP,kDAAkD;wBAClD,EAAE,CAAC,UAAU,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,CAAC;oBACpD,CAAC;gBACF,CAAC;YACF,CAAC,CAAC;YAEF,GAAG,CAAC,UAAU,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,CAAC;YACpD,GAAG,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC;QACpB,CAAC,CAAC,CAAC;IACJ,CAAC;IAEO,gBAAgB,CAAC,SAAsB,EAAE,QAAkB;QAClE,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE;YAC/D,MAAM,MAAM,GAAG,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;YAChD,MAAM,CAAC,WAAW,GAAG,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC;YAC5C,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;QAC1B,CAAC,CAAC,CAAC;IACJ,CAAC;IAEO,qBAAqB,CAAC,SAAsB;QACnD,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,gBAAgB,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE;YACnE,EAAE,CAAC,eAAe,CAAC,iBAAiB,CAAC,CAAC;QACvC,CAAC,CAAC,CAAC;IACJ,CAAC;IAEO,iBAAiB;QACxB,MAAM,IAAI,GAAG,IAAI,CAAC,gBAAgB,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAC9D,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IAC9D,CAAC;CACD;AA/QD,8CA+QC"}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import { BaseExtractor } from './_base';
|
|
2
|
+
import { ExtractorResult } from '../types/extractors';
|
|
3
|
+
/**
 * Extractor for X (Twitter) status and article URLs that obtains its content
 * asynchronously rather than from the page DOM: the synchronous entry points
 * are inert, and the async path tries the FxTwitter API before falling back
 * to the oEmbed endpoint.
 */
export declare class XOembedExtractor extends BaseExtractor {
    /** Always false; this extractor only works asynchronously. */
    canExtract(): boolean;
    /** Returns an empty result; use extractAsync instead. */
    extract(): ExtractorResult;
    /** True when the URL contains a /status/<id> or /article/<id> segment. */
    canExtractAsync(): boolean;
    /** Tries FxTwitter first and falls back to oEmbed. */
    extractAsync(): Promise<ExtractorResult>;
    private extractOembed;
    private tryExtractFxTwitter;
    private fetchFxTwitter;
    private buildArticleResult;
    private buildTweetResult;
    private renderTweet;
    private applyMarkers;
    private applyFacets;
    private renderArticle;
    private renderBlock;
    private renderAtomicBlock;
    private renderInlineContent;
}
|
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.XOembedExtractor = void 0;
|
|
4
|
+
const _base_1 = require("./_base");
|
|
5
|
+
const dom_1 = require("../utils/dom");
|
|
6
|
+
class XOembedExtractor extends _base_1.BaseExtractor {
|
|
7
|
+
canExtract() {
|
|
8
|
+
return false;
|
|
9
|
+
}
|
|
10
|
+
extract() {
|
|
11
|
+
return {
|
|
12
|
+
content: '',
|
|
13
|
+
contentHtml: '',
|
|
14
|
+
};
|
|
15
|
+
}
|
|
16
|
+
canExtractAsync() {
|
|
17
|
+
return /\/(status|article)\/\d+/.test(this.url);
|
|
18
|
+
}
|
|
19
|
+
async extractAsync() {
|
|
20
|
+
// Try FxTwitter first — it has full tweet text and media
|
|
21
|
+
const fxResult = await this.tryExtractFxTwitter();
|
|
22
|
+
if (fxResult) {
|
|
23
|
+
return fxResult;
|
|
24
|
+
}
|
|
25
|
+
// Fall back to oEmbed (truncates long tweets but always available)
|
|
26
|
+
return this.extractOembed();
|
|
27
|
+
}
|
|
28
|
+
async extractOembed() {
|
|
29
|
+
const oembedUrl = `https://publish.twitter.com/oembed?url=${encodeURIComponent(this.url)}&omit_script=true`;
|
|
30
|
+
const response = await fetch(oembedUrl);
|
|
31
|
+
if (!response.ok) {
|
|
32
|
+
throw new Error(`oEmbed request failed: ${response.status}`);
|
|
33
|
+
}
|
|
34
|
+
const data = await response.json();
|
|
35
|
+
// Parse the oEmbed HTML to extract tweet text
|
|
36
|
+
const div = this.document.createElement('div');
|
|
37
|
+
div.appendChild((0, dom_1.parseHTML)(this.document, data.html));
|
|
38
|
+
// The oEmbed HTML contains a <blockquote> with <p> tags for text
|
|
39
|
+
// and an <a> tag for the date
|
|
40
|
+
const blockquote = div.querySelector('blockquote');
|
|
41
|
+
const paragraphs = blockquote?.querySelectorAll('p') || [];
|
|
42
|
+
const tweetText = Array.from(paragraphs)
|
|
43
|
+
.map(p => `<p>${(0, dom_1.serializeHTML)(p)}</p>`)
|
|
44
|
+
.join('\n');
|
|
45
|
+
const handle = data.author_url
|
|
46
|
+
? `@${data.author_url.split('/').pop()}`
|
|
47
|
+
: '';
|
|
48
|
+
const dateLink = blockquote?.querySelector('a:last-child');
|
|
49
|
+
const dateText = dateLink?.textContent?.trim() || '';
|
|
50
|
+
const permalink = dateLink?.getAttribute('href') || this.url;
|
|
51
|
+
const escapedAuthorName = (0, dom_1.escapeHtml)(data.author_name);
|
|
52
|
+
const escapedHandle = (0, dom_1.escapeHtml)(handle);
|
|
53
|
+
const escapedDateText = (0, dom_1.escapeHtml)(dateText);
|
|
54
|
+
const escapedPermalink = (0, dom_1.escapeHtml)(permalink);
|
|
55
|
+
const contentHtml = `
|
|
56
|
+
<div class="tweet-thread">
|
|
57
|
+
<div class="main-tweet">
|
|
58
|
+
<div class="tweet">
|
|
59
|
+
<div class="tweet-header">
|
|
60
|
+
<span class="tweet-author"><strong>${escapedAuthorName}</strong> <span class="tweet-handle">${escapedHandle}</span></span>
|
|
61
|
+
${dateText ? `<a href="${escapedPermalink}" class="tweet-date">${escapedDateText}</a>` : ''}
|
|
62
|
+
</div>
|
|
63
|
+
${tweetText ? `<div class="tweet-text">${tweetText}</div>` : ''}
|
|
64
|
+
</div>
|
|
65
|
+
</div>
|
|
66
|
+
</div>
|
|
67
|
+
`.trim();
|
|
68
|
+
return {
|
|
69
|
+
content: contentHtml,
|
|
70
|
+
contentHtml: contentHtml,
|
|
71
|
+
variables: {
|
|
72
|
+
title: `Post by ${handle || data.author_name}`,
|
|
73
|
+
author: handle || data.author_name,
|
|
74
|
+
site: 'X (Twitter)',
|
|
75
|
+
}
|
|
76
|
+
};
|
|
77
|
+
}
|
|
78
|
+
async tryExtractFxTwitter() {
|
|
79
|
+
const match = this.url.match(/\/([a-zA-Z][a-zA-Z0-9_]{0,14})\/(status|article)\/(\d+)/);
|
|
80
|
+
if (!match)
|
|
81
|
+
return null;
|
|
82
|
+
try {
|
|
83
|
+
const data = await this.fetchFxTwitter(match[1], match[3]);
|
|
84
|
+
// If it's an article, use the rich article renderer
|
|
85
|
+
if (data.tweet?.article) {
|
|
86
|
+
return this.buildArticleResult(data);
|
|
87
|
+
}
|
|
88
|
+
// Otherwise use the full tweet text from FxTwitter
|
|
89
|
+
if (data.tweet?.text) {
|
|
90
|
+
return this.buildTweetResult(data);
|
|
91
|
+
}
|
|
92
|
+
return null;
|
|
93
|
+
}
|
|
94
|
+
catch {
|
|
95
|
+
return null;
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
async fetchFxTwitter(username, id) {
|
|
99
|
+
const apiUrl = `https://api.fxtwitter.com/${username}/status/${id}`;
|
|
100
|
+
const response = await fetch(apiUrl, {
|
|
101
|
+
headers: {
|
|
102
|
+
'User-Agent': 'Mozilla/5.0 (compatible; Defuddle/1.0; +https://defuddle.md)',
|
|
103
|
+
},
|
|
104
|
+
});
|
|
105
|
+
if (!response.ok) {
|
|
106
|
+
throw new Error(`FxTwitter API request failed: ${response.status}`);
|
|
107
|
+
}
|
|
108
|
+
return response.json();
|
|
109
|
+
}
|
|
110
|
+
buildArticleResult(data) {
|
|
111
|
+
const article = data.tweet.article;
|
|
112
|
+
const { blocks, entityMap } = article.content;
|
|
113
|
+
const contentHtml = this.renderArticle(blocks, entityMap, article.cover_media);
|
|
114
|
+
const handle = `@${data.tweet.author.screen_name}`;
|
|
115
|
+
return {
|
|
116
|
+
content: contentHtml,
|
|
117
|
+
contentHtml,
|
|
118
|
+
variables: {
|
|
119
|
+
title: article.title,
|
|
120
|
+
author: handle,
|
|
121
|
+
site: 'X (Twitter)',
|
|
122
|
+
description: article.preview_text,
|
|
123
|
+
}
|
|
124
|
+
};
|
|
125
|
+
}
|
|
126
|
+
buildTweetResult(data) {
|
|
127
|
+
const tweet = data.tweet;
|
|
128
|
+
const handle = `@${tweet.author.screen_name}`;
|
|
129
|
+
const contentHtml = this.renderTweet(tweet);
|
|
130
|
+
return {
|
|
131
|
+
content: contentHtml,
|
|
132
|
+
contentHtml,
|
|
133
|
+
variables: {
|
|
134
|
+
title: `Post by ${handle}`,
|
|
135
|
+
author: handle,
|
|
136
|
+
site: 'X (Twitter)',
|
|
137
|
+
}
|
|
138
|
+
};
|
|
139
|
+
}
|
|
140
|
+
renderTweet(tweet) {
|
|
141
|
+
const text = tweet.raw_text?.text || tweet.text;
|
|
142
|
+
// Filter out media facets — FxTwitter already strips pic.twitter.com
|
|
143
|
+
// links from the text, so media facet indices are stale
|
|
144
|
+
const facets = (tweet.raw_text?.facets || []).filter(f => f.type !== 'media');
|
|
145
|
+
// Split text into paragraphs on double newlines
|
|
146
|
+
const paragraphs = text.split(/\n\n+/);
|
|
147
|
+
let offset = 0;
|
|
148
|
+
const htmlParts = [];
|
|
149
|
+
for (const para of paragraphs) {
|
|
150
|
+
const paraStart = text.indexOf(para, offset);
|
|
151
|
+
const paraEnd = paraStart + para.length;
|
|
152
|
+
offset = paraEnd;
|
|
153
|
+
// Check if this paragraph is a blockquote (starts with >)
|
|
154
|
+
const isBlockquote = para.trimStart().startsWith('>');
|
|
155
|
+
let paraText = isBlockquote ? para.trimStart().slice(1).trimStart() : para;
|
|
156
|
+
const paraTextStart = isBlockquote
|
|
157
|
+
? paraStart + (para.length - para.trimStart().length) + 1 + (para.trimStart().slice(1).length - para.trimStart().slice(1).trimStart().length)
|
|
158
|
+
: paraStart;
|
|
159
|
+
// Apply facets within this paragraph
|
|
160
|
+
const rendered = this.applyFacets(paraText, paraTextStart, paraEnd, facets);
|
|
161
|
+
// Handle line breaks within paragraph
|
|
162
|
+
const withBreaks = rendered.replace(/\n/g, '<br>');
|
|
163
|
+
if (isBlockquote) {
|
|
164
|
+
htmlParts.push(`<blockquote><p>${withBreaks}</p></blockquote>`);
|
|
165
|
+
}
|
|
166
|
+
else if (withBreaks.trim()) {
|
|
167
|
+
htmlParts.push(`<p>${withBreaks}</p>`);
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
// Append media images
|
|
171
|
+
if (tweet.media?.photos) {
|
|
172
|
+
for (const photo of tweet.media.photos) {
|
|
173
|
+
htmlParts.push(`<img src="${(0, dom_1.escapeHtml)(photo.url)}" alt="">`);
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
const handle = (0, dom_1.escapeHtml)(`@${tweet.author.screen_name}`);
|
|
177
|
+
const authorName = (0, dom_1.escapeHtml)(tweet.author.name);
|
|
178
|
+
return `<div class="tweet-thread"><div class="main-tweet"><div class="tweet">` +
|
|
179
|
+
`<div class="tweet-header"><span class="tweet-author"><strong>${authorName}</strong> <span class="tweet-handle">${handle}</span></span></div>` +
|
|
180
|
+
`<div class="tweet-text">${htmlParts.join('\n')}</div>` +
|
|
181
|
+
`</div></div></div>`;
|
|
182
|
+
}
|
|
183
|
+
applyMarkers(text, markers) {
|
|
184
|
+
if (markers.length === 0) {
|
|
185
|
+
return (0, dom_1.escapeHtml)(text);
|
|
186
|
+
}
|
|
187
|
+
markers.sort((a, b) => {
|
|
188
|
+
if (a.offset !== b.offset)
|
|
189
|
+
return a.offset - b.offset;
|
|
190
|
+
if (a.type === 'close' && b.type === 'open')
|
|
191
|
+
return -1;
|
|
192
|
+
if (a.type === 'open' && b.type === 'close')
|
|
193
|
+
return 1;
|
|
194
|
+
return 0;
|
|
195
|
+
});
|
|
196
|
+
let result = '';
|
|
197
|
+
let pos = 0;
|
|
198
|
+
for (const marker of markers) {
|
|
199
|
+
if (marker.offset > pos) {
|
|
200
|
+
result += (0, dom_1.escapeHtml)(text.slice(pos, marker.offset));
|
|
201
|
+
}
|
|
202
|
+
result += marker.tag;
|
|
203
|
+
pos = marker.offset;
|
|
204
|
+
}
|
|
205
|
+
if (pos < text.length) {
|
|
206
|
+
result += (0, dom_1.escapeHtml)(text.slice(pos));
|
|
207
|
+
}
|
|
208
|
+
return result;
|
|
209
|
+
}
|
|
210
|
+
applyFacets(text, textStart, textEnd, facets) {
|
|
211
|
+
const markers = [];
|
|
212
|
+
for (const facet of facets) {
|
|
213
|
+
const [fStart, fEnd] = facet.indices;
|
|
214
|
+
if (fEnd <= textStart || fStart >= textEnd)
|
|
215
|
+
continue;
|
|
216
|
+
const relStart = Math.max(0, fStart - textStart);
|
|
217
|
+
const relEnd = Math.min(text.length, fEnd - textStart);
|
|
218
|
+
if (facet.type === 'italic') {
|
|
219
|
+
markers.push({ offset: relStart, type: 'open', tag: '<em>' });
|
|
220
|
+
markers.push({ offset: relEnd, type: 'close', tag: '</em>' });
|
|
221
|
+
}
|
|
222
|
+
else if (facet.type === 'mention' && facet.text) {
|
|
223
|
+
const url = `https://x.com/${(0, dom_1.escapeHtml)(facet.text)}`;
|
|
224
|
+
markers.push({ offset: relStart, type: 'open', tag: `<a href="${url}">` });
|
|
225
|
+
markers.push({ offset: relEnd, type: 'close', tag: '</a>' });
|
|
226
|
+
}
|
|
227
|
+
else if (facet.type === 'url' && facet.original) {
|
|
228
|
+
const url = (0, dom_1.escapeHtml)(facet.original);
|
|
229
|
+
markers.push({ offset: relStart, type: 'open', tag: `<a href="${url}">` });
|
|
230
|
+
markers.push({ offset: relEnd, type: 'close', tag: '</a>' });
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
return this.applyMarkers(text, markers);
|
|
234
|
+
}
|
|
235
|
+
renderArticle(blocks, entityMap, coverMedia) {
|
|
236
|
+
const parts = [];
|
|
237
|
+
// Add cover image if available
|
|
238
|
+
if (coverMedia?.media_info?.original_img_url) {
|
|
239
|
+
parts.push(`<img src="${(0, dom_1.escapeHtml)(coverMedia.media_info.original_img_url)}" alt="Cover image">`);
|
|
240
|
+
}
|
|
241
|
+
let i = 0;
|
|
242
|
+
while (i < blocks.length) {
|
|
243
|
+
const block = blocks[i];
|
|
244
|
+
if (block.type === 'unordered-list-item') {
|
|
245
|
+
// Group consecutive list items into a <ul>
|
|
246
|
+
const items = [];
|
|
247
|
+
while (i < blocks.length && blocks[i].type === 'unordered-list-item') {
|
|
248
|
+
items.push(`<li>${this.renderInlineContent(blocks[i], entityMap)}</li>`);
|
|
249
|
+
i++;
|
|
250
|
+
}
|
|
251
|
+
parts.push(`<ul>${items.join('')}</ul>`);
|
|
252
|
+
continue;
|
|
253
|
+
}
|
|
254
|
+
const html = this.renderBlock(block, entityMap);
|
|
255
|
+
if (html) {
|
|
256
|
+
parts.push(html);
|
|
257
|
+
}
|
|
258
|
+
i++;
|
|
259
|
+
}
|
|
260
|
+
return `<article class="x-article">${parts.join('')}</article>`;
|
|
261
|
+
}
|
|
262
|
+
renderBlock(block, entityMap) {
|
|
263
|
+
switch (block.type) {
|
|
264
|
+
case 'unstyled': {
|
|
265
|
+
if (!block.text.trim())
|
|
266
|
+
return '';
|
|
267
|
+
return `<p>${this.renderInlineContent(block, entityMap)}</p>`;
|
|
268
|
+
}
|
|
269
|
+
case 'header-two':
|
|
270
|
+
return `<h2>${this.renderInlineContent(block, entityMap)}</h2>`;
|
|
271
|
+
case 'header-three':
|
|
272
|
+
return `<h3>${this.renderInlineContent(block, entityMap)}</h3>`;
|
|
273
|
+
case 'atomic':
|
|
274
|
+
return this.renderAtomicBlock(block, entityMap);
|
|
275
|
+
default: {
|
|
276
|
+
if (!block.text.trim())
|
|
277
|
+
return '';
|
|
278
|
+
return `<p>${this.renderInlineContent(block, entityMap)}</p>`;
|
|
279
|
+
}
|
|
280
|
+
}
|
|
281
|
+
}
|
|
282
|
+
renderAtomicBlock(block, entityMap) {
|
|
283
|
+
if (block.entityRanges.length === 0)
|
|
284
|
+
return '';
|
|
285
|
+
const entityEntry = entityMap.find(e => e.key === String(block.entityRanges[0].key));
|
|
286
|
+
if (!entityEntry)
|
|
287
|
+
return '';
|
|
288
|
+
const entity = entityEntry.value;
|
|
289
|
+
switch (entity.type) {
|
|
290
|
+
case 'MEDIA': {
|
|
291
|
+
const caption = entity.data.caption;
|
|
292
|
+
if (caption) {
|
|
293
|
+
return `<figure><figcaption>${(0, dom_1.escapeHtml)(caption)}</figcaption></figure>`;
|
|
294
|
+
}
|
|
295
|
+
return '';
|
|
296
|
+
}
|
|
297
|
+
case 'MARKDOWN': {
|
|
298
|
+
const markdown = entity.data.markdown || '';
|
|
299
|
+
// Strip the wrapping ```...``` fences
|
|
300
|
+
const codeMatch = markdown.match(/^```(\w*)\n([\s\S]*?)\n?```$/);
|
|
301
|
+
if (codeMatch) {
|
|
302
|
+
const lang = codeMatch[1];
|
|
303
|
+
const code = codeMatch[2];
|
|
304
|
+
const langAttr = lang ? ` class="language-${(0, dom_1.escapeHtml)(lang)}" data-lang="${(0, dom_1.escapeHtml)(lang)}"` : '';
|
|
305
|
+
return `<pre><code${langAttr}>${(0, dom_1.escapeHtml)(code)}</code></pre>`;
|
|
306
|
+
}
|
|
307
|
+
return `<pre><code>${(0, dom_1.escapeHtml)(markdown)}</code></pre>`;
|
|
308
|
+
}
|
|
309
|
+
default:
|
|
310
|
+
return '';
|
|
311
|
+
}
|
|
312
|
+
}
|
|
313
|
+
renderInlineContent(block, entityMap) {
|
|
314
|
+
const text = block.text;
|
|
315
|
+
if (!text)
|
|
316
|
+
return '';
|
|
317
|
+
const markers = [];
|
|
318
|
+
for (const range of block.inlineStyleRanges) {
|
|
319
|
+
if (range.style === 'Bold') {
|
|
320
|
+
markers.push({ offset: range.offset, type: 'open', tag: '<strong>' });
|
|
321
|
+
markers.push({ offset: range.offset + range.length, type: 'close', tag: '</strong>' });
|
|
322
|
+
}
|
|
323
|
+
}
|
|
324
|
+
for (const range of block.entityRanges) {
|
|
325
|
+
const entityEntry = entityMap.find(e => e.key === String(range.key));
|
|
326
|
+
if (entityEntry?.value.type === 'LINK' && entityEntry.value.data.url) {
|
|
327
|
+
const url = (0, dom_1.escapeHtml)(entityEntry.value.data.url);
|
|
328
|
+
markers.push({ offset: range.offset, type: 'open', tag: `<a href="${url}">` });
|
|
329
|
+
markers.push({ offset: range.offset + range.length, type: 'close', tag: '</a>' });
|
|
330
|
+
}
|
|
331
|
+
}
|
|
332
|
+
if (block.data?.mentions) {
|
|
333
|
+
for (const mention of block.data.mentions) {
|
|
334
|
+
const url = `https://x.com/${(0, dom_1.escapeHtml)(mention.text)}`;
|
|
335
|
+
markers.push({ offset: mention.fromIndex, type: 'open', tag: `<a href="${url}">` });
|
|
336
|
+
markers.push({ offset: mention.toIndex, type: 'close', tag: '</a>' });
|
|
337
|
+
}
|
|
338
|
+
}
|
|
339
|
+
if (block.data?.urls) {
|
|
340
|
+
for (const urlData of block.data.urls) {
|
|
341
|
+
const url = (0, dom_1.escapeHtml)(urlData.text);
|
|
342
|
+
markers.push({ offset: urlData.fromIndex, type: 'open', tag: `<a href="${url}">` });
|
|
343
|
+
markers.push({ offset: urlData.toIndex, type: 'close', tag: '</a>' });
|
|
344
|
+
}
|
|
345
|
+
}
|
|
346
|
+
return this.applyMarkers(text, markers);
|
|
347
|
+
}
|
|
348
|
+
}
|
|
349
|
+
// Expose the extractor class on the CommonJS `exports` object.
exports.XOembedExtractor = XOembedExtractor;
|
|
350
|
+
//# sourceMappingURL=x-oembed.js.map
|