@mz1999/defuddle 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123):
  1. package/LICENSE +21 -0
  2. package/README.md +371 -0
  3. package/dist/cli.d.ts +2 -0
  4. package/dist/cli.js +145 -0
  5. package/dist/cli.js.map +1 -0
  6. package/dist/constants.d.ts +24 -0
  7. package/dist/constants.js +950 -0
  8. package/dist/constants.js.map +1 -0
  9. package/dist/defuddle.d.ts +136 -0
  10. package/dist/defuddle.js +1816 -0
  11. package/dist/defuddle.js.map +1 -0
  12. package/dist/elements/callouts.d.ts +6 -0
  13. package/dist/elements/callouts.js +74 -0
  14. package/dist/elements/callouts.js.map +1 -0
  15. package/dist/elements/code.d.ts +5 -0
  16. package/dist/elements/code.js +346 -0
  17. package/dist/elements/code.js.map +1 -0
  18. package/dist/elements/footnotes.d.ts +5 -0
  19. package/dist/elements/footnotes.js +619 -0
  20. package/dist/elements/footnotes.js.map +1 -0
  21. package/dist/elements/headings.d.ts +11 -0
  22. package/dist/elements/headings.js +100 -0
  23. package/dist/elements/headings.js.map +1 -0
  24. package/dist/elements/images.d.ts +8 -0
  25. package/dist/elements/images.js +877 -0
  26. package/dist/elements/images.js.map +1 -0
  27. package/dist/elements/math.base.d.ts +9 -0
  28. package/dist/elements/math.base.js +195 -0
  29. package/dist/elements/math.base.js.map +1 -0
  30. package/dist/elements/math.core.d.ts +7 -0
  31. package/dist/elements/math.core.js +52 -0
  32. package/dist/elements/math.core.js.map +1 -0
  33. package/dist/elements/math.d.ts +2 -0
  34. package/dist/elements/math.full.d.ts +8 -0
  35. package/dist/elements/math.js +7 -0
  36. package/dist/elements/math.js.map +1 -0
  37. package/dist/extractor-registry.d.ts +16 -0
  38. package/dist/extractor-registry.js +140 -0
  39. package/dist/extractor-registry.js.map +1 -0
  40. package/dist/extractors/_base.d.ts +22 -0
  41. package/dist/extractors/_base.js +27 -0
  42. package/dist/extractors/_base.js.map +1 -0
  43. package/dist/extractors/_conversation.d.ts +9 -0
  44. package/dist/extractors/_conversation.js +78 -0
  45. package/dist/extractors/_conversation.js.map +1 -0
  46. package/dist/extractors/chatgpt.d.ts +14 -0
  47. package/dist/extractors/chatgpt.js +138 -0
  48. package/dist/extractors/chatgpt.js.map +1 -0
  49. package/dist/extractors/claude.d.ts +10 -0
  50. package/dist/extractors/claude.js +91 -0
  51. package/dist/extractors/claude.js.map +1 -0
  52. package/dist/extractors/gemini.d.ts +14 -0
  53. package/dist/extractors/gemini.js +111 -0
  54. package/dist/extractors/gemini.js.map +1 -0
  55. package/dist/extractors/github.d.ts +20 -0
  56. package/dist/extractors/github.js +251 -0
  57. package/dist/extractors/github.js.map +1 -0
  58. package/dist/extractors/grok.d.ts +15 -0
  59. package/dist/extractors/grok.js +142 -0
  60. package/dist/extractors/grok.js.map +1 -0
  61. package/dist/extractors/hackernews.d.ts +21 -0
  62. package/dist/extractors/hackernews.js +155 -0
  63. package/dist/extractors/hackernews.js.map +1 -0
  64. package/dist/extractors/reddit.d.ts +22 -0
  65. package/dist/extractors/reddit.js +197 -0
  66. package/dist/extractors/reddit.js.map +1 -0
  67. package/dist/extractors/twitter.d.ts +16 -0
  68. package/dist/extractors/twitter.js +204 -0
  69. package/dist/extractors/twitter.js.map +1 -0
  70. package/dist/extractors/x-article.d.ts +24 -0
  71. package/dist/extractors/x-article.js +267 -0
  72. package/dist/extractors/x-article.js.map +1 -0
  73. package/dist/extractors/x-oembed.d.ts +20 -0
  74. package/dist/extractors/x-oembed.js +350 -0
  75. package/dist/extractors/x-oembed.js.map +1 -0
  76. package/dist/extractors/youtube.d.ts +87 -0
  77. package/dist/extractors/youtube.js +869 -0
  78. package/dist/extractors/youtube.js.map +1 -0
  79. package/dist/fetch.d.ts +18 -0
  80. package/dist/fetch.js +265 -0
  81. package/dist/fetch.js.map +1 -0
  82. package/dist/index.d.ts +3 -0
  83. package/dist/index.full.d.ts +12 -0
  84. package/dist/index.full.js +1 -0
  85. package/dist/index.js +1 -0
  86. package/dist/index.js.map +1 -0
  87. package/dist/markdown.d.ts +30 -0
  88. package/dist/markdown.js +661 -0
  89. package/dist/markdown.js.map +1 -0
  90. package/dist/metadata.d.ts +25 -0
  91. package/dist/metadata.js +426 -0
  92. package/dist/metadata.js.map +1 -0
  93. package/dist/node.d.ts +19 -0
  94. package/dist/node.js +78 -0
  95. package/dist/node.js.map +1 -0
  96. package/dist/scoring.d.ts +31 -0
  97. package/dist/scoring.js +472 -0
  98. package/dist/scoring.js.map +1 -0
  99. package/dist/standardize.d.ts +2 -0
  100. package/dist/standardize.js +1101 -0
  101. package/dist/standardize.js.map +1 -0
  102. package/dist/types/extractors.d.ts +41 -0
  103. package/dist/types/extractors.js +3 -0
  104. package/dist/types/extractors.js.map +1 -0
  105. package/dist/types.d.ts +135 -0
  106. package/dist/types.js +3 -0
  107. package/dist/types.js.map +1 -0
  108. package/dist/utils/comments.d.ts +44 -0
  109. package/dist/utils/comments.js +103 -0
  110. package/dist/utils/comments.js.map +1 -0
  111. package/dist/utils/dom.d.ts +42 -0
  112. package/dist/utils/dom.js +104 -0
  113. package/dist/utils/dom.js.map +1 -0
  114. package/dist/utils/linkedom-compat.d.ts +5 -0
  115. package/dist/utils/linkedom-compat.js +23 -0
  116. package/dist/utils/linkedom-compat.js.map +1 -0
  117. package/dist/utils/transcript.d.ts +37 -0
  118. package/dist/utils/transcript.js +61 -0
  119. package/dist/utils/transcript.js.map +1 -0
  120. package/dist/utils.d.ts +13 -0
  121. package/dist/utils.js +98 -0
  122. package/dist/utils.js.map +1 -0
  123. package/package.json +107 -0
@@ -0,0 +1,267 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.XArticleExtractor = void 0;
4
+ const _base_1 = require("./_base");
5
+ const dom_1 = require("../utils/dom");
6
/**
 * CSS selectors used to locate the parts of an X (Twitter) article page that
 * the extractor reads or rewrites. Frozen so that no code path can
 * accidentally mutate this shared, module-level lookup table.
 */
const SELECTORS = Object.freeze({
    // Article body, title and author metadata
    ARTICLE_CONTAINER: '[data-testid="twitterArticleRichTextView"]',
    TITLE: '[data-testid="twitter-article-title"]',
    AUTHOR: '[itemprop="author"]',
    AUTHOR_NAME: 'meta[itemprop="name"]',
    AUTHOR_HANDLE: 'meta[itemprop="additionalName"]',
    // Inline content inside the article body
    IMAGES: '[data-testid="tweetPhoto"] img',
    DRAFT_PARAGRAPHS: '.longform-unstyled, .public-DraftStyleDefault-block',
    BOLD_SPANS: 'span[style*="font-weight: bold"]',
    DRAFT_ATTRIBUTES: '[data-offset-key]',
    // Embedded tweets
    EMBEDDED_TWEET: '[data-testid="simpleTweet"]',
    TWEET_TEXT: '[data-testid="tweetText"]',
    USER_NAME: '[data-testid="User-Name"]',
    // Block-level widgets
    CODE_BLOCK: '[data-testid="markdown-code-block"]',
    HEADER_BLOCK: '[data-testid="longform-header"]',
});
22
/**
 * Extracts long-form X (Twitter) articles from the rendered page DOM.
 *
 * The extractor clones the article container, rewrites X's editor markup
 * (embedded tweets, code blocks, headings, images, bold spans and draft
 * paragraphs) into plain semantic HTML, and derives title/author/description
 * metadata from the page and its URL.
 */
class XArticleExtractor extends _base_1.BaseExtractor {
    /**
     * @param {Document} document - Document of the page being extracted.
     * @param {string} url - Page URL; used for the author and article-id fallbacks.
     * @param {*} schemaOrgData - Forwarded unchanged to the base extractor.
     */
    constructor(document, url, schemaOrgData) {
        super(document, url, schemaOrgData);
        this.articleContainer = document.querySelector(SELECTORS.ARTICLE_CONTAINER);
    }

    /**
     * @returns {boolean} True when the page contains an article container.
     */
    canExtract() {
        return !!this.articleContainer;
    }

    /**
     * Builds the extraction result: cleaned article HTML plus metadata.
     * @returns {object} Result with `content`, `contentHtml`,
     *   `extractedContent.articleId` and `variables` (title, author, site, description).
     */
    extract() {
        const title = this.extractTitle();
        const author = this.extractAuthor();
        const contentHtml = this.extractContent();
        const description = this.createDescription();
        return {
            content: contentHtml,
            contentHtml,
            extractedContent: {
                articleId: this.getArticleId(),
            },
            variables: {
                title,
                author,
                site: 'X (Twitter)',
                description,
            },
        };
    }

    /**
     * @returns {string} Trimmed article title, or 'Untitled X Article' when absent.
     */
    extractTitle() {
        const titleEl = this.document.querySelector(SELECTORS.TITLE);
        return titleEl?.textContent?.trim() || 'Untitled X Article';
    }

    /**
     * Reads the author from the page's `itemprop="author"` microdata, falling
     * back to the URL (and then the og:title) when it is missing.
     * @returns {string} "Name (@handle)", a bare name/handle, or a fallback value.
     */
    extractAuthor() {
        const authorContainer = this.document.querySelector(SELECTORS.AUTHOR);
        if (!authorContainer) {
            return this.getAuthorFromUrl();
        }
        const name = authorContainer.querySelector(SELECTORS.AUTHOR_NAME)?.getAttribute('content');
        const handle = authorContainer.querySelector(SELECTORS.AUTHOR_HANDLE)?.getAttribute('content');
        if (name && handle) {
            return `${name} (@${handle})`;
        }
        return name || handle || this.getAuthorFromUrl();
    }

    /**
     * Derives "@username" from a ".../<username>/article/<id>" URL.
     * @returns {string} The handle, or the og:title fallback when the URL has
     *   no username segment.
     */
    getAuthorFromUrl() {
        const match = this.url.match(/\/([a-zA-Z][a-zA-Z0-9_]{0,14})\/article\/\d+/);
        // The pattern alone also matches system paths such as "/i/article/<id>",
        // which are not usernames; skip those so the author is never reported as "@i".
        if (match && match[1].toLowerCase() !== 'i') {
            return `@${match[1]}`;
        }
        return this.getAuthorFromOgTitle();
    }

    /**
     * Extracts the author name from an og:title such as
     * "(4) Heinrich on X: ..." or "Heinrich on X: ...".
     * @returns {string} The author name, or 'Unknown' when the pattern does not match.
     */
    getAuthorFromOgTitle() {
        const ogTitle = this.document.querySelector('meta[property="og:title"]')?.getAttribute('content') || '';
        const match = ogTitle.match(/^(?:\(\d+\)\s+)?(.+?)\s+on\s+X\s*:/);
        return match ? match[1].trim() : 'Unknown';
    }

    /**
     * @returns {string} Numeric id following "article/" in the URL, or '' if none.
     */
    getArticleId() {
        const match = this.url.match(/article\/(\d+)/);
        return match ? match[1] : '';
    }

    /**
     * Clones and cleans the article container and serializes it.
     * @returns {string} `<article class="x-article">…</article>` HTML, or ''
     *   when there is no article container.
     */
    extractContent() {
        if (!this.articleContainer) {
            return '';
        }
        // Work on a clone so the live page DOM is left untouched.
        const clone = this.articleContainer.cloneNode(true);
        this.cleanContent(clone);
        return `<article class="x-article">${(0, dom_1.serializeHTML)(clone)}</article>`;
    }

    /**
     * Runs every clean-up pass over the cloned container, in order.
     * @param {Element} container - Cloned article container (mutated in place).
     */
    cleanContent(container) {
        const ownerDoc = container.ownerDocument || this.document;
        // Convert complex widgets first, before the generic transformations
        // below flatten their markup.
        this.convertEmbeddedTweets(container, ownerDoc);
        this.convertCodeBlocks(container, ownerDoc);
        this.convertHeaders(container, ownerDoc);
        this.unwrapLinkedImages(container, ownerDoc);
        this.upgradeImageQuality(container);
        // Bold spans must become <strong> BEFORE paragraphs are rebuilt,
        // because the paragraph pass only preserves strong/a/code elements.
        this.convertBoldSpans(container, ownerDoc);
        this.convertDraftParagraphs(container, ownerDoc);
        this.removeDraftAttributes(container);
    }

    /**
     * Replaces each embedded tweet with
     * `<blockquote class="embedded-tweet"><cite>…</cite><p>…</p></blockquote>`.
     * @param {Element} container - Element to rewrite.
     * @param {Document} ownerDoc - Document used to create replacement nodes.
     */
    convertEmbeddedTweets(container, ownerDoc) {
        container.querySelectorAll(SELECTORS.EMBEDDED_TWEET).forEach(tweet => {
            const blockquote = ownerDoc.createElement('blockquote');
            blockquote.className = 'embedded-tweet';
            // The first two links in the user-name element are read as the
            // display name and the handle, in that order.
            const authorLinks = tweet.querySelector(SELECTORS.USER_NAME)?.querySelectorAll('a');
            const fullName = authorLinks?.[0]?.textContent?.trim() || '';
            const handle = authorLinks?.[1]?.textContent?.trim() || '';
            const tweetText = tweet.querySelector(SELECTORS.TWEET_TEXT)?.textContent?.trim() || '';
            if (fullName || handle) {
                const cite = ownerDoc.createElement('cite');
                // Join only the parts that exist so a missing display name
                // does not leave a leading space before the handle.
                cite.textContent = [fullName, handle].filter(Boolean).join(' ');
                blockquote.appendChild(cite);
            }
            if (tweetText) {
                const p = ownerDoc.createElement('p');
                p.textContent = tweetText;
                blockquote.appendChild(p);
            }
            tweet.replaceWith(blockquote);
        });
    }

    /**
     * Replaces each code-block widget with a clean `<pre><code>` pair,
     * carrying the language over as `data-lang` and a `language-*` class.
     * @param {Element} container - Element to rewrite.
     * @param {Document} ownerDoc - Document used to create replacement nodes.
     */
    convertCodeBlocks(container, ownerDoc) {
        container.querySelectorAll(SELECTORS.CODE_BLOCK).forEach(block => {
            const pre = block.querySelector('pre');
            const code = block.querySelector('code');
            if (!pre || !code) {
                return;
            }
            let language = '';
            const langClass = code.className.match(/language-(\w+)/);
            if (langClass) {
                language = langClass[1];
            }
            else {
                // Fallback: use the first span outside <code> as the language
                // label. Spans inside <code> are part of the code itself and
                // must not be mistaken for a label.
                const langSpan = Array.from(block.querySelectorAll('span'))
                    .find(span => !code.contains(span));
                language = langSpan?.textContent?.trim() || '';
            }
            const newPre = ownerDoc.createElement('pre');
            const newCode = ownerDoc.createElement('code');
            if (language) {
                newCode.setAttribute('data-lang', language);
                newCode.className = `language-${language}`;
            }
            newCode.textContent = code.textContent || '';
            newPre.appendChild(newCode);
            // Replace the whole widget, not just its inner <pre>.
            block.replaceWith(newPre);
        });
    }

    /**
     * Flattens every non-empty heading to a bare element of the same level
     * containing only its trimmed text.
     * @param {Element} container - Element to rewrite.
     * @param {Document} ownerDoc - Document used to create replacement nodes.
     */
    convertHeaders(container, ownerDoc) {
        container.querySelectorAll('h1, h2, h3, h4, h5, h6').forEach(header => {
            const text = header.textContent?.trim() || '';
            if (!text) {
                return;
            }
            const newHeader = ownerDoc.createElement(header.tagName.toLowerCase());
            newHeader.textContent = text;
            header.replaceWith(newHeader);
        });
    }

    /**
     * Replaces each anchor that wraps a tweet photo with a clean `<img>`
     * pointing at the large variant of the photo.
     * @param {Element} container - Element to rewrite.
     * @param {Document} ownerDoc - Document used to create replacement nodes.
     */
    unwrapLinkedImages(container, ownerDoc) {
        container.querySelectorAll(SELECTORS.IMAGES).forEach(img => {
            const anchor = img.closest('a');
            // Ignore anchors outside the container (or already replaced).
            if (!anchor || !container.contains(anchor)) {
                return;
            }
            const alt = img.getAttribute('alt')?.replace(/\s+/g, ' ').trim() || 'Image';
            const cleanImg = ownerDoc.createElement('img');
            cleanImg.setAttribute('src', this.toLargeImageUrl(img.getAttribute('src') || ''));
            cleanImg.setAttribute('alt', alt);
            anchor.replaceWith(cleanImg);
        });
    }

    /**
     * Points every remaining tweet photo that has a `src` at its large variant.
     * @param {Element} container - Element to rewrite.
     */
    upgradeImageQuality(container) {
        container.querySelectorAll(SELECTORS.IMAGES).forEach(img => {
            const src = img.getAttribute('src');
            if (!src) {
                return;
            }
            img.setAttribute('src', this.toLargeImageUrl(src));
        });
    }

    /**
     * Returns `src` with its `name` query parameter set to `large`.
     *
     * An existing `name` value is replaced whether it is the first (`?name=`)
     * or a later (`&name=`) parameter, so the parameter is never duplicated;
     * otherwise it is appended. An empty `src` is returned unchanged so no
     * bogus "?name=large" URL is produced.
     * @param {string} src - Image URL.
     * @returns {string} URL requesting the large image variant.
     */
    toLargeImageUrl(src) {
        if (!src) {
            return '';
        }
        const sizeParam = /([?&])name=\w*/;
        if (sizeParam.test(src)) {
            return src.replace(sizeParam, '$1name=large');
        }
        return `${src}${src.includes('?') ? '&' : '?'}name=large`;
    }

    /**
     * Rebuilds each draft paragraph block as a `<p>`, keeping text plus
     * flattened `<strong>`, `<a href>` and `<code>` elements and dropping
     * every other wrapper element.
     * @param {Element} container - Element to rewrite.
     * @param {Document} ownerDoc - Document used to create replacement nodes.
     */
    convertDraftParagraphs(container, ownerDoc) {
        // Numeric node-type constants: the `Node` global is not available in
        // every environment this code runs in.
        const TEXT_NODE = 3;
        const ELEMENT_NODE = 1;
        container.querySelectorAll(SELECTORS.DRAFT_PARAGRAPHS).forEach(block => {
            // A block nested inside an already-replaced block is detached from
            // the container; rebuilding it would have no visible effect.
            if (!container.contains(block)) {
                return;
            }
            const p = ownerDoc.createElement('p');
            const appendFlattened = (tagName, source) => {
                const el = ownerDoc.createElement(tagName);
                if (tagName === 'a') {
                    el.setAttribute('href', source.getAttribute('href') || '');
                }
                el.textContent = source.textContent || '';
                p.appendChild(el);
            };
            const processNode = (node) => {
                if (node.nodeType === TEXT_NODE) {
                    p.appendChild(ownerDoc.createTextNode(node.textContent || ''));
                    return;
                }
                if (node.nodeType !== ELEMENT_NODE) {
                    return;
                }
                const tag = node.tagName.toLowerCase();
                if (tag === 'strong' || tag === 'a' || tag === 'code') {
                    appendFlattened(tag, node);
                }
                else {
                    // Recurse through wrappers (spans, divs, ...) to reach their content.
                    node.childNodes.forEach(child => processNode(child));
                }
            };
            block.childNodes.forEach(child => processNode(child));
            block.replaceWith(p);
        });
    }

    /**
     * Replaces inline-styled bold spans with `<strong>` elements.
     * @param {Element} container - Element to rewrite.
     * @param {Document} ownerDoc - Document used to create replacement nodes.
     */
    convertBoldSpans(container, ownerDoc) {
        container.querySelectorAll(SELECTORS.BOLD_SPANS).forEach(span => {
            const strong = ownerDoc.createElement('strong');
            strong.textContent = span.textContent || '';
            span.replaceWith(strong);
        });
    }

    /**
     * Strips the editor's `data-offset-key` attribute from all elements.
     * @param {Element} container - Element to rewrite.
     */
    removeDraftAttributes(container) {
        container.querySelectorAll(SELECTORS.DRAFT_ATTRIBUTES).forEach(el => {
            el.removeAttribute('data-offset-key');
        });
    }

    /**
     * @returns {string} First 140 characters of the article text, followed by
     *   '...' when the text was longer.
     */
    createDescription() {
        const text = this.articleContainer?.textContent?.trim() || '';
        return text.slice(0, 140) + (text.length > 140 ? '...' : '');
    }
}
266
+ exports.XArticleExtractor = XArticleExtractor;
267
+ //# sourceMappingURL=x-article.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"x-article.js","sourceRoot":"","sources":["../../src/extractors/x-article.ts"],"names":[],"mappings":";;;AAAA,mCAAwC;AAExC,sCAA6C;AAE7C,MAAM,SAAS,GAAG;IACjB,iBAAiB,EAAE,4CAA4C;IAC/D,KAAK,EAAE,uCAAuC;IAC9C,MAAM,EAAE,qBAAqB;IAC7B,WAAW,EAAE,uBAAuB;IACpC,aAAa,EAAE,iCAAiC;IAChD,MAAM,EAAE,gCAAgC;IACxC,gBAAgB,EAAE,qDAAqD;IACvE,UAAU,EAAE,kCAAkC;IAC9C,gBAAgB,EAAE,mBAAmB;IACrC,cAAc,EAAE,6BAA6B;IAC7C,UAAU,EAAE,2BAA2B;IACvC,SAAS,EAAE,2BAA2B;IACtC,UAAU,EAAE,qCAAqC;IACjD,YAAY,EAAE,iCAAiC;CACtC,CAAC;AAEX,MAAa,iBAAkB,SAAQ,qBAAa;IAGnD,YAAY,QAAkB,EAAE,GAAW,EAAE,aAAmB;QAC/D,KAAK,CAAC,QAAQ,EAAE,GAAG,EAAE,aAAa,CAAC,CAAC;QACpC,IAAI,CAAC,gBAAgB,GAAG,QAAQ,CAAC,aAAa,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;IAC7E,CAAC;IAED,UAAU;QACT,OAAO,CAAC,CAAC,IAAI,CAAC,gBAAgB,CAAC;IAChC,CAAC;IAED,OAAO;QACN,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC;QAClC,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;QACpC,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,EAAE,CAAC;QAC1C,MAAM,WAAW,GAAG,IAAI,CAAC,iBAAiB,EAAE,CAAC;QAE7C,OAAO;YACN,OAAO,EAAE,WAAW;YACpB,WAAW;YACX,gBAAgB,EAAE;gBACjB,SAAS,EAAE,IAAI,CAAC,YAAY,EAAE;aAC9B;YACD,SAAS,EAAE;gBACV,KAAK;gBACL,MAAM;gBACN,IAAI,EAAE,aAAa;gBACnB,WAAW;aACX;SACD,CAAC;IACH,CAAC;IAEO,YAAY;QACnB,MAAM,OAAO,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;QAC7D,OAAO,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,oBAAoB,CAAC;IAC7D,CAAC;IAEO,aAAa;QACpB,MAAM,eAAe,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QACtE,IAAI,CAAC,eAAe;YAAE,OAAO,IAAI,CAAC,gBAAgB,EAAE,CAAC;QAErD,MAAM,IAAI,GAAG,eAAe,CAAC,aAAa,CAAC,SAAS,CAAC,WAAW,CAAC,EAAE,YAAY,CAAC,SAAS,CAAC,CAAC;QAC3F,MAAM,MAAM,GAAG,eAAe,CAAC,aAAa,CAAC,SAAS,CAAC,aAAa,CAAC,EAAE,YAAY,CAAC,SAAS,CAAC,CAAC;QAE/F,IAAI,IAAI,IAAI,MAAM;YAAE,OAAO,GAAG,IAAI,MAAM,MAAM,GAAG,CAAC;QAClD,OAAO,IAAI,IAAI,MAAM,IAAI,IAAI,CAAC,gBAAgB,EAAE,CAAC;IAClD,CAAC;IAEO,gBAAgB;QACvB,mEAAmE;QACnE,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,8CAA8C,CAAC,CAAC;QAC7E,OAAO,KAAK,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,oBAAoB,EAAE,CAAC;IAC7D,CAAC;IAEO,oB
AAoB;QAC3B,MAAM,OAAO,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,2BAA2B,CAAC,EAAE,YAAY,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC;QACxG,uEAAuE;QACvE,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,oCAAoC,CAAC,CAAC;QAClE,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;IAC5C,CAAC;IAEO,YAAY;QACnB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC;QAC/C,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IAC9B,CAAC;IAEO,cAAc;QACrB,IAAI,CAAC,IAAI,CAAC,gBAAgB;YAAE,OAAO,EAAE,CAAC;QAEtC,MAAM,KAAK,GAAG,IAAI,CAAC,gBAAgB,CAAC,SAAS,CAAC,IAAI,CAAgB,CAAC;QACnE,IAAI,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC;QAEzB,OAAO,8BAA8B,IAAA,mBAAa,EAAC,KAAK,CAAC,YAAY,CAAC;IACvE,CAAC;IAEO,YAAY,CAAC,SAAsB;QAC1C,MAAM,QAAQ,GAAG,SAAS,CAAC,aAAa,IAAI,IAAI,CAAC,QAAQ,CAAC;QAE1D,gEAAgE;QAChE,IAAI,CAAC,qBAAqB,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;QAChD,IAAI,CAAC,iBAAiB,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;QAC5C,IAAI,CAAC,cAAc,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;QACzC,IAAI,CAAC,kBAAkB,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;QAC7C,IAAI,CAAC,mBAAmB,CAAC,SAAS,CAAC,CAAC;QACpC,kEAAkE;QAClE,IAAI,CAAC,gBAAgB,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;QAC3C,IAAI,CAAC,sBAAsB,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;QACjD,IAAI,CAAC,qBAAqB,CAAC,SAAS,CAAC,CAAC;IACvC,CAAC;IAEO,qBAAqB,CAAC,SAAsB,EAAE,QAAkB;QACvE,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,cAAc,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE;YACpE,MAAM,UAAU,GAAG,QAAQ,CAAC,aAAa,CAAC,YAAY,CAAC,CAAC;YACxD,UAAU,CAAC,SAAS,GAAG,gBAAgB,CAAC;YAExC,sBAAsB;YACtB,MAAM,UAAU,GAAG,KAAK,CAAC,aAAa,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC;YAC5D,MAAM,WAAW,GAAG,UAAU,EAAE,gBAAgB,CAAC,GAAG,CAAC,CAAC;YACtD,MAAM,QAAQ,GAAG,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAC7D,MAAM,MAAM,GAAG,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAE3D,qBAAqB;YACrB,MAAM,WAAW,GAAG,KAAK,CAAC,aAAa,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC;YAC9D,MAAM,SAAS,GAAG,WAAW,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAEzD,iCAAiC;YACjC,IAAI,QAAQ,IAAI,MAAM,EAAE,CAAC;gBACxB,MAAM,IAAI,GAAG,QAAQ,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;gBAC5C,IAAI,CAAC,WAAW,GAAG,MA
AM,CAAC,CAAC,CAAC,GAAG,QAAQ,IAAI,MAAM,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC;gBAC/D,UAAU,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;YAC9B,CAAC;YAED,IAAI,SAAS,EAAE,CAAC;gBACf,MAAM,CAAC,GAAG,QAAQ,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC;gBACtC,CAAC,CAAC,WAAW,GAAG,SAAS,CAAC;gBAC1B,UAAU,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC;YAC3B,CAAC;YAED,KAAK,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC;QAC/B,CAAC,CAAC,CAAC;IACJ,CAAC;IAEO,iBAAiB,CAAC,SAAsB,EAAE,QAAkB;QACnE,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE;YAChE,MAAM,GAAG,GAAG,KAAK,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;YACvC,MAAM,IAAI,GAAG,KAAK,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;YACzC,IAAI,CAAC,GAAG,IAAI,CAAC,IAAI;gBAAE,OAAO;YAE1B,mEAAmE;YACnE,IAAI,QAAQ,GAAG,EAAE,CAAC;YAClB,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC;YACzD,IAAI,SAAS,EAAE,CAAC;gBACf,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;YACzB,CAAC;iBAAM,CAAC;gBACP,wDAAwD;gBACxD,MAAM,QAAQ,GAAG,KAAK,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;gBAC7C,QAAQ,GAAG,QAAQ,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAChD,CAAC;YAED,kCAAkC;YAClC,MAAM,MAAM,GAAG,QAAQ,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;YAC7C,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;YAC/C,IAAI,QAAQ,EAAE,CAAC;gBACd,OAAO,CAAC,YAAY,CAAC,WAAW,EAAE,QAAQ,CAAC,CAAC;gBAC5C,OAAO,CAAC,SAAS,GAAG,YAAY,QAAQ,EAAE,CAAC;YAC5C,CAAC;YACD,OAAO,CAAC,WAAW,GAAG,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC;YAC7C,MAAM,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;YAE5B,qCAAqC;YACrC,KAAK,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;QAC3B,CAAC,CAAC,CAAC;IACJ,CAAC;IAEO,cAAc,CAAC,SAAsB,EAAE,QAAkB;QAChE,wEAAwE;QACxE,SAAS,CAAC,gBAAgB,CAAC,wBAAwB,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE;YACrE,MAAM,KAAK,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;YAC3C,MAAM,IAAI,GAAG,MAAM,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAC9C,IAAI,CAAC,IAAI;gBAAE,OAAO;YAElB,MAAM,SAAS,GAAG,QAAQ,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;YAChD,SAAS,CAAC,WAAW,GAAG,IAAI,CAAC;YAC7B,MAAM,CAAC,WAAW,CAAC,SAAS,CAAC,CAAC;QAC/B,CAAC,CAAC,CAAC;IACJ,CAAC;IAEO,kBAAkB,CAAC,SAAsB,EAAE,QAAkB;QACpE,wEAAwE;QACxE,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE;YAC1D,+BAA+B;YAC/B,MAAM,
MAAM,GAAG,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;YAChC,IAAI,CAAC,MAAM,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,MAAM,CAAC;gBAAE,OAAO;YAEnD,0EAA0E;YAC1E,IAAI,GAAG,GAAG,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;YACxC,MAAM,GAAG,GAAG,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,IAAI,OAAO,CAAC;YAE5E,wBAAwB;YACxB,IAAI,GAAG,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAC5B,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,WAAW,EAAE,aAAa,CAAC,CAAC;YAC/C,CAAC;iBAAM,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC9B,GAAG,GAAG,GAAG,GAAG,aAAa,CAAC;YAC3B,CAAC;iBAAM,CAAC;gBACP,GAAG,GAAG,GAAG,GAAG,aAAa,CAAC;YAC3B,CAAC;YAED,MAAM,QAAQ,GAAG,QAAQ,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;YAC/C,QAAQ,CAAC,YAAY,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;YAClC,QAAQ,CAAC,YAAY,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;YAElC,kCAAkC;YAClC,MAAM,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC;QAC9B,CAAC,CAAC,CAAC;IACJ,CAAC;IAEO,mBAAmB,CAAC,SAAsB;QACjD,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE;YAC1D,MAAM,GAAG,GAAG,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC;YACpC,IAAI,CAAC,GAAG;gBAAE,OAAO;YAEjB,IAAI,GAAG,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAC5B,GAAG,CAAC,YAAY,CAAC,KAAK,EAAE,GAAG,CAAC,OAAO,CAAC,WAAW,EAAE,aAAa,CAAC,CAAC,CAAC;YAClE,CAAC;iBAAM,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC9B,GAAG,CAAC,YAAY,CAAC,KAAK,EAAE,GAAG,GAAG,aAAa,CAAC,CAAC;YAC9C,CAAC;iBAAM,CAAC;gBACP,GAAG,CAAC,YAAY,CAAC,KAAK,EAAE,GAAG,GAAG,aAAa,CAAC,CAAC;YAC9C,CAAC;QACF,CAAC,CAAC,CAAC;IACJ,CAAC;IAEO,sBAAsB,CAAC,SAAsB,EAAE,QAAkB;QACxE,0FAA0F;QAC1F,MAAM,SAAS,GAAG,CAAC,CAAC;QACpB,MAAM,YAAY,GAAG,CAAC,CAAC;QAEvB,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,gBAAgB,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE;YACpE,MAAM,CAAC,GAAG,QAAQ,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC;YAEtC,mEAAmE;YACnE,MAAM,WAAW,GAAG,CAAC,IAAU,EAAQ,EAAE;gBACxC,IAAI,IAAI,CAAC,QAAQ,KAAK,SAAS,EAAE,CAAC;oBACjC,CAAC,CAAC,WAAW,CAAC,QAAQ,CAAC,cAAc,CAAC,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,CAAC;gBAChE,CAAC;qBAAM,IAAI,IAAI,CAAC,QAAQ,KAAK,YAAY,EAAE,CAAC;oBAC3C,MAAM,EAAE,GAAG,IAAe,CAAC;oBAC3B,MAAM,GAAG,GAAG,EAAE,CAAC,OAAO,CAAC,WAAW,EAAE,CA
AC;oBAErC,IAAI,GAAG,KAAK,QAAQ,EAAE,CAAC;wBACtB,MAAM,MAAM,GAAG,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;wBAChD,MAAM,CAAC,WAAW,GAAG,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC;wBAC1C,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;oBACvB,CAAC;yBAAM,IAAI,GAAG,KAAK,GAAG,EAAE,CAAC;wBACxB,MAAM,IAAI,GAAG,QAAQ,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC;wBACzC,IAAI,CAAC,YAAY,CAAC,MAAM,EAAE,EAAE,CAAC,YAAY,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC;wBACzD,IAAI,CAAC,WAAW,GAAG,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC;wBACxC,CAAC,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;oBACrB,CAAC;yBAAM,IAAI,GAAG,KAAK,MAAM,EAAE,CAAC;wBAC3B,MAAM,IAAI,GAAG,QAAQ,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;wBAC5C,IAAI,CAAC,WAAW,GAAG,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC;wBACxC,CAAC,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;oBACrB,CAAC;yBAAM,CAAC;wBACP,kDAAkD;wBAClD,EAAE,CAAC,UAAU,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,CAAC;oBACpD,CAAC;gBACF,CAAC;YACF,CAAC,CAAC;YAEF,GAAG,CAAC,UAAU,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,CAAC;YACpD,GAAG,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC;QACpB,CAAC,CAAC,CAAC;IACJ,CAAC;IAEO,gBAAgB,CAAC,SAAsB,EAAE,QAAkB;QAClE,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE;YAC/D,MAAM,MAAM,GAAG,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;YAChD,MAAM,CAAC,WAAW,GAAG,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC;YAC5C,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;QAC1B,CAAC,CAAC,CAAC;IACJ,CAAC;IAEO,qBAAqB,CAAC,SAAsB;QACnD,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,gBAAgB,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE;YACnE,EAAE,CAAC,eAAe,CAAC,iBAAiB,CAAC,CAAC;QACvC,CAAC,CAAC,CAAC;IACJ,CAAC;IAEO,iBAAiB;QACxB,MAAM,IAAI,GAAG,IAAI,CAAC,gBAAgB,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAC9D,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IAC9D,CAAC;CACD;AA/QD,8CA+QC"}
@@ -0,0 +1,20 @@
1
+ import { BaseExtractor } from './_base';
2
+ import { ExtractorResult } from '../types/extractors';
3
/**
 * Extractor for X (Twitter) posts and articles that obtains its content
 * asynchronously from remote endpoints (FxTwitter first, then oEmbed)
 * instead of reading the page DOM.
 */
export declare class XOembedExtractor extends BaseExtractor {
    /** Always false: this extractor has no synchronous path. */
    public canExtract(): boolean;
    /** Synchronous stub; returns empty content. */
    public extract(): ExtractorResult;
    /** True when the URL contains `/status/<digits>` or `/article/<digits>`. */
    public canExtractAsync(): boolean;
    /** Tries FxTwitter first and falls back to oEmbed. */
    public extractAsync(): Promise<ExtractorResult>;

    // Remote fetching and result assembly.
    private extractOembed;
    private tryExtractFxTwitter;
    private fetchFxTwitter;
    private buildArticleResult;
    private buildTweetResult;

    // HTML rendering helpers.
    private renderTweet;
    private applyMarkers;
    private applyFacets;
    private renderArticle;
    private renderBlock;
    private renderAtomicBlock;
    private renderInlineContent;
}
@@ -0,0 +1,350 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.XOembedExtractor = void 0;
4
+ const _base_1 = require("./_base");
5
+ const dom_1 = require("../utils/dom");
6
+ class XOembedExtractor extends _base_1.BaseExtractor {
7
+ canExtract() {
8
+ return false;
9
+ }
10
+ extract() {
11
+ return {
12
+ content: '',
13
+ contentHtml: '',
14
+ };
15
+ }
16
+ canExtractAsync() {
17
+ return /\/(status|article)\/\d+/.test(this.url);
18
+ }
19
+ async extractAsync() {
20
+ // Try FxTwitter first — it has full tweet text and media
21
+ const fxResult = await this.tryExtractFxTwitter();
22
+ if (fxResult) {
23
+ return fxResult;
24
+ }
25
+ // Fall back to oEmbed (truncates long tweets but always available)
26
+ return this.extractOembed();
27
+ }
28
+ async extractOembed() {
29
+ const oembedUrl = `https://publish.twitter.com/oembed?url=${encodeURIComponent(this.url)}&omit_script=true`;
30
+ const response = await fetch(oembedUrl);
31
+ if (!response.ok) {
32
+ throw new Error(`oEmbed request failed: ${response.status}`);
33
+ }
34
+ const data = await response.json();
35
+ // Parse the oEmbed HTML to extract tweet text
36
+ const div = this.document.createElement('div');
37
+ div.appendChild((0, dom_1.parseHTML)(this.document, data.html));
38
+ // The oEmbed HTML contains a <blockquote> with <p> tags for text
39
+ // and an <a> tag for the date
40
+ const blockquote = div.querySelector('blockquote');
41
+ const paragraphs = blockquote?.querySelectorAll('p') || [];
42
+ const tweetText = Array.from(paragraphs)
43
+ .map(p => `<p>${(0, dom_1.serializeHTML)(p)}</p>`)
44
+ .join('\n');
45
+ const handle = data.author_url
46
+ ? `@${data.author_url.split('/').pop()}`
47
+ : '';
48
+ const dateLink = blockquote?.querySelector('a:last-child');
49
+ const dateText = dateLink?.textContent?.trim() || '';
50
+ const permalink = dateLink?.getAttribute('href') || this.url;
51
+ const escapedAuthorName = (0, dom_1.escapeHtml)(data.author_name);
52
+ const escapedHandle = (0, dom_1.escapeHtml)(handle);
53
+ const escapedDateText = (0, dom_1.escapeHtml)(dateText);
54
+ const escapedPermalink = (0, dom_1.escapeHtml)(permalink);
55
+ const contentHtml = `
56
+ <div class="tweet-thread">
57
+ <div class="main-tweet">
58
+ <div class="tweet">
59
+ <div class="tweet-header">
60
+ <span class="tweet-author"><strong>${escapedAuthorName}</strong> <span class="tweet-handle">${escapedHandle}</span></span>
61
+ ${dateText ? `<a href="${escapedPermalink}" class="tweet-date">${escapedDateText}</a>` : ''}
62
+ </div>
63
+ ${tweetText ? `<div class="tweet-text">${tweetText}</div>` : ''}
64
+ </div>
65
+ </div>
66
+ </div>
67
+ `.trim();
68
+ return {
69
+ content: contentHtml,
70
+ contentHtml: contentHtml,
71
+ variables: {
72
+ title: `Post by ${handle || data.author_name}`,
73
+ author: handle || data.author_name,
74
+ site: 'X (Twitter)',
75
+ }
76
+ };
77
+ }
78
+ async tryExtractFxTwitter() {
79
+ const match = this.url.match(/\/([a-zA-Z][a-zA-Z0-9_]{0,14})\/(status|article)\/(\d+)/);
80
+ if (!match)
81
+ return null;
82
+ try {
83
+ const data = await this.fetchFxTwitter(match[1], match[3]);
84
+ // If it's an article, use the rich article renderer
85
+ if (data.tweet?.article) {
86
+ return this.buildArticleResult(data);
87
+ }
88
+ // Otherwise use the full tweet text from FxTwitter
89
+ if (data.tweet?.text) {
90
+ return this.buildTweetResult(data);
91
+ }
92
+ return null;
93
+ }
94
+ catch {
95
+ return null;
96
+ }
97
+ }
98
+ async fetchFxTwitter(username, id) {
99
+ const apiUrl = `https://api.fxtwitter.com/${username}/status/${id}`;
100
+ const response = await fetch(apiUrl, {
101
+ headers: {
102
+ 'User-Agent': 'Mozilla/5.0 (compatible; Defuddle/1.0; +https://defuddle.md)',
103
+ },
104
+ });
105
+ if (!response.ok) {
106
+ throw new Error(`FxTwitter API request failed: ${response.status}`);
107
+ }
108
+ return response.json();
109
+ }
110
+ buildArticleResult(data) {
111
+ const article = data.tweet.article;
112
+ const { blocks, entityMap } = article.content;
113
+ const contentHtml = this.renderArticle(blocks, entityMap, article.cover_media);
114
+ const handle = `@${data.tweet.author.screen_name}`;
115
+ return {
116
+ content: contentHtml,
117
+ contentHtml,
118
+ variables: {
119
+ title: article.title,
120
+ author: handle,
121
+ site: 'X (Twitter)',
122
+ description: article.preview_text,
123
+ }
124
+ };
125
+ }
126
+ buildTweetResult(data) {
127
+ const tweet = data.tweet;
128
+ const handle = `@${tweet.author.screen_name}`;
129
+ const contentHtml = this.renderTweet(tweet);
130
+ return {
131
+ content: contentHtml,
132
+ contentHtml,
133
+ variables: {
134
+ title: `Post by ${handle}`,
135
+ author: handle,
136
+ site: 'X (Twitter)',
137
+ }
138
+ };
139
+ }
140
+ renderTweet(tweet) {
141
+ const text = tweet.raw_text?.text || tweet.text;
142
+ // Filter out media facets — FxTwitter already strips pic.twitter.com
143
+ // links from the text, so media facet indices are stale
144
+ const facets = (tweet.raw_text?.facets || []).filter(f => f.type !== 'media');
145
+ // Split text into paragraphs on double newlines
146
+ const paragraphs = text.split(/\n\n+/);
147
+ let offset = 0;
148
+ const htmlParts = [];
149
+ for (const para of paragraphs) {
150
+ const paraStart = text.indexOf(para, offset);
151
+ const paraEnd = paraStart + para.length;
152
+ offset = paraEnd;
153
+ // Check if this paragraph is a blockquote (starts with >)
154
+ const isBlockquote = para.trimStart().startsWith('>');
155
+ let paraText = isBlockquote ? para.trimStart().slice(1).trimStart() : para;
156
+ const paraTextStart = isBlockquote
157
+ ? paraStart + (para.length - para.trimStart().length) + 1 + (para.trimStart().slice(1).length - para.trimStart().slice(1).trimStart().length)
158
+ : paraStart;
159
+ // Apply facets within this paragraph
160
+ const rendered = this.applyFacets(paraText, paraTextStart, paraEnd, facets);
161
+ // Handle line breaks within paragraph
162
+ const withBreaks = rendered.replace(/\n/g, '<br>');
163
+ if (isBlockquote) {
164
+ htmlParts.push(`<blockquote><p>${withBreaks}</p></blockquote>`);
165
+ }
166
+ else if (withBreaks.trim()) {
167
+ htmlParts.push(`<p>${withBreaks}</p>`);
168
+ }
169
+ }
170
+ // Append media images
171
+ if (tweet.media?.photos) {
172
+ for (const photo of tweet.media.photos) {
173
+ htmlParts.push(`<img src="${(0, dom_1.escapeHtml)(photo.url)}" alt="">`);
174
+ }
175
+ }
176
+ const handle = (0, dom_1.escapeHtml)(`@${tweet.author.screen_name}`);
177
+ const authorName = (0, dom_1.escapeHtml)(tweet.author.name);
178
+ return `<div class="tweet-thread"><div class="main-tweet"><div class="tweet">` +
179
+ `<div class="tweet-header"><span class="tweet-author"><strong>${authorName}</strong> <span class="tweet-handle">${handle}</span></span></div>` +
180
+ `<div class="tweet-text">${htmlParts.join('\n')}</div>` +
181
+ `</div></div></div>`;
182
+ }
183
+ applyMarkers(text, markers) {
184
+ if (markers.length === 0) {
185
+ return (0, dom_1.escapeHtml)(text);
186
+ }
187
+ markers.sort((a, b) => {
188
+ if (a.offset !== b.offset)
189
+ return a.offset - b.offset;
190
+ if (a.type === 'close' && b.type === 'open')
191
+ return -1;
192
+ if (a.type === 'open' && b.type === 'close')
193
+ return 1;
194
+ return 0;
195
+ });
196
+ let result = '';
197
+ let pos = 0;
198
+ for (const marker of markers) {
199
+ if (marker.offset > pos) {
200
+ result += (0, dom_1.escapeHtml)(text.slice(pos, marker.offset));
201
+ }
202
+ result += marker.tag;
203
+ pos = marker.offset;
204
+ }
205
+ if (pos < text.length) {
206
+ result += (0, dom_1.escapeHtml)(text.slice(pos));
207
+ }
208
+ return result;
209
+ }
210
+ applyFacets(text, textStart, textEnd, facets) {
211
+ const markers = [];
212
+ for (const facet of facets) {
213
+ const [fStart, fEnd] = facet.indices;
214
+ if (fEnd <= textStart || fStart >= textEnd)
215
+ continue;
216
+ const relStart = Math.max(0, fStart - textStart);
217
+ const relEnd = Math.min(text.length, fEnd - textStart);
218
+ if (facet.type === 'italic') {
219
+ markers.push({ offset: relStart, type: 'open', tag: '<em>' });
220
+ markers.push({ offset: relEnd, type: 'close', tag: '</em>' });
221
+ }
222
+ else if (facet.type === 'mention' && facet.text) {
223
+ const url = `https://x.com/${(0, dom_1.escapeHtml)(facet.text)}`;
224
+ markers.push({ offset: relStart, type: 'open', tag: `<a href="${url}">` });
225
+ markers.push({ offset: relEnd, type: 'close', tag: '</a>' });
226
+ }
227
+ else if (facet.type === 'url' && facet.original) {
228
+ const url = (0, dom_1.escapeHtml)(facet.original);
229
+ markers.push({ offset: relStart, type: 'open', tag: `<a href="${url}">` });
230
+ markers.push({ offset: relEnd, type: 'close', tag: '</a>' });
231
+ }
232
+ }
233
+ return this.applyMarkers(text, markers);
234
+ }
235
+ renderArticle(blocks, entityMap, coverMedia) {
236
+ const parts = [];
237
+ // Add cover image if available
238
+ if (coverMedia?.media_info?.original_img_url) {
239
+ parts.push(`<img src="${(0, dom_1.escapeHtml)(coverMedia.media_info.original_img_url)}" alt="Cover image">`);
240
+ }
241
+ let i = 0;
242
+ while (i < blocks.length) {
243
+ const block = blocks[i];
244
+ if (block.type === 'unordered-list-item') {
245
+ // Group consecutive list items into a <ul>
246
+ const items = [];
247
+ while (i < blocks.length && blocks[i].type === 'unordered-list-item') {
248
+ items.push(`<li>${this.renderInlineContent(blocks[i], entityMap)}</li>`);
249
+ i++;
250
+ }
251
+ parts.push(`<ul>${items.join('')}</ul>`);
252
+ continue;
253
+ }
254
+ const html = this.renderBlock(block, entityMap);
255
+ if (html) {
256
+ parts.push(html);
257
+ }
258
+ i++;
259
+ }
260
+ return `<article class="x-article">${parts.join('')}</article>`;
261
+ }
262
+ renderBlock(block, entityMap) {
263
+ switch (block.type) {
264
+ case 'unstyled': {
265
+ if (!block.text.trim())
266
+ return '';
267
+ return `<p>${this.renderInlineContent(block, entityMap)}</p>`;
268
+ }
269
+ case 'header-two':
270
+ return `<h2>${this.renderInlineContent(block, entityMap)}</h2>`;
271
+ case 'header-three':
272
+ return `<h3>${this.renderInlineContent(block, entityMap)}</h3>`;
273
+ case 'atomic':
274
+ return this.renderAtomicBlock(block, entityMap);
275
+ default: {
276
+ if (!block.text.trim())
277
+ return '';
278
+ return `<p>${this.renderInlineContent(block, entityMap)}</p>`;
279
+ }
280
+ }
281
+ }
282
+ renderAtomicBlock(block, entityMap) {
283
+ if (block.entityRanges.length === 0)
284
+ return '';
285
+ const entityEntry = entityMap.find(e => e.key === String(block.entityRanges[0].key));
286
+ if (!entityEntry)
287
+ return '';
288
+ const entity = entityEntry.value;
289
+ switch (entity.type) {
290
+ case 'MEDIA': {
291
+ const caption = entity.data.caption;
292
+ if (caption) {
293
+ return `<figure><figcaption>${(0, dom_1.escapeHtml)(caption)}</figcaption></figure>`;
294
+ }
295
+ return '';
296
+ }
297
+ case 'MARKDOWN': {
298
+ const markdown = entity.data.markdown || '';
299
+ // Strip the wrapping ```...``` fences
300
+ const codeMatch = markdown.match(/^```(\w*)\n([\s\S]*?)\n?```$/);
301
+ if (codeMatch) {
302
+ const lang = codeMatch[1];
303
+ const code = codeMatch[2];
304
+ const langAttr = lang ? ` class="language-${(0, dom_1.escapeHtml)(lang)}" data-lang="${(0, dom_1.escapeHtml)(lang)}"` : '';
305
+ return `<pre><code${langAttr}>${(0, dom_1.escapeHtml)(code)}</code></pre>`;
306
+ }
307
+ return `<pre><code>${(0, dom_1.escapeHtml)(markdown)}</code></pre>`;
308
+ }
309
+ default:
310
+ return '';
311
+ }
312
+ }
313
+ renderInlineContent(block, entityMap) {
314
+ const text = block.text;
315
+ if (!text)
316
+ return '';
317
+ const markers = [];
318
+ for (const range of block.inlineStyleRanges) {
319
+ if (range.style === 'Bold') {
320
+ markers.push({ offset: range.offset, type: 'open', tag: '<strong>' });
321
+ markers.push({ offset: range.offset + range.length, type: 'close', tag: '</strong>' });
322
+ }
323
+ }
324
+ for (const range of block.entityRanges) {
325
+ const entityEntry = entityMap.find(e => e.key === String(range.key));
326
+ if (entityEntry?.value.type === 'LINK' && entityEntry.value.data.url) {
327
+ const url = (0, dom_1.escapeHtml)(entityEntry.value.data.url);
328
+ markers.push({ offset: range.offset, type: 'open', tag: `<a href="${url}">` });
329
+ markers.push({ offset: range.offset + range.length, type: 'close', tag: '</a>' });
330
+ }
331
+ }
332
+ if (block.data?.mentions) {
333
+ for (const mention of block.data.mentions) {
334
+ const url = `https://x.com/${(0, dom_1.escapeHtml)(mention.text)}`;
335
+ markers.push({ offset: mention.fromIndex, type: 'open', tag: `<a href="${url}">` });
336
+ markers.push({ offset: mention.toIndex, type: 'close', tag: '</a>' });
337
+ }
338
+ }
339
+ if (block.data?.urls) {
340
+ for (const urlData of block.data.urls) {
341
+ const url = (0, dom_1.escapeHtml)(urlData.text);
342
+ markers.push({ offset: urlData.fromIndex, type: 'open', tag: `<a href="${url}">` });
343
+ markers.push({ offset: urlData.toIndex, type: 'close', tag: '</a>' });
344
+ }
345
+ }
346
+ return this.applyMarkers(text, markers);
347
+ }
348
+ }
349
+ exports.XOembedExtractor = XOembedExtractor;
350
+ //# sourceMappingURL=x-oembed.js.map