@mz1999/defuddle 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +371 -0
  3. package/dist/cli.d.ts +2 -0
  4. package/dist/cli.js +145 -0
  5. package/dist/cli.js.map +1 -0
  6. package/dist/constants.d.ts +24 -0
  7. package/dist/constants.js +950 -0
  8. package/dist/constants.js.map +1 -0
  9. package/dist/defuddle.d.ts +136 -0
  10. package/dist/defuddle.js +1816 -0
  11. package/dist/defuddle.js.map +1 -0
  12. package/dist/elements/callouts.d.ts +6 -0
  13. package/dist/elements/callouts.js +74 -0
  14. package/dist/elements/callouts.js.map +1 -0
  15. package/dist/elements/code.d.ts +5 -0
  16. package/dist/elements/code.js +346 -0
  17. package/dist/elements/code.js.map +1 -0
  18. package/dist/elements/footnotes.d.ts +5 -0
  19. package/dist/elements/footnotes.js +619 -0
  20. package/dist/elements/footnotes.js.map +1 -0
  21. package/dist/elements/headings.d.ts +11 -0
  22. package/dist/elements/headings.js +100 -0
  23. package/dist/elements/headings.js.map +1 -0
  24. package/dist/elements/images.d.ts +8 -0
  25. package/dist/elements/images.js +877 -0
  26. package/dist/elements/images.js.map +1 -0
  27. package/dist/elements/math.base.d.ts +9 -0
  28. package/dist/elements/math.base.js +195 -0
  29. package/dist/elements/math.base.js.map +1 -0
  30. package/dist/elements/math.core.d.ts +7 -0
  31. package/dist/elements/math.core.js +52 -0
  32. package/dist/elements/math.core.js.map +1 -0
  33. package/dist/elements/math.d.ts +2 -0
  34. package/dist/elements/math.full.d.ts +8 -0
  35. package/dist/elements/math.js +7 -0
  36. package/dist/elements/math.js.map +1 -0
  37. package/dist/extractor-registry.d.ts +16 -0
  38. package/dist/extractor-registry.js +140 -0
  39. package/dist/extractor-registry.js.map +1 -0
  40. package/dist/extractors/_base.d.ts +22 -0
  41. package/dist/extractors/_base.js +27 -0
  42. package/dist/extractors/_base.js.map +1 -0
  43. package/dist/extractors/_conversation.d.ts +9 -0
  44. package/dist/extractors/_conversation.js +78 -0
  45. package/dist/extractors/_conversation.js.map +1 -0
  46. package/dist/extractors/chatgpt.d.ts +14 -0
  47. package/dist/extractors/chatgpt.js +138 -0
  48. package/dist/extractors/chatgpt.js.map +1 -0
  49. package/dist/extractors/claude.d.ts +10 -0
  50. package/dist/extractors/claude.js +91 -0
  51. package/dist/extractors/claude.js.map +1 -0
  52. package/dist/extractors/gemini.d.ts +14 -0
  53. package/dist/extractors/gemini.js +111 -0
  54. package/dist/extractors/gemini.js.map +1 -0
  55. package/dist/extractors/github.d.ts +20 -0
  56. package/dist/extractors/github.js +251 -0
  57. package/dist/extractors/github.js.map +1 -0
  58. package/dist/extractors/grok.d.ts +15 -0
  59. package/dist/extractors/grok.js +142 -0
  60. package/dist/extractors/grok.js.map +1 -0
  61. package/dist/extractors/hackernews.d.ts +21 -0
  62. package/dist/extractors/hackernews.js +155 -0
  63. package/dist/extractors/hackernews.js.map +1 -0
  64. package/dist/extractors/reddit.d.ts +22 -0
  65. package/dist/extractors/reddit.js +197 -0
  66. package/dist/extractors/reddit.js.map +1 -0
  67. package/dist/extractors/twitter.d.ts +16 -0
  68. package/dist/extractors/twitter.js +204 -0
  69. package/dist/extractors/twitter.js.map +1 -0
  70. package/dist/extractors/x-article.d.ts +24 -0
  71. package/dist/extractors/x-article.js +267 -0
  72. package/dist/extractors/x-article.js.map +1 -0
  73. package/dist/extractors/x-oembed.d.ts +20 -0
  74. package/dist/extractors/x-oembed.js +350 -0
  75. package/dist/extractors/x-oembed.js.map +1 -0
  76. package/dist/extractors/youtube.d.ts +87 -0
  77. package/dist/extractors/youtube.js +869 -0
  78. package/dist/extractors/youtube.js.map +1 -0
  79. package/dist/fetch.d.ts +18 -0
  80. package/dist/fetch.js +265 -0
  81. package/dist/fetch.js.map +1 -0
  82. package/dist/index.d.ts +3 -0
  83. package/dist/index.full.d.ts +12 -0
  84. package/dist/index.full.js +1 -0
  85. package/dist/index.js +1 -0
  86. package/dist/index.js.map +1 -0
  87. package/dist/markdown.d.ts +30 -0
  88. package/dist/markdown.js +661 -0
  89. package/dist/markdown.js.map +1 -0
  90. package/dist/metadata.d.ts +25 -0
  91. package/dist/metadata.js +426 -0
  92. package/dist/metadata.js.map +1 -0
  93. package/dist/node.d.ts +19 -0
  94. package/dist/node.js +78 -0
  95. package/dist/node.js.map +1 -0
  96. package/dist/scoring.d.ts +31 -0
  97. package/dist/scoring.js +472 -0
  98. package/dist/scoring.js.map +1 -0
  99. package/dist/standardize.d.ts +2 -0
  100. package/dist/standardize.js +1101 -0
  101. package/dist/standardize.js.map +1 -0
  102. package/dist/types/extractors.d.ts +41 -0
  103. package/dist/types/extractors.js +3 -0
  104. package/dist/types/extractors.js.map +1 -0
  105. package/dist/types.d.ts +135 -0
  106. package/dist/types.js +3 -0
  107. package/dist/types.js.map +1 -0
  108. package/dist/utils/comments.d.ts +44 -0
  109. package/dist/utils/comments.js +103 -0
  110. package/dist/utils/comments.js.map +1 -0
  111. package/dist/utils/dom.d.ts +42 -0
  112. package/dist/utils/dom.js +104 -0
  113. package/dist/utils/dom.js.map +1 -0
  114. package/dist/utils/linkedom-compat.d.ts +5 -0
  115. package/dist/utils/linkedom-compat.js +23 -0
  116. package/dist/utils/linkedom-compat.js.map +1 -0
  117. package/dist/utils/transcript.d.ts +37 -0
  118. package/dist/utils/transcript.js +61 -0
  119. package/dist/utils/transcript.js.map +1 -0
  120. package/dist/utils.d.ts +13 -0
  121. package/dist/utils.js +98 -0
  122. package/dist/utils.js.map +1 -0
  123. package/package.json +107 -0
@@ -0,0 +1,197 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.RedditExtractor = void 0;
4
+ const _base_1 = require("./_base");
5
+ const dom_1 = require("../utils/dom");
6
+ const comments_1 = require("../utils/comments");
7
/**
 * Extractor for Reddit post pages.
 *
 * Two page flavours are recognised:
 *  - "new" Reddit, identified by a `shreddit-post` element, whose comments are
 *    `shreddit-comment` elements;
 *  - old Reddit, identified by a `.thing.link` element, whose comments live
 *    under `.commentarea .sitetable`.
 *
 * When a new-Reddit comments page contains no `shreddit-comment` elements,
 * `extract()` returns empty content and `extractAsync()` fetches the same
 * URL from old.reddit.com and parses that instead.
 */
class RedditExtractor extends _base_1.BaseExtractor {
    /**
     * @param {Document} document DOM of the page being extracted.
     * @param {string} url URL of the page; the post id and subreddit are parsed from it.
     */
    constructor(document, url) {
        super(document, url);
        this.shredditPost = document.querySelector('shreddit-post');
        this.isOldReddit = !!document.querySelector('.thing.link');
    }
    /**
     * @returns {boolean} True when the document contains either a new-Reddit
     * post element or an old-Reddit link element.
     */
    canExtract() {
        return !!this.shredditPost || this.isOldReddit;
    }
    /**
     * @returns {boolean} True for comment pages that are not already old Reddit.
     */
    canExtractAsync() {
        // For new reddit comment pages, extract() returns empty content
        // when shreddit-comment elements are missing (server-side fetch),
        // causing parseAsync() to fall through to this async path.
        return this.isCommentsPage() && !this.isOldReddit;
    }
    /**
     * @returns {boolean} True when the URL looks like `/r/<sub>/comments/...`.
     */
    isCommentsPage() {
        return /\/r\/.+\/comments\//.test(this.url);
    }
    /**
     * Fetches the old.reddit.com version of the current URL and extracts from it.
     *
     * @returns {Promise<object>} Extraction result built by `extractOldReddit`.
     * @throws {Error} When the HTTP response is not OK, or when no DOMParser is
     * available on the document's window or as a global.
     */
    async extractAsync() {
        // Convert URL to old.reddit.com
        const oldUrl = new URL(this.url);
        oldUrl.hostname = 'old.reddit.com';
        const response = await fetch(oldUrl.toString(), {
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible; Defuddle/1.0)',
            },
        });
        if (!response.ok) {
            throw new Error(`Failed to fetch old.reddit.com: ${response.status}`);
        }
        const html = await response.text();
        // Prefer the DOMParser that belongs to the document's own window and
        // fall back to a global one when the document has no defaultView.
        const Parser = this.document.defaultView?.DOMParser ?? (typeof DOMParser !== 'undefined' ? DOMParser : null);
        if (!Parser) {
            throw new Error('DOMParser is not available in this environment');
        }
        const doc = new Parser().parseFromString(html, 'text/html');
        return this.extractOldReddit(doc);
    }
    /**
     * Synchronous extraction from the current document.
     *
     * @returns {object} Extraction result. For a new-Reddit comments page with
     * no `shreddit-comment` elements the result has empty `content` and
     * `contentHtml` so the caller can fall back to `extractAsync()`.
     */
    extract() {
        if (this.isOldReddit) {
            return this.extractOldReddit(this.document);
        }
        // New reddit server-side HTML includes shreddit-post but not
        // shreddit-comment elements (those require JS). Return empty
        // so parseAsync() falls through to extractAsync() which fetches
        // old.reddit.com with full content.
        const hasComments = this.document.querySelectorAll('shreddit-comment').length > 0;
        if (this.isCommentsPage() && !hasComments) {
            return { content: '', contentHtml: '' };
        }
        const postContent = this.getPostContent();
        const comments = this.options.includeReplies !== false ? this.extractComments() : '';
        return this.buildResult({
            contentHtml: this.createContentHtml(postContent, comments),
            postTitle: this.document.querySelector('h1')?.textContent?.trim() || '',
            subreddit: this.getSubreddit(),
            postAuthor: this.getPostAuthor(),
            description: this.createDescription(postContent),
        });
    }
    /**
     * Extracts post and comments from old-Reddit markup.
     *
     * @param {Document|Element} root Node to search; either the current
     * document or a document parsed from old.reddit.com.
     * @returns {object} Extraction result.
     */
    extractOldReddit(root) {
        const thingLink = root.querySelector('.thing.link');
        const postTitle = thingLink?.querySelector('a.title')?.textContent?.trim() || '';
        const postAuthor = thingLink?.getAttribute('data-author') || '';
        const subreddit = thingLink?.getAttribute('data-subreddit') || '';
        const postBodyEl = thingLink?.querySelector('.usertext-body .md');
        const postBody = postBodyEl ? (0, dom_1.serializeHTML)(postBodyEl) : '';
        let comments = '';
        if (this.options.includeReplies !== false) {
            const commentArea = root.querySelector('.commentarea .sitetable');
            const commentData = commentArea ? this.collectOldRedditComments(commentArea) : [];
            comments = commentData.length > 0 ? (0, comments_1.buildCommentTree)(commentData) : '';
        }
        return this.buildResult({
            contentHtml: this.createContentHtml(postBody, comments),
            postTitle,
            subreddit,
            postAuthor,
            description: this.createDescription(postBody),
        });
    }
    /**
     * Assembles the result object shared by both extraction paths.
     *
     * @param {{contentHtml: string, postTitle: string, subreddit: string, postAuthor: string, description: string}} parts
     * @returns {object} Result with `content`, `contentHtml`, `extractedContent` and `variables`.
     */
    buildResult({ contentHtml, postTitle, subreddit, postAuthor, description }) {
        return {
            content: contentHtml,
            contentHtml: contentHtml,
            extractedContent: {
                postId: this.getPostId(),
                subreddit,
                postAuthor,
            },
            variables: {
                title: postTitle,
                author: postAuthor,
                site: `r/${subreddit}`,
                description,
            }
        };
    }
    /**
     * @returns {string} Serialized text body of the new-Reddit post followed by
     * the outer HTML of its `#post-image` element; either part may be empty.
     */
    getPostContent() {
        const textBodyEl = this.shredditPost?.querySelector('[slot="text-body"]');
        const textBody = textBodyEl ? (0, dom_1.serializeHTML)(textBodyEl) : '';
        const mediaBody = this.shredditPost?.querySelector('#post-image')?.outerHTML || '';
        return textBody + mediaBody;
    }
    /**
     * @param {string} postContent HTML of the post body.
     * @param {string} comments HTML of the comment tree.
     * @returns {string} Combined HTML produced by `buildContentHtml` for the `reddit` site key.
     */
    createContentHtml(postContent, comments) {
        return (0, comments_1.buildContentHtml)('reddit', postContent, comments);
    }
    /**
     * @returns {string} Comment tree HTML for every `shreddit-comment` in the document.
     */
    extractComments() {
        const comments = Array.from(this.document.querySelectorAll('shreddit-comment'));
        return this.processComments(comments);
    }
    /**
     * @returns {string} Alphanumeric id following `comments/` in the URL, or ''.
     */
    getPostId() {
        const match = this.url.match(/comments\/([a-zA-Z0-9]+)/);
        return match?.[1] || '';
    }
    /**
     * @returns {string} Path segment following `/r/` in the URL, or ''.
     */
    getSubreddit() {
        const match = this.url.match(/\/r\/([^/]+)/);
        return match?.[1] || '';
    }
    /**
     * @returns {string} `author` attribute of the `shreddit-post` element, or ''.
     */
    getPostAuthor() {
        return this.shredditPost?.getAttribute('author') || '';
    }
    /**
     * Derives a short plain-text description from post HTML.
     *
     * @param {string} postContent HTML of the post body.
     * @returns {string} Up to 140 characters of trimmed text with whitespace
     * runs collapsed to single spaces; '' when there is no content.
     */
    createDescription(postContent) {
        if (!postContent)
            return '';
        const tempDiv = this.document.createElement('div');
        tempDiv.appendChild((0, dom_1.parseHTML)(this.document, postContent));
        return tempDiv.textContent?.trim()
            .slice(0, 140)
            .replace(/\s+/g, ' ') || '';
    }
    /**
     * Converts a timestamp string to a `YYYY-MM-DD` date.
     *
     * `Date.prototype.toISOString` throws a RangeError for an invalid date, so
     * an unparseable attribute value must be detected first; otherwise a single
     * malformed comment would abort the whole extraction.
     *
     * @param {string} value Timestamp text taken from the page.
     * @returns {string} ISO calendar date, or '' when `value` is empty or unparseable.
     */
    toIsoDate(value) {
        if (!value)
            return '';
        const parsed = new Date(value);
        return Number.isNaN(parsed.getTime()) ? '' : parsed.toISOString().split('T')[0];
    }
    /**
     * Recursively flattens old-Reddit comments into a depth-annotated list.
     *
     * @param {Element} container Element whose direct `.thing.comment` children are read.
     * @param {number} [depth=0] Nesting depth assigned to those children.
     * @returns {Array<object>} Comments in document order, each followed by its replies.
     */
    collectOldRedditComments(container, depth = 0) {
        const result = [];
        const comments = Array.from(container.querySelectorAll(':scope > .thing.comment'));
        for (const comment of comments) {
            const author = comment.getAttribute('data-author') || '';
            const permalink = comment.getAttribute('data-permalink') || '';
            const score = comment.querySelector('.entry .tagline .score.unvoted')?.textContent?.trim() || '';
            const timeEl = comment.querySelector('.entry .tagline time[datetime]');
            const date = this.toIsoDate(timeEl?.getAttribute('datetime') || '');
            const bodyEl = comment.querySelector('.entry .usertext-body .md');
            const body = bodyEl ? (0, dom_1.serializeHTML)(bodyEl) : '';
            result.push({
                author,
                date,
                content: body,
                depth,
                score: score || undefined,
                url: permalink ? `https://reddit.com${permalink}` : undefined,
            });
            // Replies are nested in the comment's own `.child > .sitetable`.
            const childContainer = comment.querySelector('.child > .sitetable');
            if (childContainer) {
                result.push(...this.collectOldRedditComments(childContainer, depth + 1));
            }
        }
        return result;
    }
    /**
     * Builds comment tree HTML from new-Reddit `shreddit-comment` elements.
     *
     * @param {Element[]} comments Comment elements in document order.
     * @returns {string} HTML produced by `buildCommentTree`.
     */
    processComments(comments) {
        const commentData = [];
        for (const comment of comments) {
            // Explicit radix, and fall back to depth 0 when the attribute is not numeric.
            const parsedDepth = Number.parseInt(comment.getAttribute('depth') || '0', 10);
            const depth = Number.isNaN(parsedDepth) ? 0 : parsedDepth;
            const author = comment.getAttribute('author') || '';
            const score = comment.getAttribute('score') || '0';
            const permalink = comment.getAttribute('permalink') || '';
            const commentEl = comment.querySelector('[slot="comment"]');
            const content = commentEl ? (0, dom_1.serializeHTML)(commentEl) : '';
            const timestamp = comment.getAttribute('created')
                || comment.querySelector('time')?.getAttribute('datetime')
                || '';
            const date = this.toIsoDate(timestamp);
            commentData.push({
                author,
                date,
                content,
                depth,
                score: `${score} points`,
                url: permalink ? `https://reddit.com${permalink}` : undefined,
            });
        }
        return (0, comments_1.buildCommentTree)(commentData);
    }
}
196
+ exports.RedditExtractor = RedditExtractor;
197
+ //# sourceMappingURL=reddit.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"reddit.js","sourceRoot":"","sources":["../../src/extractors/reddit.ts"],"names":[],"mappings":";;;AAAA,mCAAwC;AAExC,sCAAwD;AACxD,gDAAyF;AAEzF,MAAa,eAAgB,SAAQ,qBAAa;IAIjD,YAAY,QAAkB,EAAE,GAAW;QAC1C,KAAK,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;QACrB,IAAI,CAAC,YAAY,GAAG,QAAQ,CAAC,aAAa,CAAC,eAAe,CAAC,CAAC;QAC5D,IAAI,CAAC,WAAW,GAAG,CAAC,CAAC,QAAQ,CAAC,aAAa,CAAC,aAAa,CAAC,CAAC;IAC5D,CAAC;IAED,UAAU;QACT,OAAO,CAAC,CAAC,IAAI,CAAC,YAAY,IAAI,IAAI,CAAC,WAAW,CAAC;IAChD,CAAC;IAED,eAAe;QACd,gEAAgE;QAChE,kEAAkE;QAClE,2DAA2D;QAC3D,OAAO,IAAI,CAAC,cAAc,EAAE,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC;IACnD,CAAC;IAEO,cAAc;QACrB,OAAO,qBAAqB,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC7C,CAAC;IAED,KAAK,CAAC,YAAY;QACjB,gCAAgC;QAChC,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACjC,MAAM,CAAC,QAAQ,GAAG,gBAAgB,CAAC;QAEnC,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,MAAM,CAAC,QAAQ,EAAE,EAAE;YAC/C,OAAO,EAAE;gBACR,YAAY,EAAE,wCAAwC;aACtD;SACD,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YAClB,MAAM,IAAI,KAAK,CAAC,mCAAmC,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;QACvE,CAAC;QAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QACnC,MAAM,MAAM,GAAG,IAAI,CAAC,QAAQ,CAAC,WAAW,EAAE,SAAS,IAAI,CAAC,OAAO,SAAS,KAAK,WAAW,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QAC7G,IAAI,CAAC,MAAM,EAAE,CAAC;YACb,MAAM,IAAI,KAAK,CAAC,gDAAgD,CAAC,CAAC;QACnE,CAAC;QACD,MAAM,GAAG,GAAG,IAAI,MAAM,EAAE,CAAC,eAAe,CAAC,IAAI,EAAE,WAAW,CAAC,CAAC;QAE5D,OAAO,IAAI,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC;IACnC,CAAC;IAED,OAAO;QACN,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YACtB,OAAO,IAAI,CAAC,gBAAgB,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC7C,CAAC;QAED,6DAA6D;QAC7D,6DAA6D;QAC7D,gEAAgE;QAChE,oCAAoC;QACpC,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,kBAAkB,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;QAClF,IAAI,IAAI,CAAC,cAAc,EAAE,IAAI,CAAC,WAAW,EAAE,CAAC;YAC3C,OAAO,EAAE,OAAO,EAAE,EAAE,EAAE,WAAW,EAAE,EAAE,EAAE,CAAC;QACzC,CAAC;QAED,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,EAAE,CAAC;QAC1C,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,KAAK,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,eAAe,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAErF,M
AAM,WAAW,GAAG,IAAI,CAAC,iBAAiB,CAAC,WAAW,EAAE,QAAQ,CAAC,CAAC;QAClE,MAAM,SAAS,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,IAAI,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAC/E,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC;QACtC,MAAM,UAAU,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;QACxC,MAAM,WAAW,GAAG,IAAI,CAAC,iBAAiB,CAAC,WAAW,CAAC,CAAC;QAExD,OAAO;YACN,OAAO,EAAE,WAAW;YACpB,WAAW,EAAE,WAAW;YACxB,gBAAgB,EAAE;gBACjB,MAAM,EAAE,IAAI,CAAC,SAAS,EAAE;gBACxB,SAAS;gBACT,UAAU;aACV;YACD,SAAS,EAAE;gBACV,KAAK,EAAE,SAAS;gBAChB,MAAM,EAAE,UAAU;gBAClB,IAAI,EAAE,KAAK,SAAS,EAAE;gBACtB,WAAW;aACX;SACD,CAAC;IACH,CAAC;IAEO,gBAAgB,CAAC,IAAwB;QAChD,MAAM,SAAS,GAAG,IAAI,CAAC,aAAa,CAAC,aAAa,CAAC,CAAC;QACpD,MAAM,SAAS,GAAG,SAAS,EAAE,aAAa,CAAC,SAAS,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QACjF,MAAM,UAAU,GAAG,SAAS,EAAE,YAAY,CAAC,aAAa,CAAC,IAAI,EAAE,CAAC;QAChE,MAAM,SAAS,GAAG,SAAS,EAAE,YAAY,CAAC,gBAAgB,CAAC,IAAI,EAAE,CAAC;QAClE,MAAM,UAAU,GAAG,SAAS,EAAE,aAAa,CAAC,oBAAoB,CAAC,CAAC;QAClE,MAAM,QAAQ,GAAG,UAAU,CAAC,CAAC,CAAC,IAAA,mBAAa,EAAC,UAAU,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAE7D,IAAI,QAAQ,GAAG,EAAE,CAAC;QAClB,IAAI,IAAI,CAAC,OAAO,CAAC,cAAc,KAAK,KAAK,EAAE,CAAC;YAC3C,MAAM,WAAW,GAAG,IAAI,CAAC,aAAa,CAAC,yBAAyB,CAAC,CAAC;YAClE,MAAM,WAAW,GAAG,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,wBAAwB,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAClF,QAAQ,GAAG,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAA,2BAAgB,EAAC,WAAW,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QACxE,CAAC;QAED,MAAM,WAAW,GAAG,IAAI,CAAC,iBAAiB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;QAC/D,MAAM,WAAW,GAAG,IAAI,CAAC,iBAAiB,CAAC,QAAQ,CAAC,CAAC;QAErD,OAAO;YACN,OAAO,EAAE,WAAW;YACpB,WAAW,EAAE,WAAW;YACxB,gBAAgB,EAAE;gBACjB,MAAM,EAAE,IAAI,CAAC,SAAS,EAAE;gBACxB,SAAS;gBACT,UAAU;aACV;YACD,SAAS,EAAE;gBACV,KAAK,EAAE,SAAS;gBAChB,MAAM,EAAE,UAAU;gBAClB,IAAI,EAAE,KAAK,SAAS,EAAE;gBACtB,WAAW;aACX;SACD,CAAC;IACH,CAAC;IAEO,cAAc;QACrB,MAAM,UAAU,GAAG,IAAI,CAAC,YAAY,EAAE,aAAa,CAAC,oBAAoB,CAAC,CAAC;QAC1E,MAAM,QAAQ,GAAG,UAAU,CAAC,CAAC,CAAC,IAAA,mBAAa,EAAC,UAAU,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC7D,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,EAAE,aAAa,CAAC,aA
Aa,CAAC,EAAE,SAAS,IAAI,EAAE,CAAC;QAEnF,OAAO,QAAQ,GAAG,SAAS,CAAC;IAC7B,CAAC;IAEO,iBAAiB,CAAC,WAAmB,EAAE,QAAgB;QAC9D,OAAO,IAAA,2BAAgB,EAAC,QAAQ,EAAE,WAAW,EAAE,QAAQ,CAAC,CAAC;IAC1D,CAAC;IAEO,eAAe;QACtB,MAAM,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,kBAAkB,CAAC,CAAC,CAAC;QAChF,OAAO,IAAI,CAAC,eAAe,CAAC,QAAQ,CAAC,CAAC;IACvC,CAAC;IAEO,SAAS;QAChB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,0BAA0B,CAAC,CAAC;QACzD,OAAO,KAAK,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACzB,CAAC;IAEO,YAAY;QACnB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,cAAc,CAAC,CAAC;QAC7C,OAAO,KAAK,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACzB,CAAC;IAEO,aAAa;QACpB,OAAO,IAAI,CAAC,YAAY,EAAE,YAAY,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;IACxD,CAAC;IAEO,iBAAiB,CAAC,WAAmB;QAC5C,IAAI,CAAC,WAAW;YAAE,OAAO,EAAE,CAAC;QAE5B,MAAM,OAAO,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;QACnD,OAAO,CAAC,WAAW,CAAC,IAAA,eAAS,EAAC,IAAI,CAAC,QAAQ,EAAE,WAAW,CAAC,CAAC,CAAC;QAC3D,OAAO,OAAO,CAAC,WAAW,EAAE,IAAI,EAAE;aAChC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC;aACb,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,IAAI,EAAE,CAAC;IAC9B,CAAC;IAEO,wBAAwB,CAAC,SAAkB,EAAE,QAAgB,CAAC;QACrE,MAAM,MAAM,GAAkB,EAAE,CAAC;QACjC,MAAM,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,gBAAgB,CAAC,yBAAyB,CAAC,CAAC,CAAC;QAEnF,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAChC,MAAM,MAAM,GAAG,OAAO,CAAC,YAAY,CAAC,aAAa,CAAC,IAAI,EAAE,CAAC;YACzD,MAAM,SAAS,GAAG,OAAO,CAAC,YAAY,CAAC,gBAAgB,CAAC,IAAI,EAAE,CAAC;YAC/D,MAAM,KAAK,GAAG,OAAO,CAAC,aAAa,CAAC,gCAAgC,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YACjG,MAAM,MAAM,GAAG,OAAO,CAAC,aAAa,CAAC,gCAAgC,CAAC,CAAC;YACvE,MAAM,QAAQ,GAAG,MAAM,EAAE,YAAY,CAAC,UAAU,CAAC,IAAI,EAAE,CAAC;YACxD,MAAM,IAAI,GAAG,QAAQ,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAC5E,MAAM,MAAM,GAAG,OAAO,CAAC,aAAa,CAAC,2BAA2B,CAAC,CAAC;YAClE,MAAM,IAAI,GAAG,MAAM,CAAC,CAAC,CAAC,IAAA,mBAAa,EAAC,MAAM,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAEjD,MAAM,CAAC,IAAI,CAAC;gBACX,MAAM;gBACN,IAAI;gBACJ,OAAO,EAAE,IAAI;gBACb,KAAK;gBACL,KAAK,EAAE,K
AAK,IAAI,SAAS;gBACzB,GAAG,EAAE,SAAS,CAAC,CAAC,CAAC,qBAAqB,SAAS,EAAE,CAAC,CAAC,CAAC,SAAS;aAC7D,CAAC,CAAC;YAEH,MAAM,cAAc,GAAG,OAAO,CAAC,aAAa,CAAC,qBAAqB,CAAC,CAAC;YACpE,IAAI,cAAc,EAAE,CAAC;gBACpB,MAAM,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,wBAAwB,CAAC,cAAc,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC;YAC1E,CAAC;QACF,CAAC;QAED,OAAO,MAAM,CAAC;IACf,CAAC;IAEO,eAAe,CAAC,QAAmB;QAC1C,MAAM,WAAW,GAAkB,EAAE,CAAC;QAEtC,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAChC,MAAM,KAAK,GAAG,QAAQ,CAAC,OAAO,CAAC,YAAY,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC,CAAC;YAC7D,MAAM,MAAM,GAAG,OAAO,CAAC,YAAY,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;YACpD,MAAM,KAAK,GAAG,OAAO,CAAC,YAAY,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC;YACnD,MAAM,SAAS,GAAG,OAAO,CAAC,YAAY,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC;YAC1D,MAAM,SAAS,GAAG,OAAO,CAAC,aAAa,CAAC,kBAAkB,CAAC,CAAC;YAC5D,MAAM,OAAO,GAAG,SAAS,CAAC,CAAC,CAAC,IAAA,mBAAa,EAAC,SAAS,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAE1D,MAAM,SAAS,GAAG,OAAO,CAAC,YAAY,CAAC,SAAS,CAAC;mBAC7C,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,EAAE,YAAY,CAAC,UAAU,CAAC;mBACvD,EAAE,CAAC;YACP,MAAM,IAAI,GAAG,SAAS,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC,SAAS,CAAC,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAE9E,WAAW,CAAC,IAAI,CAAC;gBAChB,MAAM;gBACN,IAAI;gBACJ,OAAO;gBACP,KAAK;gBACL,KAAK,EAAE,GAAG,KAAK,SAAS;gBACxB,GAAG,EAAE,SAAS,CAAC,CAAC,CAAC,qBAAqB,SAAS,EAAE,CAAC,CAAC,CAAC,SAAS;aAC7D,CAAC,CAAC;QACJ,CAAC;QAED,OAAO,IAAA,2BAAgB,EAAC,WAAW,CAAC,CAAC;IACtC,CAAC;CACD;AAlOD,0CAkOC"}
@@ -0,0 +1,16 @@
1
+ import { BaseExtractor } from './_base';
2
+ import { ExtractorResult } from '../types/extractors';
3
+ export declare class TwitterExtractor extends BaseExtractor { // Extractor for X (Twitter) tweet and conversation pages.
4
+ private mainTweet; // First tweet article of the conversation timeline, or the first tweet article on the page when no timeline exists; null when none is found.
5
+ private threadTweets; // Tweet articles that follow the main tweet in the timeline, cut off at the first section/h2 container.
6
+ constructor(document: Document, url: string); // Locates the main tweet and thread tweets in the given document.
7
+ canExtract(): boolean; // True when a main tweet was found.
8
+ extract(): ExtractorResult; // Builds HTML for the main tweet plus thread tweets (thread omitted when options.includeReplies is false).
9
+ private formatTweetText; // Replaces links with their text, unwraps span/div elements and wraps each non-empty line in <p>.
10
+ private extractTweet; // Renders one tweet (header, text, images, quoted tweet) as an HTML string.
11
+ private extractUserInfo; // Reads full name, handle, date (YYYY-MM-DD) and permalink from a tweet element.
12
+ private extractImages; // Returns <img> markup for the tweet's images, skipping images inside a quoted tweet.
13
+ private getTweetId; // Digits following "status/" in the URL, or ''.
14
+ private getTweetAuthor; // Handle of the main tweet, prefixed with '@' when it does not already start with one.
15
+ private createDescription; // First 140 characters of the tweet text with whitespace runs collapsed.
16
+ }
@@ -0,0 +1,204 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.TwitterExtractor = void 0;
4
+ const _base_1 = require("./_base");
5
+ const dom_1 = require("../utils/dom");
6
/**
 * Extractor for X (Twitter) tweet and conversation pages.
 *
 * The constructor locates the main tweet and, when the page shows a
 * conversation timeline, the tweets that follow it. `extract()` renders them
 * as an HTML thread.
 */
class TwitterExtractor extends _base_1.BaseExtractor {
    /**
     * @param {Document} document DOM of the page being extracted.
     * @param {string} url URL of the page; the tweet id is parsed from it.
     */
    constructor(document, url) {
        super(document, url);
        this.mainTweet = null;
        this.threadTweets = [];
        // Get all tweets from the timeline
        const timeline = document.querySelector('[aria-label="Timeline: Conversation"]');
        if (!timeline) {
            // Try to find a single tweet if not in timeline view
            const singleTweet = document.querySelector('article[data-testid="tweet"]');
            if (singleTweet) {
                this.mainTweet = singleTweet;
            }
            return;
        }
        // Get all tweets before any section with "Discover more" or similar headings
        let allTweets = Array.from(timeline.querySelectorAll('article[data-testid="tweet"]'));
        const firstSection = timeline.querySelector('section, h2')?.parentElement;
        if (firstSection) {
            // Value of Node.DOCUMENT_POSITION_FOLLOWING as fixed by the DOM
            // Standard. A local constant is used because a global `Node` is
            // only guaranteed to exist in browsers; referencing it elsewhere
            // would throw a ReferenceError.
            const DOCUMENT_POSITION_FOLLOWING = 0x04;
            // Filter out tweets that appear after the first section
            const cutoffIndex = allTweets.findIndex(tweet => (firstSection.compareDocumentPosition(tweet) & DOCUMENT_POSITION_FOLLOWING) !== 0);
            if (cutoffIndex !== -1) {
                allTweets = allTweets.slice(0, cutoffIndex);
            }
        }
        // Set main tweet and thread tweets
        this.mainTweet = allTweets[0] || null;
        this.threadTweets = allTweets.slice(1);
    }
    /**
     * @returns {boolean} True when a main tweet was found in the document.
     */
    canExtract() {
        return !!this.mainTweet;
    }
    /**
     * Renders the main tweet and, unless `options.includeReplies` is false,
     * the thread tweets.
     *
     * @returns {object} Result with `content`, `contentHtml`, `extractedContent` and `variables`.
     */
    extract() {
        const mainContent = this.extractTweet(this.mainTweet);
        const threadContent = this.options.includeReplies !== false
            ? this.threadTweets.map(tweet => this.extractTweet(tweet)).join('\n<hr>\n')
            : '';
        const contentHtml = `
			<div class="tweet-thread">
				<div class="main-tweet">
					${mainContent}
				</div>
				${threadContent ? `
					<hr>
					<div class="thread-tweets">
						${threadContent}
					</div>
				` : ''}
			</div>
		`.trim();
        const tweetId = this.getTweetId();
        const tweetAuthor = this.getTweetAuthor();
        const description = this.createDescription(this.mainTweet);
        return {
            content: contentHtml,
            contentHtml: contentHtml,
            extractedContent: {
                tweetId,
                tweetAuthor,
            },
            variables: {
                title: `Thread by ${tweetAuthor}`,
                author: tweetAuthor,
                site: 'X (Twitter)',
                description,
            }
        };
    }
    /**
     * Escapes text for safe interpolation into HTML content or a quoted
     * attribute value.
     *
     * Values read through `textContent` / `getAttribute` are already decoded,
     * so characters such as `<`, `&` or `"` must be re-encoded before being
     * placed inside hand-built markup.
     *
     * @param {string} value Raw text.
     * @returns {string} Text with `& < > " '` replaced by character references.
     */
    escapeHtml(value) {
        return String(value)
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;')
            .replace(/"/g, '&quot;')
            .replace(/'/g, '&#39;');
    }
    /**
     * Converts a timestamp string to a `YYYY-MM-DD` date.
     *
     * @param {string} value Timestamp text taken from the page.
     * @returns {string} ISO calendar date, or '' when `value` is empty or
     * unparseable (`toISOString` would otherwise throw a RangeError).
     */
    toIsoDate(value) {
        if (!value)
            return '';
        const parsed = new Date(value);
        return Number.isNaN(parsed.getTime()) ? '' : parsed.toISOString().split('T')[0];
    }
    /**
     * Simplifies tweet text HTML into a sequence of paragraphs.
     *
     * @param {string} text Serialized HTML of the tweet text element.
     * @returns {string} One `<p>` per non-empty line, joined by newlines; '' for empty input.
     */
    formatTweetText(text) {
        if (!text)
            return '';
        // Create a temporary div to parse and clean the HTML
        const tempDiv = this.document.createElement('div');
        tempDiv.appendChild((0, dom_1.parseHTML)(this.document, text));
        // Convert links to plain text with @ handles
        tempDiv.querySelectorAll('a').forEach(link => {
            const handle = link.textContent?.trim() || '';
            link.replaceWith(handle);
        });
        // Remove unnecessary spans and divs but keep their content
        tempDiv.querySelectorAll('span, div').forEach(element => {
            element.replaceWith(...Array.from(element.childNodes));
        });
        // Get cleaned text and split into paragraphs
        const cleanText = (0, dom_1.serializeHTML)(tempDiv);
        const paragraphs = cleanText.split('\n')
            .map(line => line.trim())
            .filter(line => line);
        // Wrap each paragraph in <p> tags
        return paragraphs.map(p => `<p>${p}</p>`).join('\n');
    }
    /**
     * Renders a single tweet element as HTML.
     *
     * @param {Element|null} tweet Tweet element, or a quoted-tweet container.
     * @returns {string} Tweet markup (header, text, media, quoted tweet); '' when `tweet` is null.
     */
    extractTweet(tweet) {
        if (!tweet)
            return '';
        // Clone the tweet element to modify it
        const tweetClone = tweet.cloneNode(true);
        // Convert emoji images to text
        tweetClone.querySelectorAll('img[src*="/emoji/"]').forEach(img => {
            const altText = img.getAttribute('alt');
            if (altText) {
                img.replaceWith(altText);
            }
        });
        const tweetTextEl = tweetClone.querySelector('[data-testid="tweetText"]');
        const tweetText = tweetTextEl ? (0, dom_1.serializeHTML)(tweetTextEl) : '';
        const formattedText = this.formatTweetText(tweetText);
        const images = this.extractImages(tweet);
        // Get author info and date
        const userInfo = this.extractUserInfo(tweet);
        // These values come from decoded DOM text/attributes, so escape them
        // before placing them in hand-built markup.
        const fullName = this.escapeHtml(userInfo.fullName);
        const handle = this.escapeHtml(userInfo.handle);
        const permalink = this.escapeHtml(userInfo.permalink);
        // Extract quoted tweet if present
        const quotedTweet = tweet.querySelector('[aria-labelledby*="id__"]')?.querySelector('[data-testid="User-Name"]')?.closest('[aria-labelledby*="id__"]');
        const quotedContent = quotedTweet ? this.extractTweet(quotedTweet) : '';
        return `
			<div class="tweet">
				<div class="tweet-header">
					<span class="tweet-author"><strong>${fullName}</strong> <span class="tweet-handle">${handle}</span></span>
					${userInfo.date ? `<a href="${permalink}" class="tweet-date">${userInfo.date}</a>` : ''}
				</div>
				${formattedText ? `<div class="tweet-text">${formattedText}</div>` : ''}
				${images.length ? `
					<div class="tweet-media">
						${images.join('\n')}
					</div>
				` : ''}
				${quotedContent ? `
					<blockquote class="quoted-tweet">
						${quotedContent}
					</blockquote>
				` : ''}
			</div>
		`.trim();
    }
    /**
     * Reads author and timestamp details from a tweet element.
     *
     * @param {Element} tweet Tweet element.
     * @returns {{fullName: string, handle: string, date: string, permalink: string}}
     * Raw (unescaped) values; every field is '' when it cannot be found.
     */
    extractUserInfo(tweet) {
        const nameElement = tweet.querySelector('[data-testid="User-Name"]');
        if (!nameElement)
            return { fullName: '', handle: '', date: '', permalink: '' };
        // Try to get name and handle from links first (main tweet structure)
        const links = nameElement.querySelectorAll('a');
        let fullName = links?.[0]?.textContent?.trim() || '';
        let handle = links?.[1]?.textContent?.trim() || '';
        // If links don't have the info, try to get from spans (quoted tweet structure)
        if (!fullName || !handle) {
            fullName = nameElement.querySelector('span[style*="color: rgb(15, 20, 25)"] span')?.textContent?.trim() || '';
            handle = nameElement.querySelector('span[style*="color: rgb(83, 100, 113)"]')?.textContent?.trim() || '';
        }
        const timestamp = tweet.querySelector('time');
        const date = this.toIsoDate(timestamp?.getAttribute('datetime') || '');
        const permalink = timestamp?.closest('a')?.href || '';
        return { fullName, handle, date, permalink };
    }
    /**
     * Collects the tweet's own images as `<img>` markup.
     *
     * @param {Element} tweet Tweet element.
     * @returns {string[]} One `<img>` tag per image that has an `alt` attribute
     * and is not inside a quoted tweet; each image element is emitted once.
     */
    extractImages(tweet) {
        // Look for images in different containers
        const imageContainers = [
            '[data-testid="tweetPhoto"]',
            '[data-testid="tweet-image"]',
            'img[src*="media"]'
        ];
        const images = [];
        // An element can match more than one selector; remember what was emitted.
        const seen = new Set();
        // Skip images that are inside quoted tweets
        const quotedTweet = tweet.querySelector('[aria-labelledby*="id__"]')?.querySelector('[data-testid="User-Name"]')?.closest('[aria-labelledby*="id__"]');
        for (const selector of imageContainers) {
            for (const img of Array.from(tweet.querySelectorAll(selector))) {
                // Skip if the image is inside a quoted tweet
                if (quotedTweet?.contains(img) || seen.has(img)) {
                    continue;
                }
                // Check if element is an image by checking tag name and required properties
                if (img.tagName.toLowerCase() !== 'img' || !img.getAttribute('alt')) {
                    continue;
                }
                seen.add(img);
                const highQualitySrc = img.getAttribute('src')?.replace(/&name=\w+$/, '&name=large') || '';
                const cleanAlt = img.getAttribute('alt')?.replace(/\s+/g, ' ').trim() || '';
                images.push(`<img src="${this.escapeHtml(highQualitySrc)}" alt="${this.escapeHtml(cleanAlt)}" />`);
            }
        }
        return images;
    }
    /**
     * @returns {string} Digits following `status/` in the URL, or ''.
     */
    getTweetId() {
        const match = this.url.match(/status\/(\d+)/);
        return match?.[1] || '';
    }
    /**
     * @returns {string} Text of the second link in the main tweet's user-name
     * element, prefixed with '@' when it does not already start with one.
     */
    getTweetAuthor() {
        const nameElement = this.mainTweet?.querySelector('[data-testid="User-Name"]');
        const links = nameElement?.querySelectorAll('a');
        const handle = links?.[1]?.textContent?.trim() || '';
        return handle.startsWith('@') ? handle : `@${handle}`;
    }
    /**
     * @param {Element|null} tweet Tweet element.
     * @returns {string} Up to 140 characters of the trimmed tweet text with
     * whitespace runs collapsed; '' when `tweet` is null or has no text.
     */
    createDescription(tweet) {
        if (!tweet)
            return '';
        const tweetText = tweet.querySelector('[data-testid="tweetText"]')?.textContent || '';
        return tweetText.trim().slice(0, 140).replace(/\s+/g, ' ');
    }
}
203
+ exports.TwitterExtractor = TwitterExtractor;
204
+ //# sourceMappingURL=twitter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"twitter.js","sourceRoot":"","sources":["../../src/extractors/twitter.ts"],"names":[],"mappings":";;;AAAA,mCAAwC;AAExC,sCAAwD;AAExD,MAAa,gBAAiB,SAAQ,qBAAa;IAIlD,YAAY,QAAkB,EAAE,GAAW;QAC1C,KAAK,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;QAJd,cAAS,GAAmB,IAAI,CAAC;QACjC,iBAAY,GAAc,EAAE,CAAC;QAKpC,mCAAmC;QACnC,MAAM,QAAQ,GAAG,QAAQ,CAAC,aAAa,CAAC,uCAAuC,CAAC,CAAC;QACjF,IAAI,CAAC,QAAQ,EAAE,CAAC;YACf,qDAAqD;YACrD,MAAM,WAAW,GAAG,QAAQ,CAAC,aAAa,CAAC,8BAA8B,CAAC,CAAC;YAC3E,IAAI,WAAW,EAAE,CAAC;gBACjB,IAAI,CAAC,SAAS,GAAG,WAAW,CAAC;YAC9B,CAAC;YACD,OAAO;QACR,CAAC;QAED,6EAA6E;QAC7E,IAAI,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,8BAA8B,CAAC,CAAC,CAAC;QACtF,MAAM,YAAY,GAAG,QAAQ,CAAC,aAAa,CAAC,aAAa,CAAC,EAAE,aAAa,CAAC;QAE1E,IAAI,YAAY,EAAE,CAAC;YAClB,wDAAwD;YACxD,MAAM,WAAW,GAAG,SAAS,CAAC,SAAS,CAAC,KAAK,CAAC,EAAE,CAC/C,YAAY,CAAC,uBAAuB,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,2BAA2B,CAC9E,CAAC;YACF,IAAI,WAAW,KAAK,CAAC,CAAC,EAAE,CAAC;gBACxB,SAAS,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,WAAW,CAAC,CAAC;YAC7C,CAAC;QACF,CAAC;QAED,mCAAmC;QACnC,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC;QACtC,IAAI,CAAC,YAAY,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IACxC,CAAC;IAED,UAAU;QACT,OAAO,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;IACzB,CAAC;IAED,OAAO;QACN,MAAM,WAAW,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QACtD,MAAM,aAAa,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,KAAK,KAAK;YAC1D,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,IAAI,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC;YAC3E,CAAC,CAAC,EAAE,CAAC;QAEN,MAAM,WAAW,GAAG;;;OAGf,WAAW;;MAEZ,aAAa,CAAC,CAAC,CAAC;;;QAGd,aAAa;;KAEhB,CAAC,CAAC,CAAC,EAAE;;GAEP,CAAC,IAAI,EAAE,CAAC;QAET,MAAM,OAAO,GAAG,IAAI,CAAC,UAAU,EAAE,CAAC;QAClC,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,EAAE,CAAC;QAC1C,MAAM,WAAW,GAAG,IAAI,CAAC,iBAAiB,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QAE3D,OAAO;YACN,OAAO,EAAE,WAAW;YACpB,WAAW,EAAE,WAAW;YACxB,gBAAgB,EAAE;gBACjB,OAAO;gBACP,WAAW;aACX;YACD,SAAS,EAAE;gBACV,KAAK,EAAE,aAAa,WAAW,EAAE;gBACjC,MAAM,EAAE,WAAW;gBACnB,IAAI,EAAE,aAAa;gBACnB,WAAW;aACX;SACD,CAA
C;IACH,CAAC;IAEO,eAAe,CAAC,IAAY;QACnC,IAAI,CAAC,IAAI;YAAE,OAAO,EAAE,CAAC;QAErB,qDAAqD;QACrD,MAAM,OAAO,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;QACnD,OAAO,CAAC,WAAW,CAAC,IAAA,eAAS,EAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC,CAAC;QAEpD,6CAA6C;QAC7C,OAAO,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE;YAC5C,MAAM,MAAM,GAAG,IAAI,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAC9C,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;QAC1B,CAAC,CAAC,CAAC;QAEH,2DAA2D;QAC3D,OAAO,CAAC,gBAAgB,CAAC,WAAW,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,EAAE;YACvD,OAAO,CAAC,WAAW,CAAC,GAAG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC;QACxD,CAAC,CAAC,CAAC;QAEH,6CAA6C;QAC7C,MAAM,SAAS,GAAG,IAAA,mBAAa,EAAC,OAAO,CAAC,CAAC;QACzC,MAAM,UAAU,GAAG,SAAS,CAAC,KAAK,CAAC,IAAI,CAAC;aACtC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;aACxB,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC;QAEvB,kCAAkC;QAClC,OAAO,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACtD,CAAC;IAEO,YAAY,CAAC,KAAqB;QACzC,IAAI,CAAC,KAAK;YAAE,OAAO,EAAE,CAAC;QAEtB,uCAAuC;QACvC,MAAM,UAAU,GAAG,KAAK,CAAC,SAAS,CAAC,IAAI,CAAY,CAAC;QAEpD,+BAA+B;QAC/B,UAAU,CAAC,gBAAgB,CAAC,qBAAqB,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE;YAChE,IAAI,GAAG,CAAC,OAAO,CAAC,WAAW,EAAE,KAAK,KAAK,IAAI,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,EAAE,CAAC;gBACpE,MAAM,OAAO,GAAG,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC;gBACxC,IAAI,OAAO,EAAE,CAAC;oBACb,GAAG,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;gBAC1B,CAAC;YACF,CAAC;QACF,CAAC,CAAC,CAAC;QAEH,MAAM,WAAW,GAAG,UAAU,CAAC,aAAa,CAAC,2BAA2B,CAAC,CAAC;QAC1E,MAAM,SAAS,GAAG,WAAW,CAAC,CAAC,CAAC,IAAA,mBAAa,EAAC,WAAW,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAChE,MAAM,aAAa,GAAG,IAAI,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC;QACtD,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;QAEzC,2BAA2B;QAC3B,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC;QAE7C,kCAAkC;QAClC,MAAM,WAAW,GAAG,KAAK,CAAC,aAAa,CAAC,2BAA2B,CAAC,EAAE,aAAa,CAAC,2BAA2B,CAAC,EAAE,OAAO,CAAC,2BAA2B,CAAC,CAAC;QACvJ,MAAM,aAAa,GAAG,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,EAAE,C
AAC;QAExE,OAAO;;;0CAGiC,QAAQ,CAAC,QAAQ,wCAAwC,QAAQ,CAAC,MAAM;OAC3G,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,YAAY,QAAQ,CAAC,SAAS,wBAAwB,QAAQ,CAAC,IAAI,MAAM,CAAC,CAAC,CAAC,EAAE;;MAE/F,aAAa,CAAC,CAAC,CAAC,2BAA2B,aAAa,QAAQ,CAAC,CAAC,CAAC,EAAE;MACrE,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC;;QAEd,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC;;KAEpB,CAAC,CAAC,CAAC,EAAE;MACJ,aAAa,CAAC,CAAC,CAAC;;QAEd,aAAa;;KAEhB,CAAC,CAAC,CAAC,EAAE;;GAEP,CAAC,IAAI,EAAE,CAAC;IACV,CAAC;IAEO,eAAe,CAAC,KAAc;QACrC,MAAM,WAAW,GAAG,KAAK,CAAC,aAAa,CAAC,2BAA2B,CAAC,CAAC;QACrE,IAAI,CAAC,WAAW;YAAE,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,IAAI,EAAE,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,CAAC;QAE/E,qEAAqE;QACrE,MAAM,KAAK,GAAG,WAAW,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC;QAChD,IAAI,QAAQ,GAAG,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QACrD,IAAI,MAAM,GAAG,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAEnD,+EAA+E;QAC/E,IAAI,CAAC,QAAQ,IAAI,CAAC,MAAM,EAAE,CAAC;YAC1B,QAAQ,GAAG,WAAW,CAAC,aAAa,CAAC,4CAA4C,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAC9G,MAAM,GAAG,WAAW,CAAC,aAAa,CAAC,yCAAyC,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAC1G,CAAC;QAED,MAAM,SAAS,GAAG,KAAK,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;QAC9C,MAAM,QAAQ,GAAG,SAAS,EAAE,YAAY,CAAC,UAAU,CAAC,IAAI,EAAE,CAAC;QAC3D,MAAM,IAAI,GAAG,QAAQ,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC5E,MAAM,SAAS,GAAG,SAAS,EAAE,OAAO,CAAC,GAAG,CAAC,EAAE,IAAI,IAAI,EAAE,CAAC;QAEtD,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,CAAC;IAC9C,CAAC;IAEO,aAAa,CAAC,KAAc;QACnC,0CAA0C;QAC1C,MAAM,eAAe,GAAG;YACvB,4BAA4B;YAC5B,6BAA6B;YAC7B,mBAAmB;SACnB,CAAC;QAEF,MAAM,MAAM,GAAa,EAAE,CAAC;QAE5B,4CAA4C;QAC5C,MAAM,WAAW,GAAG,KAAK,CAAC,aAAa,CAAC,2BAA2B,CAAC,EAAE,aAAa,CAAC,2BAA2B,CAAC,EAAE,OAAO,CAAC,2BAA2B,CAAC,CAAC;QAEvJ,KAAK,MAAM,QAAQ,IAAI,eAAe,EAAE,CAAC;YACxC,MAAM,QAAQ,GAAG,KAAK,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC;YAElD,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE;gBACtB,6CAA6C;gBAC7C,IAAI,WAAW,EAAE,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;oBAChC
,OAAO;gBACR,CAAC;gBAED,4EAA4E;gBAC5E,IAAI,GAAG,CAAC,OAAO,CAAC,WAAW,EAAE,KAAK,KAAK,IAAI,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,EAAE,CAAC;oBACpE,MAAM,cAAc,GAAG,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC,YAAY,EAAE,aAAa,CAAC,IAAI,EAAE,CAAC;oBAC3F,MAAM,QAAQ,GAAG,GAAG,CAAC,YAAY,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC;oBAC5E,MAAM,CAAC,IAAI,CAAC,aAAa,cAAc,UAAU,QAAQ,MAAM,CAAC,CAAC;gBAClE,CAAC;YACF,CAAC,CAAC,CAAC;QACJ,CAAC;QAED,OAAO,MAAM,CAAC;IACf,CAAC;IAEO,UAAU;QACjB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC;QAC9C,OAAO,KAAK,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACzB,CAAC;IAEO,cAAc;QACrB,MAAM,WAAW,GAAG,IAAI,CAAC,SAAS,EAAE,aAAa,CAAC,2BAA2B,CAAC,CAAC;QAC/E,MAAM,KAAK,GAAG,WAAW,EAAE,gBAAgB,CAAC,GAAG,CAAC,CAAC;QACjD,MAAM,MAAM,GAAG,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QACrD,OAAO,MAAM,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,MAAM,EAAE,CAAC;IACvD,CAAC;IAEO,iBAAiB,CAAC,KAAqB;QAC9C,IAAI,CAAC,KAAK;YAAE,OAAO,EAAE,CAAC;QAEtB,MAAM,SAAS,GAAG,KAAK,CAAC,aAAa,CAAC,2BAA2B,CAAC,EAAE,WAAW,IAAI,EAAE,CAAC;QACtF,OAAO,SAAS,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IAC5D,CAAC;CACD;AAzOD,4CAyOC"}
@@ -0,0 +1,24 @@
1
+ import { BaseExtractor } from './_base';
2
+ import { ExtractorResult } from '../types/extractors';
3
/**
 * Type declaration for XArticleExtractor, a subclass of BaseExtractor.
 *
 * Only the public surface is typed here: the constructor, canExtract() and
 * extract(). Private members are listed by name only, without types or
 * signatures, so their behaviour cannot be read from this declaration.
 * NOTE(review): presumably targets long-form articles on X (Twitter), going
 * by the class name — confirm against the implementation.
 */
+ export declare class XArticleExtractor extends BaseExtractor {
4
+ private articleContainer;
5
// Takes the document and its URL; schemaOrgData is optional and untyped (any).
+ constructor(document: Document, url: string, schemaOrgData?: any);
6
// Returns a boolean; presumably whether this extractor applies to the document — confirm.
+ canExtract(): boolean;
7
// Returns an ExtractorResult (type imported from '../types/extractors').
+ extract(): ExtractorResult;
8
// Private members below are declared by name only.
+ private extractTitle;
9
+ private extractAuthor;
10
+ private getAuthorFromUrl;
11
+ private getAuthorFromOgTitle;
12
+ private getArticleId;
13
+ private extractContent;
14
+ private cleanContent;
15
+ private convertEmbeddedTweets;
16
+ private convertCodeBlocks;
17
+ private convertHeaders;
18
+ private unwrapLinkedImages;
19
+ private upgradeImageQuality;
20
+ private convertDraftParagraphs;
21
+ private convertBoldSpans;
22
+ private removeDraftAttributes;
23
+ private createDescription;
24
+ }