@mz1999/defuddle 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +371 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +145 -0
- package/dist/cli.js.map +1 -0
- package/dist/constants.d.ts +24 -0
- package/dist/constants.js +950 -0
- package/dist/constants.js.map +1 -0
- package/dist/defuddle.d.ts +136 -0
- package/dist/defuddle.js +1816 -0
- package/dist/defuddle.js.map +1 -0
- package/dist/elements/callouts.d.ts +6 -0
- package/dist/elements/callouts.js +74 -0
- package/dist/elements/callouts.js.map +1 -0
- package/dist/elements/code.d.ts +5 -0
- package/dist/elements/code.js +346 -0
- package/dist/elements/code.js.map +1 -0
- package/dist/elements/footnotes.d.ts +5 -0
- package/dist/elements/footnotes.js +619 -0
- package/dist/elements/footnotes.js.map +1 -0
- package/dist/elements/headings.d.ts +11 -0
- package/dist/elements/headings.js +100 -0
- package/dist/elements/headings.js.map +1 -0
- package/dist/elements/images.d.ts +8 -0
- package/dist/elements/images.js +877 -0
- package/dist/elements/images.js.map +1 -0
- package/dist/elements/math.base.d.ts +9 -0
- package/dist/elements/math.base.js +195 -0
- package/dist/elements/math.base.js.map +1 -0
- package/dist/elements/math.core.d.ts +7 -0
- package/dist/elements/math.core.js +52 -0
- package/dist/elements/math.core.js.map +1 -0
- package/dist/elements/math.d.ts +2 -0
- package/dist/elements/math.full.d.ts +8 -0
- package/dist/elements/math.js +7 -0
- package/dist/elements/math.js.map +1 -0
- package/dist/extractor-registry.d.ts +16 -0
- package/dist/extractor-registry.js +140 -0
- package/dist/extractor-registry.js.map +1 -0
- package/dist/extractors/_base.d.ts +22 -0
- package/dist/extractors/_base.js +27 -0
- package/dist/extractors/_base.js.map +1 -0
- package/dist/extractors/_conversation.d.ts +9 -0
- package/dist/extractors/_conversation.js +78 -0
- package/dist/extractors/_conversation.js.map +1 -0
- package/dist/extractors/chatgpt.d.ts +14 -0
- package/dist/extractors/chatgpt.js +138 -0
- package/dist/extractors/chatgpt.js.map +1 -0
- package/dist/extractors/claude.d.ts +10 -0
- package/dist/extractors/claude.js +91 -0
- package/dist/extractors/claude.js.map +1 -0
- package/dist/extractors/gemini.d.ts +14 -0
- package/dist/extractors/gemini.js +111 -0
- package/dist/extractors/gemini.js.map +1 -0
- package/dist/extractors/github.d.ts +20 -0
- package/dist/extractors/github.js +251 -0
- package/dist/extractors/github.js.map +1 -0
- package/dist/extractors/grok.d.ts +15 -0
- package/dist/extractors/grok.js +142 -0
- package/dist/extractors/grok.js.map +1 -0
- package/dist/extractors/hackernews.d.ts +21 -0
- package/dist/extractors/hackernews.js +155 -0
- package/dist/extractors/hackernews.js.map +1 -0
- package/dist/extractors/reddit.d.ts +22 -0
- package/dist/extractors/reddit.js +197 -0
- package/dist/extractors/reddit.js.map +1 -0
- package/dist/extractors/twitter.d.ts +16 -0
- package/dist/extractors/twitter.js +204 -0
- package/dist/extractors/twitter.js.map +1 -0
- package/dist/extractors/x-article.d.ts +24 -0
- package/dist/extractors/x-article.js +267 -0
- package/dist/extractors/x-article.js.map +1 -0
- package/dist/extractors/x-oembed.d.ts +20 -0
- package/dist/extractors/x-oembed.js +350 -0
- package/dist/extractors/x-oembed.js.map +1 -0
- package/dist/extractors/youtube.d.ts +87 -0
- package/dist/extractors/youtube.js +869 -0
- package/dist/extractors/youtube.js.map +1 -0
- package/dist/fetch.d.ts +18 -0
- package/dist/fetch.js +265 -0
- package/dist/fetch.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.full.d.ts +12 -0
- package/dist/index.full.js +1 -0
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -0
- package/dist/markdown.d.ts +30 -0
- package/dist/markdown.js +661 -0
- package/dist/markdown.js.map +1 -0
- package/dist/metadata.d.ts +25 -0
- package/dist/metadata.js +426 -0
- package/dist/metadata.js.map +1 -0
- package/dist/node.d.ts +19 -0
- package/dist/node.js +78 -0
- package/dist/node.js.map +1 -0
- package/dist/scoring.d.ts +31 -0
- package/dist/scoring.js +472 -0
- package/dist/scoring.js.map +1 -0
- package/dist/standardize.d.ts +2 -0
- package/dist/standardize.js +1101 -0
- package/dist/standardize.js.map +1 -0
- package/dist/types/extractors.d.ts +41 -0
- package/dist/types/extractors.js +3 -0
- package/dist/types/extractors.js.map +1 -0
- package/dist/types.d.ts +135 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/comments.d.ts +44 -0
- package/dist/utils/comments.js +103 -0
- package/dist/utils/comments.js.map +1 -0
- package/dist/utils/dom.d.ts +42 -0
- package/dist/utils/dom.js +104 -0
- package/dist/utils/dom.js.map +1 -0
- package/dist/utils/linkedom-compat.d.ts +5 -0
- package/dist/utils/linkedom-compat.js +23 -0
- package/dist/utils/linkedom-compat.js.map +1 -0
- package/dist/utils/transcript.d.ts +37 -0
- package/dist/utils/transcript.js +61 -0
- package/dist/utils/transcript.js.map +1 -0
- package/dist/utils.d.ts +13 -0
- package/dist/utils.js +98 -0
- package/dist/utils.js.map +1 -0
- package/package.json +107 -0
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.GitHubExtractor = void 0;
|
|
4
|
+
const _base_1 = require("./_base");
|
|
5
|
+
const dom_1 = require("../utils/dom");
|
|
6
|
+
const comments_1 = require("../utils/comments");
|
|
7
|
+
/**
 * Extractor for GitHub issue and pull-request pages.
 *
 * The page kind is derived from the URL (`/issues/<n>` or `/pull/<n>`).
 * `extract()` gathers the opening post, optionally the comments, and
 * repository metadata, and hands the HTML pieces to the shared comment helpers.
 */
class GitHubExtractor extends _base_1.BaseExtractor {
    /**
     * @param document - Document of the page being extracted.
     * @param url - URL of the page; tested for `/issues/<n>` and `/pull/<n>`.
     */
    constructor(document, url) {
        super(document, url);
        this.isIssue = /\/issues\/\d+/.test(url);
        this.isPR = /\/pull\/\d+/.test(url);
    }
    /**
     * Reports whether the document looks like a GitHub issue or PR page.
     *
     * @returns `true` only when at least one generic GitHub marker is present
     * AND at least one marker specific to the detected page kind is present.
     */
    canExtract() {
        const exists = (selector) => this.document.querySelector(selector) !== null;
        const githubIndicators = [
            'meta[name="expected-hostname"][content="github.com"]',
            'meta[name="octolytics-url"]',
            'meta[name="github-keyboard-shortcuts"]',
            '.js-header-wrapper',
            '#js-repo-pjax-container',
        ];
        if (!githubIndicators.some(exists)) {
            return false;
        }
        if (this.isIssue) {
            return [
                '[data-testid="issue-metadata-sticky"]',
                '[data-testid="issue-title"]',
            ].some(exists);
        }
        if (this.isPR) {
            return [
                '.pull-discussion-timeline',
                '.discussion-timeline',
                '.gh-header-title',
                '.js-issue-title',
            ].some(exists);
        }
        return false;
    }
    /**
     * Builds the extraction result for the current page.
     *
     * Comments are included unless `this.options.includeReplies` is exactly `false`.
     *
     * @returns Object with `content` / `contentHtml` (the same HTML string),
     * `extractedContent` (type, number, repository, owner) and `variables`
     * (title, author, published, site, description).
     */
    extract() {
        const repoInfo = this.extractRepoInfo();
        const number = this.extractNumber();
        const type = this.isPR ? 'pull' : 'issue';
        // The PR body element is resolved once and shared so the comment pass
        // can skip it.
        const prBody = this.isPR ? this.getPRBody() : null;
        const { content: postContent, author, published } = this.isPR
            ? this.getPRContent(prBody)
            : this.getIssueContent();
        const comments = this.options.includeReplies !== false
            ? (this.isPR ? this.extractPRComments(prBody) : this.extractComments())
            : '';
        const contentHtml = this.createContentHtml(postContent, comments);
        return {
            content: contentHtml,
            contentHtml: contentHtml,
            extractedContent: {
                type,
                number,
                repository: repoInfo.repo,
                owner: repoInfo.owner,
            },
            variables: {
                title: this.document.title,
                author,
                published,
                site: `GitHub - ${repoInfo.owner}/${repoInfo.repo}`,
                description: this.createDescription(contentHtml),
            }
        };
    }
    /**
     * Combines the post HTML and the comments HTML through `buildContentHtml`,
     * tagged with the `'github'` site key.
     */
    createContentHtml(postContent, comments) {
        return (0, comments_1.buildContentHtml)('github', postContent, comments);
    }
    /**
     * Reads the opening post of an issue page.
     *
     * @returns `{ content, author, published }`; all fields are empty strings
     * when the issue container is missing, and `content` is empty when the
     * body element is missing.
     */
    getIssueContent() {
        const issueContainer = this.document.querySelector('[data-testid="issue-viewer-issue-container"]');
        if (!issueContainer)
            return { content: '', author: '', published: '' };
        const author = this.extractAuthor(issueContainer, [
            'a[data-testid="issue-body-header-author"]',
            '.IssueBodyHeaderAuthor-module__authorLoginLink--_S7aT',
            '.ActivityHeader-module__AuthorLink--iofTU',
            'a[href*="/users/"][data-hovercard-url*="/users/"]',
            'a[aria-label*="profile"]'
        ]);
        const issueTimeElement = issueContainer.querySelector('relative-time');
        const published = issueTimeElement?.getAttribute('datetime') || '';
        const issueBodyElement = issueContainer.querySelector('[data-testid="issue-body-viewer"] .markdown-body');
        if (!issueBodyElement)
            return { content: '', author, published };
        const content = this.cleanBodyContent(issueBodyElement);
        return { content, author, published };
    }
    /**
     * Collects issue comments from `[data-wrapper-timeline-id]` wrappers.
     *
     * Wrappers without a `.react-issue-comment` child, without an id, with an
     * id already seen, or without non-empty body content are skipped.
     *
     * @returns The result of `buildCommentTree` for the collected entries.
     */
    extractComments() {
        const commentElements = Array.from(this.document.querySelectorAll('[data-wrapper-timeline-id]'));
        const processedComments = new Set();
        const commentData = [];
        for (const commentElement of commentElements) {
            const commentContainer = commentElement.querySelector('.react-issue-comment');
            if (!commentContainer)
                continue;
            const commentId = commentElement.getAttribute('data-wrapper-timeline-id');
            if (!commentId || processedComments.has(commentId))
                continue;
            processedComments.add(commentId);
            const author = this.extractAuthor(commentContainer, [
                '.ActivityHeader-module__AuthorLink--iofTU',
                'a[data-testid="avatar-link"]',
                'a[href^="/"][data-hovercard-url*="/users/"]'
            ]);
            const timeElement = commentContainer.querySelector('relative-time');
            const timestamp = timeElement?.getAttribute('datetime') || '';
            const date = this.formatCommentDate(timestamp);
            const bodyElement = commentContainer.querySelector('.markdown-body');
            if (!bodyElement)
                continue;
            const bodyContent = this.cleanBodyContent(bodyElement);
            if (!bodyContent)
                continue;
            commentData.push({
                author,
                date,
                content: bodyContent,
            });
        }
        return (0, comments_1.buildCommentTree)(commentData);
    }
    /**
     * Locates the element holding the PR description.
     *
     * @returns The first `[id^="pullrequest-"]` element, else the first
     * `.timeline-comment`, else `null`.
     */
    getPRBody() {
        // PR body is in [id^="pullrequest-"] or the first .timeline-comment
        return this.document.querySelector('[id^="pullrequest-"]')
            || this.document.querySelector('.timeline-comment');
    }
    /**
     * Reads the opening post of a pull-request page.
     *
     * @param prBody - Element returned by `getPRBody()`, or `null`.
     * @returns `{ content, author, published }`, each an empty string when the
     * corresponding element is not found.
     */
    getPRContent(prBody) {
        const bodyEl = prBody?.querySelector('.comment-body.markdown-body')
            || this.document.querySelector('.comment-body.markdown-body');
        const content = bodyEl ? this.cleanBodyContent(bodyEl) : '';
        const authorEl = prBody?.querySelector('.author')
            || this.document.querySelector('.gh-header-meta .author');
        const author = authorEl?.textContent?.trim() || '';
        const timeEl = prBody?.querySelector('relative-time');
        const published = timeEl?.getAttribute('datetime') || '';
        return { content, author, published };
    }
    /**
     * Collects pull-request comments (`.timeline-comment`) and code review
     * comments (`.review-comment`), skipping the PR description itself.
     *
     * @param prBody - Element returned by `getPRBody()`, or `null`.
     * @returns The result of `buildCommentTree` for the collected entries.
     */
    extractPRComments(prBody) {
        const allComments = Array.from(this.document.querySelectorAll('.timeline-comment, .review-comment'));
        const commentData = [];
        for (const comment of allComments) {
            // Skip the PR description
            if (prBody && (comment === prBody || prBody.contains(comment)))
                continue;
            const authorEl = comment.querySelector('.author');
            const author = authorEl?.textContent?.trim() || '';
            const timeEl = comment.querySelector('relative-time');
            const timestamp = timeEl?.getAttribute('datetime') || '';
            const date = this.formatCommentDate(timestamp);
            const bodyEl = comment.querySelector('.comment-body.markdown-body');
            if (!bodyEl)
                continue;
            const bodyContent = this.cleanBodyContent(bodyEl);
            if (!bodyContent)
                continue;
            commentData.push({
                author,
                date,
                content: bodyContent,
            });
        }
        return (0, comments_1.buildCommentTree)(commentData);
    }
    /**
     * Derives an author name from the first usable link in `container`.
     *
     * Selectors are tried in order. A root-relative `href` yields everything
     * after the leading slash; an `href` containing `github.com/` yields the
     * first path segment after the host.
     *
     * @param container - Element to search within.
     * @param selectors - Candidate selectors, most specific first.
     * @returns The derived name, or `'Unknown'` when no selector produces one.
     */
    extractAuthor(container, selectors) {
        for (const selector of selectors) {
            const href = container.querySelector(selector)?.getAttribute('href');
            if (!href)
                continue;
            if (href.startsWith('/')) {
                return href.substring(1);
            }
            if (href.includes('github.com/')) {
                const match = href.match(/github\.com\/([^\/\?#]+)/);
                if (match && match[1]) {
                    return match[1];
                }
            }
        }
        return 'Unknown';
    }
    /**
     * Converts a `datetime` attribute value into a `YYYY-MM-DD` string.
     *
     * `Date.prototype.toISOString` throws a RangeError for an invalid date, so
     * the parsed value is validated first; one malformed timestamp must not
     * abort the whole extraction.
     *
     * @param timestamp - Raw attribute value; may be empty.
     * @returns The date part of the ISO string, or `''` when the value is
     * empty or cannot be parsed.
     */
    formatCommentDate(timestamp) {
        if (!timestamp)
            return '';
        const parsed = new Date(timestamp);
        if (Number.isNaN(parsed.getTime()))
            return '';
        return parsed.toISOString().split('T')[0];
    }
    /**
     * Produces cleaned, serialized HTML for a comment or post body.
     *
     * Works on a deep clone, so the page itself is not modified. Buttons,
     * menu-like test-id elements and clipboard helpers are removed, and
     * highlighted code blocks are rewritten to `<pre><code>`.
     *
     * @param bodyElement - The body element to clean.
     * @returns Trimmed output of `serializeHTML` for the cleaned clone.
     */
    cleanBodyContent(bodyElement) {
        const cleanBody = bodyElement.cloneNode(true);
        cleanBody.querySelectorAll('button, [data-testid*="button"], [data-testid*="menu"]').forEach(el => el.remove());
        cleanBody.querySelectorAll('.js-clipboard-copy, .zeroclipboard-container').forEach(el => el.remove());
        // Convert GitHub's highlighted code blocks to standard <pre><code>
        // GitHub uses <div class="highlight highlight-source-{lang}"><pre>spans...</pre></div>
        // The <pre> has no <code> child, which breaks markdown conversion.
        cleanBody.querySelectorAll('div.highlight[class*="highlight-source-"] pre, div.highlight pre').forEach(pre => {
            // The selector matches <pre> at any depth, so resolve the actual
            // highlight wrapper rather than assuming it is the direct parent.
            const wrapper = pre.closest('div.highlight') || pre.parentElement;
            if (!wrapper)
                return;
            // Extract language from wrapper class (e.g. "highlight-source-ts")
            const langMatch = wrapper.className.match(/highlight-source-(\w+)/);
            const lang = langMatch?.[1] || '';
            // Use data-snippet-clipboard-copy-content if available (clean text),
            // otherwise fall back to textContent
            const content = wrapper.getAttribute('data-snippet-clipboard-copy-content')
                || pre.textContent || '';
            const code = this.document.createElement('code');
            if (lang) {
                code.setAttribute('class', `language-${lang}`);
                code.setAttribute('data-lang', lang);
            }
            code.textContent = content;
            const newPre = this.document.createElement('pre');
            newPre.appendChild(code);
            wrapper.replaceWith(newPre);
        });
        return (0, dom_1.serializeHTML)(cleanBody).trim();
    }
    /**
     * Determines the issue / PR number.
     *
     * @returns The number from the URL when present, otherwise the first
     * `#<digits>` found in the first `<h1>`, otherwise `''`.
     */
    extractNumber() {
        // Try URL first (most reliable)
        const urlMatch = this.url.match(/\/(issues|pull)\/(\d+)/);
        if (urlMatch)
            return urlMatch[2];
        // Fallback to HTML extraction
        const titleElement = this.document.querySelector('h1');
        const titleMatch = titleElement?.textContent?.match(/#(\d+)/);
        return titleMatch ? titleMatch[1] : '';
    }
    /**
     * Determines the repository owner and name.
     *
     * @returns `{ owner, repo }` from the URL when it matches, otherwise from
     * the first `owner/repo`-shaped token in the document title, otherwise
     * empty strings.
     */
    extractRepoInfo() {
        // Try URL first (most reliable). `?` and `#` are excluded so a query
        // string or fragment cannot leak into the captured names.
        const urlMatch = this.url.match(/github\.com\/([^\/?#]+)\/([^\/?#]+)/);
        if (urlMatch) {
            return { owner: urlMatch[1], repo: urlMatch[2] };
        }
        // Fallback to HTML extraction
        const titleMatch = this.document.title.match(/([^\/\s]+)\/([^\/\s]+)/);
        return titleMatch ? { owner: titleMatch[1], repo: titleMatch[2] } : { owner: '', repo: '' };
    }
    /**
     * Builds a plain-text description of at most 140 characters.
     *
     * Whitespace is collapsed before truncating so the character budget is
     * spent on text rather than on runs of whitespace.
     *
     * @param content - HTML string to summarise.
     * @returns The summary, or `''` when there is no content or no text.
     */
    createDescription(content) {
        if (!content)
            return '';
        const tempDiv = this.document.createElement('div');
        tempDiv.appendChild((0, dom_1.parseHTML)(this.document, content));
        const text = (tempDiv.textContent || '').replace(/\s+/g, ' ').trim();
        return text.slice(0, 140).trimEnd();
    }
}
|
|
250
|
+
exports.GitHubExtractor = GitHubExtractor;
|
|
251
|
+
//# sourceMappingURL=github.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"github.js","sourceRoot":"","sources":["../../src/extractors/github.ts"],"names":[],"mappings":";;;AAAA,mCAAwC;AAExC,sCAAwD;AACxD,gDAAyF;AAEzF,MAAa,eAAgB,SAAQ,qBAAa;IAIjD,YAAY,QAAkB,EAAE,GAAW;QAC1C,KAAK,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;QACrB,IAAI,CAAC,OAAO,GAAG,eAAe,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACzC,IAAI,CAAC,IAAI,GAAG,aAAa,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACrC,CAAC;IAED,UAAU;QACT,MAAM,gBAAgB,GAAG;YACxB,sDAAsD;YACtD,6BAA6B;YAC7B,wCAAwC;YACxC,oBAAoB;YACpB,yBAAyB;SACzB,CAAC;QAEF,IAAI,CAAC,gBAAgB,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,KAAK,IAAI,CAAC,EAAE,CAAC;YACxF,OAAO,KAAK,CAAC;QACd,CAAC;QAED,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YAClB,OAAO;gBACN,uCAAuC;gBACvC,6BAA6B;aAC7B,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,KAAK,IAAI,CAAC,CAAC;QACpE,CAAC;QAED,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;YACf,OAAO;gBACN,2BAA2B;gBAC3B,sBAAsB;gBACtB,kBAAkB;gBAClB,iBAAiB;aACjB,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,KAAK,IAAI,CAAC,CAAC;QACpE,CAAC;QAED,OAAO,KAAK,CAAC;IACd,CAAC;IAED,OAAO;QACN,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,EAAE,CAAC;QACxC,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;QACpC,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,CAAC;QAE1C,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;QACnD,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE,MAAM,EAAE,SAAS,EAAE,GAAG,IAAI,CAAC,IAAI;YAC5D,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC;YAC3B,CAAC,CAAC,IAAI,CAAC,eAAe,EAAE,CAAC;QAC1B,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,KAAK,KAAK;YACrD,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,iBAAiB,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,eAAe,EAAE,CAAC;YACvE,CAAC,CAAC,EAAE,CAAC;QACN,MAAM,WAAW,GAAG,IAAI,CAAC,iBAAiB,CAAC,WAAW,EAAE,QAAQ,CAAC,CAAC;QAElE,OAAO;YACN,OAAO,EAAE,WAAW;YACpB,WAAW,EAAE,WAAW;YACxB,gBAAgB,EAAE;gBACjB,IAAI;gBACJ,MAAM;gBACN,UAAU,EAAE,QAAQ,CAAC,IAAI;gBACzB,KAAK,EAAE,QAAQ,CAAC,KAAK;aACrB;YACD,SAAS,EAAE;gBACV,KAAK,EAAE,IAAI,CAAC,QAAQ,CAAC,
KAAK;gBAC1B,MAAM;gBACN,SAAS;gBACT,IAAI,EAAE,YAAY,QAAQ,CAAC,KAAK,IAAI,QAAQ,CAAC,IAAI,EAAE;gBACnD,WAAW,EAAE,IAAI,CAAC,iBAAiB,CAAC,WAAW,CAAC;aAChD;SACD,CAAC;IACH,CAAC;IAEO,iBAAiB,CAAC,WAAmB,EAAE,QAAgB;QAC9D,OAAO,IAAA,2BAAgB,EAAC,QAAQ,EAAE,WAAW,EAAE,QAAQ,CAAC,CAAC;IAC1D,CAAC;IAEO,eAAe;QACtB,MAAM,cAAc,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,8CAA8C,CAAC,CAAC;QACnG,IAAI,CAAC,cAAc;YAAE,OAAO,EAAE,OAAO,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,CAAC;QAEvE,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,cAAc,EAAE;YACjD,2CAA2C;YAC3C,uDAAuD;YACvD,2CAA2C;YAC3C,mDAAmD;YACnD,0BAA0B;SAC1B,CAAC,CAAC;QAEH,MAAM,gBAAgB,GAAG,cAAc,CAAC,aAAa,CAAC,eAAe,CAAC,CAAC;QACvE,MAAM,SAAS,GAAG,gBAAgB,EAAE,YAAY,CAAC,UAAU,CAAC,IAAI,EAAE,CAAC;QAEnE,MAAM,gBAAgB,GAAG,cAAc,CAAC,aAAa,CAAC,kDAAkD,CAAC,CAAC;QAC1G,IAAI,CAAC,gBAAgB;YAAE,OAAO,EAAE,OAAO,EAAE,EAAE,EAAE,MAAM,EAAE,SAAS,EAAE,CAAC;QAEjE,MAAM,OAAO,GAAG,IAAI,CAAC,gBAAgB,CAAC,gBAAgB,CAAC,CAAC;QAExD,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,CAAC;IACvC,CAAC;IAEO,eAAe;QACtB,MAAM,eAAe,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,4BAA4B,CAAC,CAAC,CAAC;QACjG,MAAM,iBAAiB,GAAG,IAAI,GAAG,EAAU,CAAC;QAC5C,MAAM,WAAW,GAAkB,EAAE,CAAC;QAEtC,KAAK,MAAM,cAAc,IAAI,eAAe,EAAE,CAAC;YAC9C,MAAM,gBAAgB,GAAG,cAAc,CAAC,aAAa,CAAC,sBAAsB,CAAC,CAAC;YAC9E,IAAI,CAAC,gBAAgB;gBAAE,SAAS;YAEhC,MAAM,SAAS,GAAG,cAAc,CAAC,YAAY,CAAC,0BAA0B,CAAC,CAAC;YAC1E,IAAI,CAAC,SAAS,IAAI,iBAAiB,CAAC,GAAG,CAAC,SAAS,CAAC;gBAAE,SAAS;YAC7D,iBAAiB,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;YAEjC,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,gBAAgB,EAAE;gBACnD,2CAA2C;gBAC3C,8BAA8B;gBAC9B,6CAA6C;aAC7C,CAAC,CAAC;YAEH,MAAM,WAAW,GAAG,gBAAgB,CAAC,aAAa,CAAC,eAAe,CAAC,CAAC;YACpE,MAAM,SAAS,GAAG,WAAW,EAAE,YAAY,CAAC,UAAU,CAAC,IAAI,EAAE,CAAC;YAC9D,MAAM,IAAI,GAAG,SAAS,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC,SAAS,CAAC,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAE9E,MAAM,WAAW,GAAG,gBAAgB,CAAC,aAAa,CAAC,gBAAgB,CAAC,CAAC;YACrE,IAAI,CAAC,WAAW;gBAAE,SAAS;YAE3B,MAAM,WAAW,GAAG,IAAI,CAAC,gBAAgB,CAAC,WAAW,CAAC,CAAC;YACvD,IAAI,C
AAC,WAAW;gBAAE,SAAS;YAE3B,WAAW,CAAC,IAAI,CAAC;gBAChB,MAAM;gBACN,IAAI;gBACJ,OAAO,EAAE,WAAW;aACpB,CAAC,CAAC;QACJ,CAAC;QAED,OAAO,IAAA,2BAAgB,EAAC,WAAW,CAAC,CAAC;IACtC,CAAC;IAEO,SAAS;QAChB,oEAAoE;QACpE,OAAO,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,sBAAsB,CAAC;eACtD,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,mBAAmB,CAAC,CAAC;IACtD,CAAC;IAEO,YAAY,CAAC,MAAsB;QAE1C,MAAM,MAAM,GAAG,MAAM,EAAE,aAAa,CAAC,6BAA6B,CAAC;eAC/D,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,6BAA6B,CAAC,CAAC;QAC/D,MAAM,OAAO,GAAG,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAE5D,MAAM,QAAQ,GAAG,MAAM,EAAE,aAAa,CAAC,SAAS,CAAC;eAC7C,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,yBAAyB,CAAC,CAAC;QAC3D,MAAM,MAAM,GAAG,QAAQ,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAEnD,MAAM,MAAM,GAAG,MAAM,EAAE,aAAa,CAAC,eAAe,CAAC,CAAC;QACtD,MAAM,SAAS,GAAG,MAAM,EAAE,YAAY,CAAC,UAAU,CAAC,IAAI,EAAE,CAAC;QAEzD,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,CAAC;IACvC,CAAC;IAEO,iBAAiB,CAAC,MAAsB;QAC/C,oEAAoE;QACpE,6CAA6C;QAC7C,MAAM,WAAW,GAAG,KAAK,CAAC,IAAI,CAC7B,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,oCAAoC,CAAC,CACpE,CAAC;QACF,MAAM,WAAW,GAAkB,EAAE,CAAC;QAEtC,KAAK,MAAM,OAAO,IAAI,WAAW,EAAE,CAAC;YACnC,0BAA0B;YAC1B,IAAI,MAAM,IAAI,CAAC,OAAO,KAAK,MAAM,IAAI,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;gBAAE,SAAS;YAEzE,MAAM,QAAQ,GAAG,OAAO,CAAC,aAAa,CAAC,SAAS,CAAC,CAAC;YAClD,MAAM,MAAM,GAAG,QAAQ,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAEnD,MAAM,MAAM,GAAG,OAAO,CAAC,aAAa,CAAC,eAAe,CAAC,CAAC;YACtD,MAAM,SAAS,GAAG,MAAM,EAAE,YAAY,CAAC,UAAU,CAAC,IAAI,EAAE,CAAC;YACzD,MAAM,IAAI,GAAG,SAAS,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC,SAAS,CAAC,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAE9E,MAAM,MAAM,GAAG,OAAO,CAAC,aAAa,CAAC,6BAA6B,CAAC,CAAC;YACpE,IAAI,CAAC,MAAM;gBAAE,SAAS;YAEtB,MAAM,WAAW,GAAG,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;YAClD,IAAI,CAAC,WAAW;gBAAE,SAAS;YAE3B,WAAW,CAAC,IAAI,CAAC;gBAChB,MAAM;gBACN,IAAI;gBACJ,OAAO,EAAE,WAAW;aACpB,CAAC,CAAC;QACJ,CAAC;QAED,OAAO,IAAA,2BAAgB,EAAC,WAAW,CAAC,CAAC;IACtC,CAAC;IAEO,aAAa,CAAC,SAAkB,EAAE,SAAmB;QAC5D,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CA
AC;YAClC,MAAM,UAAU,GAAG,SAAS,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;YACrD,IAAI,UAAU,EAAE,CAAC;gBAChB,MAAM,IAAI,GAAG,UAAU,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;gBAC7C,IAAI,IAAI,EAAE,CAAC;oBACV,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;wBAC1B,OAAO,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC;oBAC1B,CAAC;yBAAM,IAAI,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;wBACzC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,0BAA0B,CAAC,CAAC;wBACrD,IAAI,KAAK,IAAI,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC;4BACvB,OAAO,KAAK,CAAC,CAAC,CAAC,CAAC;wBACjB,CAAC;oBACF,CAAC;gBACF,CAAC;YACF,CAAC;QACF,CAAC;QACD,OAAO,SAAS,CAAC;IAClB,CAAC;IAEO,gBAAgB,CAAC,WAAoB;QAC5C,MAAM,SAAS,GAAG,WAAW,CAAC,SAAS,CAAC,IAAI,CAAY,CAAC;QACzD,SAAS,CAAC,gBAAgB,CAAC,wDAAwD,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;QAChH,SAAS,CAAC,gBAAgB,CAAC,8CAA8C,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;QAEtG,mEAAmE;QACnE,uFAAuF;QACvF,mEAAmE;QACnE,SAAS,CAAC,gBAAgB,CAAC,kEAAkE,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE;YAC5G,MAAM,OAAO,GAAG,GAAG,CAAC,aAAa,CAAC;YAClC,IAAI,CAAC,OAAO;gBAAE,OAAO;YAErB,mEAAmE;YACnE,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC,KAAK,CAAC,wBAAwB,CAAC,CAAC;YACpE,MAAM,IAAI,GAAG,SAAS,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAElC,qEAAqE;YACrE,qCAAqC;YACrC,MAAM,OAAO,GAAG,OAAO,CAAC,YAAY,CAAC,qCAAqC,CAAC;mBACvE,GAAG,CAAC,WAAW,IAAI,EAAE,CAAC;YAE1B,MAAM,IAAI,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;YACjD,IAAI,IAAI,EAAE,CAAC;gBACV,IAAI,CAAC,YAAY,CAAC,OAAO,EAAE,YAAY,IAAI,EAAE,CAAC,CAAC;gBAC/C,IAAI,CAAC,YAAY,CAAC,WAAW,EAAE,IAAI,CAAC,CAAC;YACtC,CAAC;YACD,IAAI,CAAC,WAAW,GAAG,OAAO,CAAC;YAE3B,MAAM,MAAM,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;YAClD,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;YACzB,OAAO,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;QAC7B,CAAC,CAAC,CAAC;QAEH,OAAO,IAAA,mBAAa,EAAC,SAAS,CAAC,CAAC,IAAI,EAAE,CAAC;IACxC,CAAC;IAEO,aAAa;QACpB,gCAAgC;QAChC,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,wBAAwB,CAAC,CAAC;QAC1D,IAAI,QAAQ;YAAE,OAAO,QAAQ,CAAC,CAAC,CAAC,CAAC;QAEjC,8BAA8B;QAC9B,MAAM,YAAY,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC;
QACvD,MAAM,UAAU,GAAG,YAAY,EAAE,WAAW,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC;QAC9D,OAAO,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IACxC,CAAC;IAEO,eAAe;QACtB,gCAAgC;QAChC,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,iCAAiC,CAAC,CAAC;QACnE,IAAI,QAAQ,EAAE,CAAC;YACd,OAAO,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC;QAClD,CAAC;QAED,8BAA8B;QAC9B,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,KAAK,CAAC,wBAAwB,CAAC,CAAC;QACvE,OAAO,UAAU,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,UAAU,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,UAAU,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,EAAE,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC;IAC7F,CAAC;IAGO,iBAAiB,CAAC,OAAe;QACxC,IAAI,CAAC,OAAO;YAAE,OAAO,EAAE,CAAC;QAExB,MAAM,OAAO,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;QACnD,OAAO,CAAC,WAAW,CAAC,IAAA,eAAS,EAAC,IAAI,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC;QACvD,OAAO,OAAO,CAAC,WAAW,EAAE,IAAI,EAAE;aAChC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC;aACb,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,IAAI,EAAE,CAAC;IAC9B,CAAC;CACD;AA/RD,0CA+RC"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import { ConversationExtractor } from './_conversation';
|
|
2
|
+
import { ConversationMessage, ConversationMetadata, Footnote } from '../types/extractors';
|
|
3
|
+
/** Conversation extractor for Grok pages; type declarations for the compiled `grok.js`. */
export declare class GrokExtractor extends ConversationExtractor {
|
|
4
|
+
/** CSS selector string used to locate message containers in the document. */
private messageContainerSelector;
|
|
5
|
+
/** Result of `document.querySelectorAll(messageContainerSelector)`, captured in the constructor. */
private messageBubbles;
|
|
6
|
+
/** Footnotes collected while processing links; reset at the start of `extractMessages()`. */
private footnotes;
|
|
7
|
+
/** Running count used to number newly created footnotes; reset at the start of `extractMessages()`. */
private footnoteCounter;
|
|
8
|
+
/** Passes `document` and `url` to the base class, then queries the message containers. */
constructor(document: Document, url: string);
|
|
9
|
+
/** Returns `true` when at least one message container was found. */
canExtract(): boolean;
|
|
10
|
+
/** Builds the list of user and Grok messages from the captured containers. */
protected extractMessages(): ConversationMessage[];
|
|
11
|
+
/** Returns the footnotes collected so far. */
protected getFootnotes(): Footnote[];
|
|
12
|
+
/** Returns title, site (`'Grok'`), url, message count and a description string. */
protected getMetadata(): ConversationMetadata;
|
|
13
|
+
/** Resolves the conversation title (page title, first user message, or a default). */
private getTitle;
|
|
14
|
+
/** Rewrites http(s) links in an HTML string into numbered footnote references. */
private processFootnotes;
|
|
15
|
+
}
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.GrokExtractor = void 0;
|
|
4
|
+
const _conversation_1 = require("./_conversation");
|
|
5
|
+
const dom_1 = require("../utils/dom");
|
|
6
|
+
/**
 * Conversation extractor for Grok pages.
 *
 * Message containers are located once, in the constructor, using a selector
 * made of CSS utility classes. Grok replies are serialized to HTML and their
 * http(s) links are turned into numbered footnote references.
 */
class GrokExtractor extends _conversation_1.ConversationExtractor {
    /**
     * @param document - Document of the page being extracted.
     * @param url - URL of the page.
     */
    constructor(document, url) {
        super(document, url);
        // Note: This selector relies heavily on CSS utility classes and may break if Grok's UI changes.
        this.messageContainerSelector = '.relative.group.flex.flex-col.justify-center.w-full';
        this.messageBubbles = document.querySelectorAll(this.messageContainerSelector);
        this.footnotes = [];
        this.footnoteCounter = 0;
    }
    /**
     * @returns `true` when at least one message container was found.
     */
    canExtract() {
        return !!this.messageBubbles && this.messageBubbles.length > 0;
    }
    /**
     * Builds the message list from the captured containers.
     *
     * Footnote state is reset on every call. Containers that carry neither
     * `items-end` (user) nor `items-start` (Grok), or that lack a
     * `.message-bubble`, are skipped, as are messages whose content is blank.
     *
     * @returns Array of `{ author, content, metadata: { role } }` objects.
     */
    extractMessages() {
        const messages = [];
        this.footnotes = [];
        this.footnoteCounter = 0;
        if (!this.messageBubbles || this.messageBubbles.length === 0)
            return messages;
        this.messageBubbles.forEach((container) => {
            // Note: Relies on layout classes 'items-end' and 'items-start' which might change.
            const isUserMessage = container.classList.contains('items-end');
            const isGrokMessage = container.classList.contains('items-start');
            if (!isUserMessage && !isGrokMessage)
                return; // Skip elements that aren't clearly user or Grok messages
            const messageBubble = container.querySelector('.message-bubble');
            if (!messageBubble)
                return; // Skip if the core message bubble isn't found
            let content;
            let role;
            let author;
            if (isUserMessage) {
                // The bubble's plain text is used as-is for user messages.
                content = messageBubble.textContent || '';
                role = 'user';
                author = 'You';
            }
            else {
                role = 'assistant';
                author = 'Grok';
                // Clone the bubble to modify it without affecting the original page
                const clonedBubble = messageBubble.cloneNode(true);
                // Remove known non-content elements like the DeepSearch artifact
                clonedBubble.querySelector('.relative.border.border-border-l1.bg-surface-base')?.remove();
                // Serialize first, then convert links in the HTML into footnote references.
                content = this.processFootnotes((0, dom_1.serializeHTML)(clonedBubble));
            }
            const trimmed = content.trim();
            if (trimmed) {
                messages.push({
                    author: author,
                    content: trimmed,
                    metadata: {
                        role: role
                    }
                });
            }
        });
        return messages;
    }
    /**
     * @returns The footnotes collected by the most recent `extractMessages()` run.
     */
    getFootnotes() {
        return this.footnotes;
    }
    /**
     * @returns Metadata with title, site, url, `messageCount` and description.
     * The count is the number of matched containers, i.e. an estimate that
     * also includes containers `extractMessages()` would skip.
     */
    getMetadata() {
        const title = this.getTitle();
        const messageCount = this.messageBubbles?.length || 0;
        return {
            title,
            site: 'Grok',
            url: this.url,
            messageCount: messageCount, // Use estimated count
            description: `Grok conversation with ${messageCount} messages`
        };
    }
    /**
     * Resolves a title for the conversation.
     *
     * @returns The page title without a trailing ` - Grok` (unless the title is
     * just `Grok` or starts with `Grok by `); otherwise the first user message
     * truncated to 50 characters plus `...`; otherwise `'Grok Conversation'`.
     */
    getTitle() {
        // Try to get the page title first (more reliable)
        const pageTitle = this.document.title?.trim();
        if (pageTitle && pageTitle !== 'Grok' && !pageTitle.startsWith('Grok by ')) {
            // Remove ' - Grok' suffix if present
            return pageTitle.replace(/\s-\s*Grok$/, '').trim();
        }
        // Fallback: Find the first user message bubble and use its text content
        // Note: Still relies on 'items-end' class.
        const firstUserContainer = this.document.querySelector(`${this.messageContainerSelector}.items-end`);
        const messageBubble = firstUserContainer?.querySelector('.message-bubble');
        if (messageBubble) {
            const text = messageBubble.textContent?.trim() || '';
            // Truncate to first 50 characters if longer
            return text.length > 50 ? text.slice(0, 50) + '...' : text;
        }
        return 'Grok Conversation'; // Default fallback
    }
    /**
     * Replaces http(s) anchors in an HTML string with their inner HTML followed
     * by a numbered footnote reference, recording each distinct URL once in
     * `this.footnotes`.
     *
     * Anchors with an empty, fragment-only, or non-http(s) `href` are left
     * untouched.
     *
     * @param content - Serialized HTML of a Grok message.
     * @returns The HTML with matching anchors rewritten.
     */
    processFootnotes(content) {
        // Capture href and inner HTML. `[\s\S]*?` (rather than `.*?`) lets the
        // inner HTML span line breaks, so multi-line anchors are not skipped.
        // NOTE(review): the captured href is taken verbatim from serialized
        // HTML, so entities such as `&amp;` are presumably still encoded in
        // the stored footnote url — confirm against consumers.
        const linkPattern = /<a\s+(?:[^>]*?\s+)?href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi;
        return content.replace(linkPattern, (match, url, linkText) => {
            // Skip processing for internal anchor links, empty URLs, or non-http(s) protocols
            if (!url || url.startsWith('#') || !url.match(/^https?:\/\//i)) {
                return match;
            }
            // Single lookup: 1-based index of an existing footnote, or 0 when absent.
            let footnoteIndex = this.footnotes.findIndex(fn => fn.url === url) + 1;
            if (footnoteIndex === 0) {
                this.footnoteCounter++;
                footnoteIndex = this.footnoteCounter;
                let domainText;
                try {
                    const domain = new URL(url).hostname.replace(/^www\./, '');
                    domainText = `<a href="${url}" target="_blank" rel="noopener noreferrer">${domain}</a>`;
                }
                catch {
                    // Fall back to showing the full URL when it cannot be parsed.
                    domainText = `<a href="${url}" target="_blank" rel="noopener noreferrer">${url}</a>`;
                    console.warn(`GrokExtractor: Could not parse URL for footnote: ${url}`);
                }
                this.footnotes.push({
                    url,
                    text: domainText // Store the link HTML directly
                });
            }
            // Keep the original link text (no longer clickable) and append the reference.
            return `${linkText}<sup id="fnref:${footnoteIndex}" class="footnote-ref"><a href="#fn:${footnoteIndex}" class="footnote-link">${footnoteIndex}</a></sup>`;
        });
    }
}
|
|
141
|
+
exports.GrokExtractor = GrokExtractor;
|
|
142
|
+
//# sourceMappingURL=grok.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"grok.js","sourceRoot":"","sources":["../../src/extractors/grok.ts"],"names":[],"mappings":";;;AAAA,mDAAwD;AAExD,sCAA6C;AAE7C,MAAa,aAAc,SAAQ,qCAAqB;IAOvD,YAAY,QAAkB,EAAE,GAAW;QAC1C,KAAK,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;QAPtB,gGAAgG;QACxF,6BAAwB,GAAG,qDAAqD,CAAC;QAOxF,IAAI,CAAC,cAAc,GAAG,QAAQ,CAAC,gBAAgB,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAC;QAC/E,IAAI,CAAC,SAAS,GAAG,EAAE,CAAC;QACpB,IAAI,CAAC,eAAe,GAAG,CAAC,CAAC;IAC1B,CAAC;IAED,UAAU;QACT,OAAO,CAAC,CAAC,IAAI,CAAC,cAAc,IAAI,IAAI,CAAC,cAAc,CAAC,MAAM,GAAG,CAAC,CAAC;IAChE,CAAC;IAES,eAAe;QACxB,MAAM,QAAQ,GAA0B,EAAE,CAAC;QAC3C,IAAI,CAAC,SAAS,GAAG,EAAE,CAAC;QACpB,IAAI,CAAC,eAAe,GAAG,CAAC,CAAC;QAEzB,IAAI,CAAC,IAAI,CAAC,cAAc,IAAI,IAAI,CAAC,cAAc,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,QAAQ,CAAC;QAE9E,IAAI,CAAC,cAAc,CAAC,OAAO,CAAC,CAAC,SAAS,EAAE,EAAE;YACzC,mFAAmF;YACnF,MAAM,aAAa,GAAG,SAAS,CAAC,SAAS,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC;YAChE,MAAM,aAAa,GAAG,SAAS,CAAC,SAAS,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC;YAElE,IAAI,CAAC,aAAa,IAAI,CAAC,aAAa;gBAAE,OAAO,CAAC,0DAA0D;YAExG,MAAM,aAAa,GAAG,SAAS,CAAC,aAAa,CAAC,iBAAiB,CAAC,CAAC;YACjE,IAAI,CAAC,aAAa;gBAAE,OAAO,CAAC,8CAA8C;YAE1E,IAAI,OAAO,GAAW,EAAE,CAAC;YACzB,IAAI,IAAI,GAAW,EAAE,CAAC;YACtB,IAAI,MAAM,GAAW,EAAE,CAAC;YAExB,IAAI,aAAa,EAAE,CAAC;gBACnB,mEAAmE;gBACnE,8EAA8E;gBAC9E,OAAO,GAAG,aAAa,CAAC,WAAW,IAAI,EAAE,CAAC;gBAC1C,IAAI,GAAG,MAAM,CAAC;gBACd,MAAM,GAAG,KAAK,CAAC,CAAC,8DAA8D;YAC/E,CAAC;iBAAM,IAAI,aAAa,EAAE,CAAC;gBAC1B,IAAI,GAAG,WAAW,CAAC;gBACnB,MAAM,GAAG,MAAM,CAAC,CAAC,8DAA8D;gBAE/E,oEAAoE;gBACpE,MAAM,YAAY,GAAG,aAAa,CAAC,SAAS,CAAC,IAAI,CAAY,CAAC;gBAE9D,iEAAiE;gBACjE,YAAY,CAAC,aAAa,CAAC,mDAAmD,CAAC,EAAE,MAAM,EAAE,CAAC;gBAC1F,wGAAwG;gBAExG,OAAO,GAAG,IAAA,mBAAa,EAAC,YAAY,CAAC,CAAC;gBAEtC,iDAAiD;gBACjD,OAAO,GAAG,IAAI,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC;YAC1C,CAAC;YAED,IAAI,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;gBACpB,QAAQ,CAAC,IAAI,CAAC;oBACb,MAAM,EAAE,MAAM;oBACd,OAAO,EAAE,OAAO,CAAC,IAAI,EAAE;oBACvB,QAAQ,EAAE;wBACT,IAAI,EAAE,IAAI;qBACV;iBACD,CAAC,CAAC;YACJ,CAAC;QACF,CAAC,CAAC,CAAC;QAEH,OAAO,QAAQ,CAAC;IACjB,CAAC;IAE
S,YAAY;QACrB,OAAO,IAAI,CAAC,SAAS,CAAC;IACvB,CAAC;IAES,WAAW;QACpB,MAAM,KAAK,GAAG,IAAI,CAAC,QAAQ,EAAE,CAAC;QAC9B,MAAM,YAAY,GAAG,IAAI,CAAC,cAAc,EAAE,MAAM,IAAI,CAAC,CAAC;QAEtD,OAAO;YACN,KAAK;YACL,IAAI,EAAE,MAAM;YACZ,GAAG,EAAE,IAAI,CAAC,GAAG;YACb,YAAY,EAAE,YAAY,EAAE,sBAAsB;YAClD,WAAW,EAAE,0BAA0B,YAAY,WAAW;SAC9D,CAAC;IACH,CAAC;IAEO,QAAQ;QACf,kDAAkD;QAClD,MAAM,SAAS,GAAG,IAAI,CAAC,QAAQ,CAAC,KAAK,EAAE,IAAI,EAAE,CAAC;QAC9C,IAAI,SAAS,IAAI,SAAS,KAAK,MAAM,IAAI,CAAC,SAAS,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;YAC5E,qCAAqC;YACrC,OAAO,SAAS,CAAC,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACpD,CAAC;QAED,wEAAwE;QACxE,2CAA2C;QAC3C,MAAM,kBAAkB,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,GAAG,IAAI,CAAC,wBAAwB,YAAY,CAAC,CAAC;QACrG,IAAI,kBAAkB,EAAE,CAAC;YACxB,MAAM,aAAa,GAAG,kBAAkB,CAAC,aAAa,CAAC,iBAAiB,CAAC,CAAC;YAC1E,IAAI,aAAa,EAAE,CAAC;gBACnB,MAAM,IAAI,GAAG,aAAa,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;gBACrD,4CAA4C;gBAC5C,OAAO,IAAI,CAAC,MAAM,GAAG,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC;YAC5D,CAAC;QACF,CAAC;QAED,OAAO,mBAAmB,CAAC,CAAC,mBAAmB;IAChD,CAAC;IAEO,gBAAgB,CAAC,OAAe;QACvC,qDAAqD;QACrD,MAAM,WAAW,GAAG,qDAAqD,CAAC,CAAC,wBAAwB;QAEnG,OAAO,OAAO,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,KAAK,EAAE,GAAG,EAAE,QAAQ,EAAE,EAAE;YAC3D,kFAAkF;YACnF,IAAI,CAAC,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,eAAe,CAAC,EAAE,CAAC;gBAChE,OAAO,KAAK,CAAC;YACd,CAAC;YAED,oDAAoD;YACpD,IAAI,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,KAAK,GAAG,CAAC,CAAC;YACzD,IAAI,aAAqB,CAAC;YAE1B,IAAI,CAAC,QAAQ,EAAE,CAAC;gBACf,6CAA6C;gBAC7C,IAAI,CAAC,eAAe,EAAE,CAAC;gBACvB,aAAa,GAAG,IAAI,CAAC,eAAe,CAAC;gBAErC,IAAI,UAAU,GAAG,GAAG,CAAC,CAAC,uCAAuC;gBAC7D,IAAI,CAAC;oBACJ,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;oBAC3D,UAAU,GAAG,YAAY,GAAG,+CAA+C,MAAM,MAAM,CAAC;gBACzF,CAAC;gBAAC,OAAO,CAAC,EAAE,CAAC;oBACZ,uDAAuD;oBACvD,UAAU,GAAG,YAAY,GAAG,+CAA+C,GAAG,MAAM,CAAC;oBACrF,OAAO,CAAC,IAAI,CAAC,oDAAoD,
GAAG,EAAE,CAAC,CAAC;gBACzE,CAAC;gBAED,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC;oBACnB,GAAG;oBACH,IAAI,EAAE,UAAU,CAAC,+BAA+B;iBAChD,CAAC,CAAC;YACJ,CAAC;iBAAM,CAAC;gBACP,kDAAkD;gBAClD,aAAa,GAAG,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,KAAK,GAAG,CAAC,GAAG,CAAC,CAAC;YACpE,CAAC;YAED,kEAAkE;YAClE,0FAA0F;YAC1F,OAAO,GAAG,QAAQ,kBAAkB,aAAa,uCAAuC,aAAa,2BAA2B,aAAa,YAAY,CAAC;QAC3J,CAAC,CAAC,CAAC;IACJ,CAAC;CACD;AA/JD,sCA+JC"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { BaseExtractor } from './_base';
|
|
2
|
+
import { ExtractorResult } from '../types/extractors';
|
|
3
|
+
/**
 * Extractor for Hacker News item pages. Handles both story pages and
 * single-comment pages.
 */
export declare class HackerNewsExtractor extends BaseExtractor {
    constructor(document: Document, url: string);

    // ---- Public API -------------------------------------------------------

    /** True when the page contains the main item element. */
    canExtract(): boolean;
    /** Builds the content HTML and metadata variables for the page. */
    extract(): ExtractorResult;

    // ---- Internal state ---------------------------------------------------

    private mainPost;
    private isCommentPage;
    private mainComment;

    // ---- Page detection ---------------------------------------------------

    private detectCommentPage;
    private findMainComment;

    // ---- Content assembly -------------------------------------------------

    private createContentHtml;
    private getPostContent;
    private extractComments;
    private processComments;

    // ---- Metadata helpers -------------------------------------------------

    private getPostId;
    private getPostTitle;
    private getPostAuthor;
    private createDescription;
    private getPostDate;
}
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.HackerNewsExtractor = void 0;
|
|
4
|
+
const _base_1 = require("./_base");
|
|
5
|
+
const dom_1 = require("../utils/dom");
|
|
6
|
+
const comments_1 = require("../utils/comments");
|
|
7
|
+
/**
 * Escapes the characters that are significant in HTML text and in
 * double-quoted attribute values.
 */
function escapeHtml(value) {
    return value
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;');
}
/**
 * Reads the `title` attribute of an `.age` element and returns the part
 * before the first 'T' (the date portion of the timestamp), or '' when the
 * element or attribute is missing.
 */
function readItemDate(ageElement) {
    const timestamp = ageElement?.getAttribute('title') || '';
    return timestamp.split('T')[0] || '';
}
/**
 * Extracts the main item and its comment thread from a Hacker News page.
 *
 * The main item is the `.fatitem` element. A page is treated as a
 * single-comment page when that element has an `.onstory` link but no
 * `.titleline`; in that case the comment itself becomes the main content.
 */
class HackerNewsExtractor extends _base_1.BaseExtractor {
    /**
     * @param document The parsed page.
     * @param url The page URL (used to derive the post id).
     */
    constructor(document, url) {
        super(document, url);
        this.mainPost = document.querySelector('.fatitem');
        this.isCommentPage = this.detectCommentPage();
        this.mainComment = this.isCommentPage ? this.findMainComment() : null;
    }
    /** Returns true when the main item looks like a comment rather than a story. */
    detectCommentPage() {
        // Comment pages have an "on: <story title>" link but no story title row
        return !!this.mainPost?.querySelector('.onstory') && !this.mainPost?.querySelector('.titleline');
    }
    /** Returns the row holding the main comment, or null when absent. */
    findMainComment() {
        // Use the tr.athing row which contains both the comment metadata (.comhead)
        // and the comment text (.commtext). The .comment div alone doesn't include
        // the author (.hnuser) or timestamp (.age) which are in the sibling .comhead.
        return this.mainPost?.querySelector('tr.athing') || null;
    }
    /** True when the page contains a `.fatitem` element. */
    canExtract() {
        return !!this.mainPost;
    }
    /**
     * Builds the extraction result: content HTML (post plus, unless
     * `options.includeReplies` is explicitly false, the comment tree) and the
     * title/author/site/description/published variables.
     */
    extract() {
        const postContent = this.getPostContent();
        const comments = this.options.includeReplies !== false ? this.extractComments() : '';
        const contentHtml = this.createContentHtml(postContent, comments);
        const postTitle = this.getPostTitle();
        const postAuthor = this.getPostAuthor();
        const description = this.createDescription();
        const published = this.getPostDate();
        return {
            content: contentHtml,
            contentHtml: contentHtml,
            extractedContent: {
                postId: this.getPostId(),
                postAuthor,
            },
            variables: {
                title: postTitle,
                author: postAuthor,
                site: 'Hacker News',
                description,
                published,
            }
        };
    }
    /** Combines post and comments HTML via the shared `buildContentHtml` helper. */
    createContentHtml(postContent, comments) {
        return (0, comments_1.buildContentHtml)('hackernews', postContent, comments);
    }
    /**
     * Returns the HTML for the main item: the rendered comment on comment
     * pages, otherwise the story link followed by any `.toptext` body.
     */
    getPostContent() {
        if (!this.mainPost)
            return '';
        // If this is a comment page, use the comment as the main content
        if (this.isCommentPage && this.mainComment) {
            const author = this.mainComment.querySelector('.hnuser')?.textContent || '[deleted]';
            const commtext = this.mainComment.querySelector('.commtext');
            const commentText = commtext ? (0, dom_1.serializeHTML)(commtext) : '';
            const date = readItemDate(this.mainComment.querySelector('.age'));
            const points = this.mainComment.querySelector('.score')?.textContent?.trim() || '';
            return (0, comments_1.buildComment)({
                author,
                date,
                content: commentText,
                score: points || undefined,
            });
        }
        // Otherwise handle regular post content
        const titleRow = this.mainPost.querySelector('tr.athing');
        const url = titleRow?.querySelector('.titleline a')?.getAttribute('href') || '';
        let content = '';
        if (url) {
            // getAttribute returns the decoded value, so re-escape it before
            // placing it back into markup (a '"' or '<' would otherwise break out).
            const safeUrl = escapeHtml(url);
            content += `<p><a href="${safeUrl}" target="_blank">${safeUrl}</a></p>`;
        }
        const text = this.mainPost.querySelector('.toptext');
        if (text) {
            content += `<div class="post-text">${(0, dom_1.serializeHTML)(text)}</div>`;
        }
        return content;
    }
    /** Collects every `tr.comtr` row in the document and renders the thread. */
    extractComments() {
        const comments = Array.from(this.document.querySelectorAll('tr.comtr'));
        return this.processComments(comments);
    }
    /**
     * Converts comment rows into comment records and renders them with
     * `buildCommentTree`. Rows without an id, with an id already seen, or
     * without a `.commtext` element are skipped.
     *
     * @param comments Comment row elements.
     */
    processComments(comments) {
        const commentData = [];
        const processedIds = new Set();
        for (const comment of comments) {
            const id = comment.getAttribute('id');
            if (!id || processedIds.has(id))
                continue;
            processedIds.add(id);
            const commentText = comment.querySelector('.commtext');
            if (!commentText)
                continue;
            // Nesting depth is encoded as the indent image width, 40 units per level.
            // A missing or non-numeric width is treated as top level instead of NaN.
            const indentWidth = Number.parseInt(comment.querySelector('.ind img')?.getAttribute('width') || '0', 10);
            const depth = Number.isNaN(indentWidth) ? 0 : indentWidth / 40;
            const author = comment.querySelector('.hnuser')?.textContent || '[deleted]';
            const points = comment.querySelector('.score')?.textContent?.trim() || '';
            commentData.push({
                author,
                date: readItemDate(comment.querySelector('.age')),
                content: (0, dom_1.serializeHTML)(commentText),
                depth,
                score: points || undefined,
                url: `https://news.ycombinator.com/item?id=${id}`,
            });
        }
        return (0, comments_1.buildCommentTree)(commentData);
    }
    /** Returns the numeric `id=` value from the page URL, or '' when absent. */
    getPostId() {
        const match = this.url.match(/id=(\d+)/);
        return match?.[1] || '';
    }
    /**
     * Returns the story title, or on comment pages
     * `Comment by <author>: <first 50 characters of the comment>`.
     */
    getPostTitle() {
        if (this.isCommentPage && this.mainComment) {
            const author = this.mainComment.querySelector('.hnuser')?.textContent || '[deleted]';
            const commentText = (this.mainComment.querySelector('.commtext')?.textContent || '').trim();
            // Use first 50 characters of comment as title. The ellipsis decision is
            // made on the trimmed text so surrounding whitespace cannot trigger it.
            const preview = commentText.length > 50 ? `${commentText.slice(0, 50)}...` : commentText;
            return `Comment by ${author}: ${preview}`;
        }
        return this.mainPost?.querySelector('.titleline')?.textContent?.trim() || '';
    }
    /** Returns the trimmed text of the first `.hnuser` in the main item, or ''. */
    getPostAuthor() {
        return this.mainPost?.querySelector('.hnuser')?.textContent?.trim() || '';
    }
    /** Builds the one-line description used in the result variables. */
    createDescription() {
        const author = this.getPostAuthor();
        if (this.isCommentPage) {
            return `Comment by ${author} on Hacker News`;
        }
        return `${this.getPostTitle()} - by ${author} on Hacker News`;
    }
    /** Returns the date portion of the main item's `.age` title attribute, or ''. */
    getPostDate() {
        if (!this.mainPost)
            return '';
        return readItemDate(this.mainPost.querySelector('.age'));
    }
}
|
|
154
|
+
exports.HackerNewsExtractor = HackerNewsExtractor;
|
|
155
|
+
//# sourceMappingURL=hackernews.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"hackernews.js","sourceRoot":"","sources":["../../src/extractors/hackernews.ts"],"names":[],"mappings":";;;AAAA,mCAAwC;AAExC,sCAA6C;AAC7C,gDAAkG;AAElG,MAAa,mBAAoB,SAAQ,qBAAa;IAKrD,YAAY,QAAkB,EAAE,GAAW;QAC1C,KAAK,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;QACrB,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC,aAAa,CAAC,UAAU,CAAC,CAAC;QACnD,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC,iBAAiB,EAAE,CAAC;QAC9C,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC,eAAe,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;IACvE,CAAC;IAEO,iBAAiB;QACxB,wEAAwE;QACxE,OAAO,CAAC,CAAC,IAAI,CAAC,QAAQ,EAAE,aAAa,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,aAAa,CAAC,YAAY,CAAC,CAAC;IAClG,CAAC;IAEO,eAAe;QACtB,4EAA4E;QAC5E,2EAA2E;QAC3E,8EAA8E;QAC9E,OAAO,IAAI,CAAC,QAAQ,EAAE,aAAa,CAAC,WAAW,CAAC,IAAI,IAAI,CAAC;IAC1D,CAAC;IAED,UAAU;QACT,OAAO,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC;IACxB,CAAC;IAED,OAAO;QACN,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,EAAE,CAAC;QAC1C,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,KAAK,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,eAAe,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAErF,MAAM,WAAW,GAAG,IAAI,CAAC,iBAAiB,CAAC,WAAW,EAAE,QAAQ,CAAC,CAAC;QAClE,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC;QACtC,MAAM,UAAU,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;QACxC,MAAM,WAAW,GAAG,IAAI,CAAC,iBAAiB,EAAE,CAAC;QAC7C,MAAM,SAAS,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC;QAErC,OAAO;YACN,OAAO,EAAE,WAAW;YACpB,WAAW,EAAE,WAAW;YACxB,gBAAgB,EAAE;gBACjB,MAAM,EAAE,IAAI,CAAC,SAAS,EAAE;gBACxB,UAAU;aACV;YACD,SAAS,EAAE;gBACV,KAAK,EAAE,SAAS;gBAChB,MAAM,EAAE,UAAU;gBAClB,IAAI,EAAE,aAAa;gBACnB,WAAW;gBACX,SAAS;aACT;SACD,CAAC;IACH,CAAC;IAEO,iBAAiB,CAAC,WAAmB,EAAE,QAAgB;QAC9D,OAAO,IAAA,2BAAgB,EAAC,YAAY,EAAE,WAAW,EAAE,QAAQ,CAAC,CAAC;IAC9D,CAAC;IAEO,cAAc;QACrB,IAAI,CAAC,IAAI,CAAC,QAAQ;YAAE,OAAO,EAAE,CAAC;QAE9B,iEAAiE;QACjE,IAAI,IAAI,CAAC,aAAa,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YAC5C,MAAM,MAAM,GAAG,IAAI,CAAC,WAAW,CAAC,aAAa,CAAC,SAAS,CAAC,EAAE,WAAW,IAAI,WAAW,CAAC;YACrF,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,aAAa,CAAC,WAAW,CAAC,CAAC;YAC7D,MAAM,WAAW,GAAG,QAAQ,CAAC,CAAC,CAAC,IAAA,mBAAa,EAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAC5D,MAAM,WAA
W,GAAG,IAAI,CAAC,WAAW,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;YAC3D,MAAM,SAAS,GAAG,WAAW,EAAE,YAAY,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;YAC3D,MAAM,IAAI,GAAG,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAC3C,MAAM,MAAM,GAAG,IAAI,CAAC,WAAW,CAAC,aAAa,CAAC,QAAQ,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAEnF,OAAO,IAAA,uBAAY,EAAC;gBACnB,MAAM;gBACN,IAAI;gBACJ,OAAO,EAAE,WAAW;gBACpB,KAAK,EAAE,MAAM,IAAI,SAAS;aAC1B,CAAC,CAAC;QACJ,CAAC;QAED,wCAAwC;QACxC,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,WAAW,CAAC,CAAC;QAC1D,MAAM,MAAM,GAAG,QAAQ,EAAE,kBAAkB,CAAC;QAC5C,MAAM,GAAG,GAAG,QAAQ,EAAE,aAAa,CAAC,cAAc,CAAC,EAAE,YAAY,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;QAEhF,IAAI,OAAO,GAAG,EAAE,CAAC;QACjB,IAAI,GAAG,EAAE,CAAC;YACT,OAAO,IAAI,eAAe,GAAG,qBAAqB,GAAG,UAAU,CAAC;QACjE,CAAC;QAED,MAAM,IAAI,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,UAAU,CAAC,CAAC;QACrD,IAAI,IAAI,EAAE,CAAC;YACV,OAAO,IAAI,0BAA0B,IAAA,mBAAa,EAAC,IAAI,CAAC,QAAQ,CAAC;QAClE,CAAC;QAED,OAAO,OAAO,CAAC;IAChB,CAAC;IAEO,eAAe;QACtB,MAAM,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,CAAC;QACxE,OAAO,IAAI,CAAC,eAAe,CAAC,QAAQ,CAAC,CAAC;IACvC,CAAC;IAEO,eAAe,CAAC,QAAmB;QAC1C,MAAM,WAAW,GAAkB,EAAE,CAAC;QACtC,MAAM,YAAY,GAAG,IAAI,GAAG,EAAU,CAAC;QAEvC,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAChC,MAAM,EAAE,GAAG,OAAO,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;YACtC,IAAI,CAAC,EAAE,IAAI,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC;gBAAE,SAAS;YAC1C,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAErB,MAAM,MAAM,GAAG,OAAO,CAAC,aAAa,CAAC,UAAU,CAAC,EAAE,YAAY,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC;YAC/E,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC,GAAG,EAAE,CAAC;YACpC,MAAM,WAAW,GAAG,OAAO,CAAC,aAAa,CAAC,WAAW,CAAC,CAAC;YACvD,MAAM,MAAM,GAAG,OAAO,CAAC,aAAa,CAAC,SAAS,CAAC,EAAE,WAAW,IAAI,WAAW,CAAC;YAC5E,MAAM,WAAW,GAAG,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;YAClD,MAAM,MAAM,GAAG,OAAO,CAAC,aAAa,CAAC,QAAQ,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAE1E,IAAI,CAAC,WAAW;gBAAE,SAAS;YAE3B,MAAM,UAAU,GAAG,wCAAwC,EAAE,EAAE,CAAC;YAChE,MAAM,SAAS,GAAG,WAAW,EAAE,YAAY,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;YAC3D,MAAM,IAAI,GAAG,SAAS,C
AAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAE3C,WAAW,CAAC,IAAI,CAAC;gBAChB,MAAM;gBACN,IAAI;gBACJ,OAAO,EAAE,IAAA,mBAAa,EAAC,WAAW,CAAC;gBACnC,KAAK;gBACL,KAAK,EAAE,MAAM,IAAI,SAAS;gBAC1B,GAAG,EAAE,UAAU;aACf,CAAC,CAAC;QACJ,CAAC;QAED,OAAO,IAAA,2BAAgB,EAAC,WAAW,CAAC,CAAC;IACtC,CAAC;IAEO,SAAS;QAChB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC;QACzC,OAAO,KAAK,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACzB,CAAC;IAEO,YAAY;QACnB,IAAI,IAAI,CAAC,aAAa,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YAC5C,MAAM,MAAM,GAAG,IAAI,CAAC,WAAW,CAAC,aAAa,CAAC,SAAS,CAAC,EAAE,WAAW,IAAI,WAAW,CAAC;YACrF,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,CAAC,aAAa,CAAC,WAAW,CAAC,EAAE,WAAW,IAAI,EAAE,CAAC;YACnF,8CAA8C;YAC9C,MAAM,OAAO,GAAG,WAAW,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,WAAW,CAAC,MAAM,GAAG,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;YACzF,OAAO,cAAc,MAAM,KAAK,OAAO,EAAE,CAAC;QAC3C,CAAC;QACD,OAAO,IAAI,CAAC,QAAQ,EAAE,aAAa,CAAC,YAAY,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAC9E,CAAC;IAEO,aAAa;QACpB,OAAO,IAAI,CAAC,QAAQ,EAAE,aAAa,CAAC,SAAS,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAC3E,CAAC;IAEO,iBAAiB;QACxB,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC;QAClC,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;QACpC,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;YACxB,OAAO,cAAc,MAAM,iBAAiB,CAAC;QAC9C,CAAC;QACD,OAAO,GAAG,KAAK,SAAS,MAAM,iBAAiB,CAAC;IACjD,CAAC;IAEO,WAAW;QAClB,IAAI,CAAC,IAAI,CAAC,QAAQ;YAAE,OAAO,EAAE,CAAC;QAE9B,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;QACxD,MAAM,SAAS,GAAG,WAAW,EAAE,YAAY,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;QAC3D,OAAO,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACtC,CAAC;CACD;AA9KD,kDA8KC"}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import { BaseExtractor } from './_base';
|
|
2
|
+
import { ExtractorResult } from '../types/extractors';
|
|
3
|
+
/**
 * Type declaration for the Reddit extractor.
 *
 * NOTE(review): only the declared shape is visible here; the behavior of each
 * member should be confirmed against the compiled implementation.
 */
export declare class RedditExtractor extends BaseExtractor {
    constructor(document: Document, url: string);

    // ---- Public API -------------------------------------------------------

    canExtract(): boolean;
    canExtractAsync(): boolean;
    extract(): ExtractorResult;
    /** Promise-returning counterpart of `extract()` (presumably; verify against implementation). */
    extractAsync(): Promise<ExtractorResult>;

    // ---- Internal state ---------------------------------------------------

    private shredditPost;
    private isOldReddit;

    // ---- Internal helpers -------------------------------------------------

    private isCommentsPage;
    private extractOldReddit;
    private getPostContent;
    private createContentHtml;
    private extractComments;
    private getPostId;
    private getSubreddit;
    private getPostAuthor;
    private createDescription;
    private collectOldRedditComments;
    private processComments;
}
|