@mz1999/defuddle 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +371 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +145 -0
- package/dist/cli.js.map +1 -0
- package/dist/constants.d.ts +24 -0
- package/dist/constants.js +950 -0
- package/dist/constants.js.map +1 -0
- package/dist/defuddle.d.ts +136 -0
- package/dist/defuddle.js +1816 -0
- package/dist/defuddle.js.map +1 -0
- package/dist/elements/callouts.d.ts +6 -0
- package/dist/elements/callouts.js +74 -0
- package/dist/elements/callouts.js.map +1 -0
- package/dist/elements/code.d.ts +5 -0
- package/dist/elements/code.js +346 -0
- package/dist/elements/code.js.map +1 -0
- package/dist/elements/footnotes.d.ts +5 -0
- package/dist/elements/footnotes.js +619 -0
- package/dist/elements/footnotes.js.map +1 -0
- package/dist/elements/headings.d.ts +11 -0
- package/dist/elements/headings.js +100 -0
- package/dist/elements/headings.js.map +1 -0
- package/dist/elements/images.d.ts +8 -0
- package/dist/elements/images.js +877 -0
- package/dist/elements/images.js.map +1 -0
- package/dist/elements/math.base.d.ts +9 -0
- package/dist/elements/math.base.js +195 -0
- package/dist/elements/math.base.js.map +1 -0
- package/dist/elements/math.core.d.ts +7 -0
- package/dist/elements/math.core.js +52 -0
- package/dist/elements/math.core.js.map +1 -0
- package/dist/elements/math.d.ts +2 -0
- package/dist/elements/math.full.d.ts +8 -0
- package/dist/elements/math.js +7 -0
- package/dist/elements/math.js.map +1 -0
- package/dist/extractor-registry.d.ts +16 -0
- package/dist/extractor-registry.js +140 -0
- package/dist/extractor-registry.js.map +1 -0
- package/dist/extractors/_base.d.ts +22 -0
- package/dist/extractors/_base.js +27 -0
- package/dist/extractors/_base.js.map +1 -0
- package/dist/extractors/_conversation.d.ts +9 -0
- package/dist/extractors/_conversation.js +78 -0
- package/dist/extractors/_conversation.js.map +1 -0
- package/dist/extractors/chatgpt.d.ts +14 -0
- package/dist/extractors/chatgpt.js +138 -0
- package/dist/extractors/chatgpt.js.map +1 -0
- package/dist/extractors/claude.d.ts +10 -0
- package/dist/extractors/claude.js +91 -0
- package/dist/extractors/claude.js.map +1 -0
- package/dist/extractors/gemini.d.ts +14 -0
- package/dist/extractors/gemini.js +111 -0
- package/dist/extractors/gemini.js.map +1 -0
- package/dist/extractors/github.d.ts +20 -0
- package/dist/extractors/github.js +251 -0
- package/dist/extractors/github.js.map +1 -0
- package/dist/extractors/grok.d.ts +15 -0
- package/dist/extractors/grok.js +142 -0
- package/dist/extractors/grok.js.map +1 -0
- package/dist/extractors/hackernews.d.ts +21 -0
- package/dist/extractors/hackernews.js +155 -0
- package/dist/extractors/hackernews.js.map +1 -0
- package/dist/extractors/reddit.d.ts +22 -0
- package/dist/extractors/reddit.js +197 -0
- package/dist/extractors/reddit.js.map +1 -0
- package/dist/extractors/twitter.d.ts +16 -0
- package/dist/extractors/twitter.js +204 -0
- package/dist/extractors/twitter.js.map +1 -0
- package/dist/extractors/x-article.d.ts +24 -0
- package/dist/extractors/x-article.js +267 -0
- package/dist/extractors/x-article.js.map +1 -0
- package/dist/extractors/x-oembed.d.ts +20 -0
- package/dist/extractors/x-oembed.js +350 -0
- package/dist/extractors/x-oembed.js.map +1 -0
- package/dist/extractors/youtube.d.ts +87 -0
- package/dist/extractors/youtube.js +869 -0
- package/dist/extractors/youtube.js.map +1 -0
- package/dist/fetch.d.ts +18 -0
- package/dist/fetch.js +265 -0
- package/dist/fetch.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.full.d.ts +12 -0
- package/dist/index.full.js +1 -0
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -0
- package/dist/markdown.d.ts +30 -0
- package/dist/markdown.js +661 -0
- package/dist/markdown.js.map +1 -0
- package/dist/metadata.d.ts +25 -0
- package/dist/metadata.js +426 -0
- package/dist/metadata.js.map +1 -0
- package/dist/node.d.ts +19 -0
- package/dist/node.js +78 -0
- package/dist/node.js.map +1 -0
- package/dist/scoring.d.ts +31 -0
- package/dist/scoring.js +472 -0
- package/dist/scoring.js.map +1 -0
- package/dist/standardize.d.ts +2 -0
- package/dist/standardize.js +1101 -0
- package/dist/standardize.js.map +1 -0
- package/dist/types/extractors.d.ts +41 -0
- package/dist/types/extractors.js +3 -0
- package/dist/types/extractors.js.map +1 -0
- package/dist/types.d.ts +135 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/comments.d.ts +44 -0
- package/dist/utils/comments.js +103 -0
- package/dist/utils/comments.js.map +1 -0
- package/dist/utils/dom.d.ts +42 -0
- package/dist/utils/dom.js +104 -0
- package/dist/utils/dom.js.map +1 -0
- package/dist/utils/linkedom-compat.d.ts +5 -0
- package/dist/utils/linkedom-compat.js +23 -0
- package/dist/utils/linkedom-compat.js.map +1 -0
- package/dist/utils/transcript.d.ts +37 -0
- package/dist/utils/transcript.js +61 -0
- package/dist/utils/transcript.js.map +1 -0
- package/dist/utils.d.ts +13 -0
- package/dist/utils.js +98 -0
- package/dist/utils.js.map +1 -0
- package/package.json +107 -0
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.ConversationExtractor = void 0;
|
|
4
|
+
const _base_1 = require("./_base");
|
|
5
|
+
const defuddle_1 = require("../defuddle");
|
|
6
|
+
const dom_1 = require("../utils/dom");
|
|
7
|
+
/**
 * Makes a value safe to embed inside a double-quoted HTML attribute.
 *
 * Only the double quote is rewritten. Ampersands are deliberately left alone
 * because some callers appear to pass values that were lifted out of
 * already-serialized markup (where `&` is already `&amp;`), and escaping them
 * again would double-encode. For values without a `"` the output is identical
 * to plain string interpolation.
 *
 * @param {*} value Value to embed; coerced with String() like a template literal would.
 * @returns {string} The value with every `"` replaced by `&quot;`.
 */
function escapeAttributeValue(value) {
    return String(value).replace(/"/g, '&quot;');
}

/**
 * Shared base for chat-style extractors.
 *
 * Subclasses supply `extractMessages()` and `getMetadata()` (and optionally
 * override `getFootnotes()`); this class renders the messages to HTML, runs
 * that HTML through Defuddle in a scratch document and returns the result.
 */
class ConversationExtractor extends _base_1.BaseExtractor {
    /**
     * Footnotes to append after the messages. The base implementation has none.
     *
     * @returns {Array} An empty list.
     */
    getFootnotes() {
        return [];
    }

    /**
     * Builds the extraction result for the conversation.
     *
     * @returns {object} `content` / `contentHtml` (the Defuddle-cleaned HTML),
     *   `extractedContent.messageCount` and the template `variables`.
     */
    extract() {
        // Order matters: subclasses may collect footnotes as a side effect of
        // extractMessages(), so footnotes are read only after messages.
        const messages = this.extractMessages();
        const metadata = this.getMetadata();
        const footnotes = this.getFootnotes();
        const rawContentHtml = this.createContentHtml(messages, footnotes);

        // Create a temporary document to run Defuddle on our content
        const tempDoc = this.document.implementation.createHTMLDocument();
        const container = tempDoc.createElement('article');
        container.appendChild((0, dom_1.parseHTML)(tempDoc, rawContentHtml));
        tempDoc.body.appendChild(container);

        // Run Defuddle on our formatted content
        const defuddled = new defuddle_1.Defuddle(tempDoc).parse();
        const contentHtml = defuddled.content;

        return {
            content: contentHtml,
            contentHtml: contentHtml,
            extractedContent: {
                messageCount: messages.length.toString(),
            },
            variables: {
                title: metadata.title || 'Conversation',
                site: metadata.site,
                description: metadata.description || `${metadata.site} conversation with ${messages.length} messages`,
                wordCount: defuddled.wordCount?.toString() || '',
            }
        };
    }

    /**
     * Renders messages (and optional footnotes) to an HTML string.
     *
     * Message `content`, `timestamp`, `author` text and footnote `text` are
     * inserted as HTML, unchanged. Values that land inside a double-quoted
     * attribute (metadata `data-*` values, the author-derived class name and
     * the footnote `href`) have their double quotes escaped so a stray `"`
     * cannot terminate the attribute and inject markup.
     *
     * @param {Array} messages Messages with `author`, `content` and optional
     *   `timestamp` / `metadata`.
     * @param {Array} footnotes Footnotes with `url` and `text`.
     * @returns {string} Trimmed HTML: one `div.message` per message separated
     *   by `<hr>`, followed by a `#footnotes` list when footnotes exist.
     */
    createContentHtml(messages, footnotes) {
        const messagesHtml = messages.map((message, index) => {
            const timestampHtml = message.timestamp ?
                `<div class="message-timestamp">${message.timestamp}</div>` : '';

            // Check if content already has paragraph tags
            const hasParagraphs = /<p[^>]*>[\s\S]*?<\/p>/i.test(message.content);
            const contentHtml = hasParagraphs ? message.content : `<p>${message.content}</p>`;

            // Add metadata to data attributes
            const dataAttributes = message.metadata ?
                Object.entries(message.metadata)
                    .map(([key, value]) => `data-${key}="${escapeAttributeValue(value)}"`)
                    .join(' ') : '';

            const authorClass = escapeAttributeValue(message.author.toLowerCase());

            return `
            <div class="message message-${authorClass}" ${dataAttributes}>
                <div class="message-header">
                    <p class="message-author"><strong>${message.author}</strong></p>
                    ${timestampHtml}
                </div>
                <div class="message-content">
                    ${contentHtml}
                </div>
            </div>${index < messages.length - 1 ? '\n<hr>' : ''}`;
        }).join('\n').trim();

        // Add footnotes section if we have any
        const footnotesHtml = footnotes.length > 0 ? `
            <div id="footnotes">
                <ol>
                    ${footnotes.map((footnote, index) => `
                        <li class="footnote" id="fn:${index + 1}">
                            <p>
                                <a href="${escapeAttributeValue(footnote.url)}" target="_blank">${footnote.text}</a> <a href="#fnref:${index + 1}" class="footnote-backref">↩</a>
                            </p>
                        </li>
                    `).join('')}
                </ol>
            </div>` : '';

        return `${messagesHtml}\n${footnotesHtml}`.trim();
    }
}
|
|
77
|
+
exports.ConversationExtractor = ConversationExtractor;
|
|
78
|
+
//# sourceMappingURL=_conversation.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"_conversation.js","sourceRoot":"","sources":["../../src/extractors/_conversation.ts"],"names":[],"mappings":";;;AAAA,mCAAwC;AAExC,0CAAuC;AACvC,sCAAyC;AAEzC,MAAsB,qBAAsB,SAAQ,qBAAa;IAGtD,YAAY;QACrB,OAAO,EAAE,CAAC;IACX,CAAC;IAED,OAAO;QACN,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,EAAE,CAAC;QACxC,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC;QACpC,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC;QACtC,MAAM,cAAc,GAAG,IAAI,CAAC,iBAAiB,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;QAEnE,6DAA6D;QAC7D,MAAM,OAAO,GAAG,IAAI,CAAC,QAAQ,CAAC,cAAc,CAAC,kBAAkB,EAAE,CAAC;QAClE,MAAM,SAAS,GAAG,OAAO,CAAC,aAAa,CAAC,SAAS,CAAC,CAAC;QACnD,SAAS,CAAC,WAAW,CAAC,IAAA,eAAS,EAAC,OAAO,EAAE,cAAc,CAAC,CAAC,CAAC;QAC1D,OAAO,CAAC,IAAI,CAAC,WAAW,CAAC,SAAS,CAAC,CAAC;QAEpC,wCAAwC;QACxC,MAAM,SAAS,GAAG,IAAI,mBAAQ,CAAC,OAAO,CAAC,CAAC,KAAK,EAAE,CAAC;QAChD,MAAM,WAAW,GAAG,SAAS,CAAC,OAAO,CAAC;QAEtC,OAAO;YACN,OAAO,EAAE,WAAW;YACpB,WAAW,EAAE,WAAW;YACxB,gBAAgB,EAAE;gBACjB,YAAY,EAAE,QAAQ,CAAC,MAAM,CAAC,QAAQ,EAAE;aACxC;YACD,SAAS,EAAE;gBACV,KAAK,EAAE,QAAQ,CAAC,KAAK,IAAI,cAAc;gBACvC,IAAI,EAAE,QAAQ,CAAC,IAAI;gBACnB,WAAW,EAAE,QAAQ,CAAC,WAAW,IAAI,GAAG,QAAQ,CAAC,IAAI,sBAAsB,QAAQ,CAAC,MAAM,WAAW;gBACrG,SAAS,EAAE,SAAS,CAAC,SAAS,EAAE,QAAQ,EAAE,IAAI,EAAE;aAChD;SACD,CAAC;IACH,CAAC;IAES,iBAAiB,CAAC,QAA+B,EAAE,SAAqB;QACjF,MAAM,YAAY,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,KAAK,EAAE,EAAE;YACpD,MAAM,aAAa,GAAG,OAAO,CAAC,SAAS,CAAC,CAAC;gBACxC,kCAAkC,OAAO,CAAC,SAAS,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC;YAElE,8CAA8C;YAC9C,MAAM,aAAa,GAAG,wBAAwB,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;YACrE,MAAM,WAAW,GAAG,aAAa,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,OAAO,CAAC,OAAO,MAAM,CAAC;YAElF,kCAAkC;YAClC,MAAM,cAAc,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;gBACxC,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,QAAQ,CAAC;qBAC9B,GAAG,CAAC,CAAC,CAAC,GAAG,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,QAAQ,GAAG,KAAK,KAAK,GAAG,CAAC;qBAC/C,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAElB,OAAO;iCACuB,OAAO,CAAC,MAAM,CAAC,WAAW,EAAE,KAAK,cAAc;;yCAEvC,OAAO,CAAC,MAAM;OAChD,aAAa;;;OAGb,WAAW;;WAEP,KAAK,GAAG,QAAQ,CAAC,MAAM,GAAG,CA
AC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;QACvD,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;QAErB,uCAAuC;QACvC,MAAM,aAAa,GAAG,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC;;;OAGxC,SAAS,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,KAAK,EAAE,EAAE,CAAC;oCACN,KAAK,GAAG,CAAC;;mBAE1B,QAAQ,CAAC,GAAG,qBAAqB,QAAQ,CAAC,IAAI,6BAA6B,KAAK,GAAG,CAAC;;;MAGjG,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC;;UAEN,CAAC,CAAC,CAAC,EAAE,CAAC;QAEd,OAAO,GAAG,YAAY,KAAK,aAAa,EAAE,CAAC,IAAI,EAAE,CAAC;IACnD,CAAC;CACD;AAjFD,sDAiFC"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import { ConversationExtractor } from './_conversation';
|
|
2
|
+
import { ConversationMessage, ConversationMetadata, Footnote } from '../types/extractors';
|
|
3
|
+
/**
 * Conversation extractor for ChatGPT pages.
 *
 * Works on the `article[data-testid^="conversation-turn-"]` elements found in
 * the document and turns inline citation links into numbered footnotes.
 */
export declare class ChatGPTExtractor extends ConversationExtractor {
    /** Conversation-turn elements selected from the document in the constructor. */
    private articles;
    /** Citation footnotes collected while messages are extracted. */
    private footnotes;
    /** Number of distinct footnotes registered so far. */
    private footnoteCounter;
    /** Memoized result of `extractMessages()`; `null` until the first successful run. */
    private cachedMessages;
    /**
     * @param document Document to extract from.
     * @param url URL of the page; reported back in the metadata.
     */
    constructor(document: Document, url: string);
    /** @returns `true` when at least one conversation turn was found. */
    canExtract(): boolean;
    /** Builds one message per conversation turn, replacing citations with footnote references. */
    protected extractMessages(): ConversationMessage[];
    /** @returns The footnotes gathered by the most recent `extractMessages()` run. */
    protected getFootnotes(): Footnote[];
    /** @returns Title, site name (`ChatGPT`), URL, message count and a generated description. */
    protected getMetadata(): ConversationMetadata;
    /** Resolves the title: page title, else the start of the first turn's text, else a fixed default. */
    private getTitle;
}
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.ChatGPTExtractor = void 0;
|
|
4
|
+
const _conversation_1 = require("./_conversation");
|
|
5
|
+
const dom_1 = require("../utils/dom");
|
|
6
|
+
/**
 * Escapes angle brackets so decoded URL text cannot open an HTML tag.
 *
 * Ampersands are left untouched on purpose: the citation URL is captured from
 * serialized markup, where `&` is already written as `&amp;`, so escaping it
 * again would double-encode.
 *
 * @param {string} text Text to embed in HTML.
 * @returns {string} The text with `<` and `>` replaced by entities.
 */
function escapeAngleBrackets(text) {
    return text.replace(/</g, '&lt;').replace(/>/g, '&gt;');
}

/**
 * Derives the display pieces of a footnote from a citation URL.
 *
 * Host parsing and text-fragment decoding fail independently: a malformed
 * percent-escape in the `#:~:text=` fragment only drops the fragment label and
 * no longer discards a host name that parsed correctly.
 *
 * @param {string} url Citation URL as captured from the serialized message HTML.
 * @returns {{domain: string, fragmentText: string}} `domain` is the host name
 *   without a leading `www.` (or the whole URL when it cannot be parsed);
 *   `fragmentText` is either empty or ` — <label>`, HTML-safe.
 */
function describeCitation(url) {
    let domain;
    try {
        // Extract domain without www.
        domain = new URL(url).hostname.replace(/^www\./, '');
    }
    catch (e) {
        console.error(`Failed to parse URL: ${url}`, e);
        return { domain: url, fragmentText: '' };
    }

    // Extract and decode the fragment text if it exists
    const hashParts = url.split('#:~:text=');
    if (hashParts.length <= 1) {
        return { domain, fragmentText: '' };
    }

    let decoded;
    try {
        decoded = decodeURIComponent(hashParts[1]);
    }
    catch (e) {
        // decodeURIComponent throws URIError on malformed escapes; keep the domain.
        console.error(`Failed to decode text fragment: ${url}`, e);
        return { domain, fragmentText: '' };
    }
    // Handles commas that were percent-encoded twice.
    decoded = decoded.replace(/%2C/g, ',');

    const parts = decoded.split(',');
    const lead = parts[0].trim();
    if (!lead) {
        return { domain, fragmentText: '' };
    }
    // With several comma-separated parts only the first is shown, followed by an ellipsis.
    const label = parts.length > 1 ? `${lead}...` : decoded.trim();
    return { domain, fragmentText: ` — ${escapeAngleBrackets(label)}` };
}

/**
 * Conversation extractor for ChatGPT pages.
 *
 * Reads the `article[data-testid^="conversation-turn-"]` elements, produces one
 * message per turn and converts inline citation links into numbered footnotes.
 */
class ChatGPTExtractor extends _conversation_1.ConversationExtractor {
    /**
     * @param {Document} document Document to extract from.
     * @param {string} url URL of the page.
     */
    constructor(document, url) {
        super(document, url);
        this.cachedMessages = null;
        this.articles = document.querySelectorAll('article[data-testid^="conversation-turn-"]');
        this.footnotes = [];
        this.footnoteCounter = 0;
    }

    /** @returns {boolean} True when at least one conversation turn was found. */
    canExtract() {
        return !!this.articles && this.articles.length > 0;
    }

    /**
     * Extracts all messages, collecting citation footnotes as a side effect.
     * The result is cached, so footnote numbering stays stable across calls.
     *
     * @returns {Array} Messages with `author`, `content` (HTML) and `metadata.role`.
     */
    extractMessages() {
        if (this.cachedMessages)
            return this.cachedMessages;

        const messages = [];
        this.footnotes = [];
        this.footnoteCounter = 0;
        if (!this.articles)
            return messages;

        // Look for spans containing citation links (a[target=_blank][rel=noopener]),
        // replacing the entire structure. Group 3 captures the href. The optional
        // leading group swallows a preceding zero-width space; those are stripped
        // from the content before matching, so it normally matches nothing.
        const citationPattern = /(\u200B)?(<span[^>]*?>\s*<a(?=[^>]*?href="([^"]+)")(?=[^>]*?target="_blank")(?=[^>]*?rel="noopener")[^>]*?>[\s\S]*?<\/a>\s*<\/span>)/gi;

        this.articles.forEach((article) => {
            // Get the localized author text from the sr-only heading and clean it
            const authorElement = article.querySelector('h5.sr-only, h6.sr-only');
            const authorText = authorElement?.textContent
                ?.trim()
                ?.replace(/:\s*$/, '') // Remove colon and any trailing whitespace
                || '';

            const currentAuthorRole = article.getAttribute('data-message-author-role') || '';

            let messageContent = (0, dom_1.serializeHTML)(article);
            messageContent = messageContent.replace(/\u200B/g, '');

            // Remove specific elements from the message content
            const tempDiv = this.document.createElement('div');
            tempDiv.appendChild((0, dom_1.parseHTML)(this.document, messageContent));
            tempDiv.querySelectorAll('h5.sr-only, h6.sr-only, span[data-state="closed"]').forEach(el => el.remove());
            messageContent = (0, dom_1.serializeHTML)(tempDiv);

            // Process inline references: each citation span becomes a footnote reference
            messageContent = messageContent.replace(citationPattern, (_match, _zws, _spanStructure, url) => {
                // Check if this URL already exists in our footnotes
                const footnoteIndex = this.footnotes.findIndex(fn => fn.url === url);
                let footnoteNumber;
                if (footnoteIndex === -1) {
                    const { domain, fragmentText } = describeCitation(url);
                    this.footnoteCounter++;
                    footnoteNumber = this.footnoteCounter;
                    this.footnotes.push({
                        url,
                        text: `<a href="${url}">${domain}</a>${fragmentText}`
                    });
                }
                else {
                    // Repeated citation: reuse the number of the existing footnote.
                    footnoteNumber = footnoteIndex + 1;
                }
                // Return just the footnote reference, replacing the ZWS (if captured) and the entire span structure
                return `<sup id="fnref:${footnoteNumber}"><a href="#fn:${footnoteNumber}">${footnoteNumber}</a></sup>`;
            });

            // Clean up any stray empty paragraph tags
            messageContent = messageContent
                .replace(/<p[^>]*>\s*<\/p>/g, '');

            messages.push({
                author: authorText,
                content: messageContent.trim(),
                metadata: {
                    role: currentAuthorRole || 'unknown'
                }
            });
        });

        this.cachedMessages = messages;
        return messages;
    }

    /** @returns {Array} Footnotes collected by the last `extractMessages()` run. */
    getFootnotes() {
        return this.footnotes;
    }

    /** @returns {object} Title, site, URL, message count and a generated description. */
    getMetadata() {
        const title = this.getTitle();
        const messages = this.extractMessages();
        return {
            title,
            site: 'ChatGPT',
            url: this.url,
            messageCount: messages.length,
            description: `ChatGPT conversation with ${messages.length} messages`
        };
    }

    /**
     * Picks a title: the page title unless it is just "ChatGPT", else the first
     * 50 characters of the first turn's `.text-message`, else a fixed default.
     *
     * @returns {string} The resolved title.
     */
    getTitle() {
        // Try to get the page title first
        const pageTitle = this.document.title?.trim();
        if (pageTitle && pageTitle !== 'ChatGPT') {
            return pageTitle;
        }

        // Fall back to first user message
        const firstUserTurn = this.articles?.item(0)?.querySelector('.text-message');
        if (firstUserTurn) {
            const text = firstUserTurn.textContent || '';
            // Truncate to first 50 characters if longer
            return text.length > 50 ? text.slice(0, 50) + '...' : text;
        }

        return 'ChatGPT Conversation';
    }
}
|
|
137
|
+
exports.ChatGPTExtractor = ChatGPTExtractor;
|
|
138
|
+
//# sourceMappingURL=chatgpt.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chatgpt.js","sourceRoot":"","sources":["../../src/extractors/chatgpt.ts"],"names":[],"mappings":";;;AAAA,mDAAwD;AAExD,sCAAwD;AAExD,MAAa,gBAAiB,SAAQ,qCAAqB;IAM1D,YAAY,QAAkB,EAAE,GAAW;QAC1C,KAAK,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;QAHd,mBAAc,GAAiC,IAAI,CAAC;QAI3D,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC,gBAAgB,CAAC,4CAA4C,CAAC,CAAC;QACxF,IAAI,CAAC,SAAS,GAAG,EAAE,CAAC;QACpB,IAAI,CAAC,eAAe,GAAG,CAAC,CAAC;IAC1B,CAAC;IAED,UAAU;QACT,OAAO,CAAC,CAAC,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC;IACpD,CAAC;IAES,eAAe;QACxB,IAAI,IAAI,CAAC,cAAc;YAAE,OAAO,IAAI,CAAC,cAAc,CAAC;QAEpD,MAAM,QAAQ,GAA0B,EAAE,CAAC;QAC3C,IAAI,CAAC,SAAS,GAAG,EAAE,CAAC;QACpB,IAAI,CAAC,eAAe,GAAG,CAAC,CAAC;QAEzB,IAAI,CAAC,IAAI,CAAC,QAAQ;YAAE,OAAO,QAAQ,CAAC;QAEpC,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;YACjC,sEAAsE;YACtE,MAAM,aAAa,GAAG,OAAO,CAAC,aAAa,CAAC,wBAAwB,CAAC,CAAC;YACtE,MAAM,UAAU,GAAG,aAAa,EAAE,WAAW;gBAC5C,EAAE,IAAI,EAAE;gBACR,EAAE,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC,2CAA2C;mBAC/D,EAAE,CAAC;YAEP,IAAI,iBAAiB,GAAG,EAAE,CAAC;YAE3B,MAAM,UAAU,GAAG,OAAO,CAAC,YAAY,CAAC,0BAA0B,CAAC,CAAC;YACpE,IAAI,UAAU,EAAE,CAAC;gBAChB,iBAAiB,GAAG,UAAU,CAAC;YAChC,CAAC;YAED,IAAI,cAAc,GAAG,IAAA,mBAAa,EAAC,OAAO,CAAC,CAAC;YAC5C,cAAc,GAAG,cAAc,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;YAEvD,oDAAoD;YACpD,MAAM,OAAO,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;YACnD,OAAO,CAAC,WAAW,CAAC,IAAA,eAAS,EAAC,IAAI,CAAC,QAAQ,EAAE,cAAc,CAAC,CAAC,CAAC;YAC9D,OAAO,CAAC,gBAAgB,CAAC,mDAAmD,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;YACzG,cAAc,GAAG,IAAA,mBAAa,EAAC,OAAO,CAAC,CAAC;YAExC,+DAA+D;YAC/D,wGAAwG;YACxG,iDAAiD;YACjD,MAAM,eAAe,GAAG,kJAAkJ,CAAC;YAE3K,cAAc,GAAG,cAAc,CAAC,OAAO,CAAC,eAAe,EAAE,CAAC,KAAK,EAAE,GAAG,EAAE,aAAa,EAAE,GAAG,EAAE,EAAE;gBAC3F,0BAA0B;gBAC1B,IAAI,MAAM,GAAG,EAAE,CAAC;gBAChB,IAAI,YAAY,GAAG,EAAE,CAAC;gBAEtB,IAAI,CAAC;oBACJ,8BAA8B;oBAC9B,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;oBAErD,oDAAoD;oBACpD,MAAM,SAAS,GAAG,GAAG,CAAC,KAAK,CAAC,
WAAW,CAAC,CAAC;oBACzC,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;wBAC1B,YAAY,GAAG,kBAAkB,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC;wBAChD,YAAY,GAAG,YAAY,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;wBAEjD,MAAM,KAAK,GAAG,YAAY,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;wBACtC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC;4BACzC,YAAY,GAAG,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,KAAK,CAAC;wBAC3C,CAAC;6BAAM,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC;4BAC5B,YAAY,GAAG,MAAM,YAAY,CAAC,IAAI,EAAE,EAAE,CAAC;wBAC5C,CAAC;6BAAM,CAAC;4BACP,YAAY,GAAG,EAAE,CAAC;wBACnB,CAAC;oBACF,CAAC;gBACF,CAAC;gBAAC,OAAO,CAAC,EAAE,CAAC;oBACZ,OAAO,CAAC,KAAK,CAAC,wBAAwB,GAAG,EAAE,EAAE,CAAC,CAAC,CAAC;oBAChD,MAAM,GAAG,GAAG,CAAC;gBACd,CAAC;gBAED,oDAAoD;gBACpD,IAAI,aAAa,GAAG,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,KAAK,GAAG,CAAC,CAAC;gBACnE,IAAI,cAAsB,CAAC;gBAE3B,IAAI,aAAa,KAAK,CAAC,CAAC,EAAE,CAAC;oBAC1B,IAAI,CAAC,eAAe,EAAE,CAAC;oBACvB,cAAc,GAAG,IAAI,CAAC,eAAe,CAAC;oBACtC,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC;wBACnB,GAAG;wBACH,IAAI,EAAE,YAAY,GAAG,KAAK,MAAM,OAAO,YAAY,EAAE;qBACrD,CAAC,CAAC;gBACJ,CAAC;qBAAM,CAAC;oBACP,cAAc,GAAG,aAAa,GAAG,CAAC,CAAC;gBACpC,CAAC;gBAED,oGAAoG;gBACpG,OAAO,kBAAkB,cAAc,kBAAkB,cAAc,KAAK,cAAc,YAAY,CAAC;YACxG,CAAC,CAAC,CAAC;YAEH,0CAA0C;YAC1C,cAAc,GAAG,cAAc;iBAC7B,OAAO,CAAC,mBAAmB,EAAE,EAAE,CAAC,CAAC;YAEnC,QAAQ,CAAC,IAAI,CAAC;gBACb,MAAM,EAAE,UAAU;gBAClB,OAAO,EAAE,cAAc,CAAC,IAAI,EAAE;gBAC9B,QAAQ,EAAE;oBACT,IAAI,EAAE,iBAAiB,IAAI,SAAS;iBACpC;aACD,CAAC,CAAC;QAEJ,CAAC,CAAC,CAAC;QAEH,IAAI,CAAC,cAAc,GAAG,QAAQ,CAAC;QAC/B,OAAO,QAAQ,CAAC;IACjB,CAAC;IAES,YAAY;QACrB,OAAO,IAAI,CAAC,SAAS,CAAC;IACvB,CAAC;IAES,WAAW;QACpB,MAAM,KAAK,GAAG,IAAI,CAAC,QAAQ,EAAE,CAAC;QAC9B,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,EAAE,CAAC;QAExC,OAAO;YACN,KAAK;YACL,IAAI,EAAE,SAAS;YACf,GAAG,EAAE,IAAI,CAAC,GAAG;YACb,YAAY,EAAE,QAAQ,CAAC,MAAM;YAC7B,WAAW,EAAE,6BAA6B,QAAQ,CAAC,MAAM,WAAW;SACpE,CAAC;IACH,CAAC;IAEO,QAAQ;QACf,kCAAkC;QAClC,MAAM,SAAS,GAAG,IAAI,CAAC,QAAQ,CAAC,KAAK,EAAE,IAAI,EAAE,CAAC;QAC9C,IAAI,SAAS,IA
AI,SAAS,KAAK,SAAS,EAAE,CAAC;YAC1C,OAAO,SAAS,CAAC;QAClB,CAAC;QAED,kCAAkC;QAClC,MAAM,aAAa,GAAG,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC,CAAC,EAAE,aAAa,CAAC,eAAe,CAAC,CAAC;QAC7E,IAAI,aAAa,EAAE,CAAC;YACnB,MAAM,IAAI,GAAG,aAAa,CAAC,WAAW,IAAI,EAAE,CAAC;YAC7C,4CAA4C;YAC5C,OAAO,IAAI,CAAC,MAAM,GAAG,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC;QAC5D,CAAC;QAED,OAAO,sBAAsB,CAAC;IAC/B,CAAC;CACD;AA3JD,4CA2JC"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import { ConversationExtractor } from './_conversation';
|
|
2
|
+
import { ConversationMessage, ConversationMetadata } from '../types/extractors';
|
|
3
|
+
/**
 * Conversation extractor for Claude pages.
 *
 * Works on the user and assistant message blocks found in the document.
 */
export declare class ClaudeExtractor extends ConversationExtractor {
    /** Message block elements selected from the document in the constructor. */
    private articles;
    /**
     * @param document Document to extract from.
     * @param url URL of the page; reported back in the metadata.
     */
    constructor(document: Document, url: string);
    /** @returns `true` when at least one message block was found. */
    canExtract(): boolean;
    /** Builds one message per recognised user or assistant block; other blocks are skipped. */
    protected extractMessages(): ConversationMessage[];
    /** @returns Title, site name (`Claude`), URL, message count and a generated description. */
    protected getMetadata(): ConversationMetadata;
    /** Resolves the title: page title, else header title, else the start of the first user message, else a fixed default. */
    private getTitle;
}
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.ClaudeExtractor = void 0;
|
|
4
|
+
const _conversation_1 = require("./_conversation");
|
|
5
|
+
const dom_1 = require("../utils/dom");
|
|
6
|
+
/**
 * Conversation extractor for Claude pages.
 *
 * Collects user message blocks (`data-testid="user-message"`) and assistant
 * response blocks (`.font-claude-response`) and turns each into a message.
 */
class ClaudeExtractor extends _conversation_1.ConversationExtractor {
    /**
     * @param {Document} document Document to extract from.
     * @param {string} url URL of the page.
     */
    constructor(document, url) {
        super(document, url);
        // Find all message blocks - both user and assistant messages
        this.articles = document.querySelectorAll('div[data-testid="user-message"], div[data-testid="assistant-message"], div.font-claude-response');
    }

    /** @returns {boolean} True when at least one message block was found. */
    canExtract() {
        return !!this.articles && this.articles.length > 0;
    }

    /**
     * Extracts the messages in document order.
     *
     * Blocks carrying a `data-testid` other than `user-message` are skipped, as
     * are blocks that are neither a user message nor a `.font-claude-response`.
     *
     * @returns {Array} Messages with `author` ("You" or "Claude"), `content`
     *   (HTML) and `metadata.role` ("you" or "assistant").
     */
    extractMessages() {
        const messages = [];
        if (!this.articles)
            return messages;

        this.articles.forEach((article) => {
            let role;
            let content;

            if (article.hasAttribute('data-testid')) {
                // Handle user messages
                if (article.getAttribute('data-testid') === 'user-message') {
                    role = 'you';
                    content = (0, dom_1.serializeHTML)(article);
                }
                // Skip non-message elements
                else {
                    return;
                }
            }
            else if (article.classList.contains('font-claude-response')) {
                // Handle Claude messages; prefer the rendered markdown body when present
                role = 'assistant';
                const assistantBody = article.querySelector('.standard-markdown') || article;
                content = (0, dom_1.serializeHTML)(assistantBody);
            }
            else {
                // Skip unknown elements
                return;
            }

            if (content) {
                // Normalize content similar to ChatGPT extractor:
                // drop zero-width spaces and empty paragraphs
                content = content.replace(/\u200B/g, '').replace(/<p[^>]*>\s*<\/p>/g, '');
                messages.push({
                    author: role === 'you' ? 'You' : 'Claude',
                    content: content.trim(),
                    metadata: {
                        role: role
                    }
                });
            }
        });

        return messages;
    }

    /** @returns {object} Title, site, URL, message count and a generated description. */
    getMetadata() {
        const title = this.getTitle();
        const messages = this.extractMessages();
        return {
            title,
            site: 'Claude',
            url: this.url,
            messageCount: messages.length,
            description: `Claude conversation with ${messages.length} messages`
        };
    }

    /**
     * Picks a title: the page title (minus a trailing " - Claude"), else the
     * header title, else the first 50 characters of the first user message,
     * else a fixed default.
     *
     * @returns {string} The resolved title.
     */
    getTitle() {
        // Try to get the page title first
        const pageTitle = this.document.title?.trim();
        if (pageTitle && pageTitle !== 'Claude') {
            // Remove ' - Claude' suffix if present
            return pageTitle.replace(/ - Claude$/, '');
        }

        // Try to get title from header
        const headerTitle = this.document.querySelector('header .font-tiempos')?.textContent?.trim();
        if (headerTitle) {
            return headerTitle;
        }

        // Fall back to first user message. The collected blocks are themselves the
        // user-message elements, and querySelector only searches descendants, so
        // the first block has to be checked directly before looking inside it.
        const firstBlock = this.articles?.item(0);
        const firstUserMessage = firstBlock?.getAttribute('data-testid') === 'user-message'
            ? firstBlock
            : firstBlock?.querySelector('[data-testid="user-message"]');
        if (firstUserMessage) {
            const text = firstUserMessage.textContent || '';
            // Truncate to first 50 characters if longer
            return text.length > 50 ? text.slice(0, 50) + '...' : text;
        }

        return 'Claude Conversation';
    }
}
|
|
90
|
+
exports.ClaudeExtractor = ClaudeExtractor;
|
|
91
|
+
//# sourceMappingURL=claude.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"claude.js","sourceRoot":"","sources":["../../src/extractors/claude.ts"],"names":[],"mappings":";;;AAAA,mDAAwD;AAExD,sCAA6C;AAE7C,MAAa,eAAgB,SAAQ,qCAAqB;IAGzD,YAAY,QAAkB,EAAE,GAAW;QAC1C,KAAK,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;QACrB,6DAA6D;QAC7D,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC,gBAAgB,CAAC,iGAAiG,CAAC,CAAC;IAC9I,CAAC;IAED,UAAU;QACT,OAAO,CAAC,CAAC,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC;IACpD,CAAC;IAES,eAAe;QACxB,MAAM,QAAQ,GAA0B,EAAE,CAAC;QAE3C,IAAI,CAAC,IAAI,CAAC,QAAQ;YAAE,OAAO,QAAQ,CAAC;QAEpC,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;YACjC,IAAI,IAAY,CAAC;YACjB,IAAI,OAAe,CAAC;YAEpB,IAAI,OAAO,CAAC,YAAY,CAAC,aAAa,CAAC,EAAE,CAAC;gBACzC,uBAAuB;gBACvB,IAAI,OAAO,CAAC,YAAY,CAAC,aAAa,CAAC,KAAK,cAAc,EAAE,CAAC;oBAC5D,IAAI,GAAG,KAAK,CAAC;oBACb,OAAO,GAAG,IAAA,mBAAa,EAAC,OAAO,CAAC,CAAC;gBAClC,CAAC;gBACD,4BAA4B;qBACvB,CAAC;oBACL,OAAO;gBACR,CAAC;YACF,CAAC;iBAAM,IAAI,OAAO,CAAC,SAAS,CAAC,QAAQ,CAAC,sBAAsB,CAAC,EAAE,CAAC;gBAC/D,yBAAyB;gBACzB,IAAI,GAAG,WAAW,CAAC;gBACnB,MAAM,aAAa,GAAI,OAAO,CAAC,aAAa,CAAC,oBAAoB,CAAiB,IAAK,OAAuB,CAAC;gBAC/G,OAAO,GAAG,IAAA,mBAAa,EAAC,aAAa,CAAC,CAAC;YACxC,CAAC;iBAAM,CAAC;gBACP,wBAAwB;gBACxB,OAAO;YACR,CAAC;YAED,IAAI,OAAO,EAAE,CAAC;gBACb,iDAAiD;gBACjD,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,mBAAmB,EAAE,EAAE,CAAC,CAAC;gBAC1E,QAAQ,CAAC,IAAI,CAAC;oBACb,MAAM,EAAE,IAAI,KAAK,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,QAAQ;oBACzC,OAAO,EAAE,OAAO,CAAC,IAAI,EAAE;oBACvB,QAAQ,EAAE;wBACT,IAAI,EAAE,IAAI;qBACV;iBACD,CAAC,CAAC;YACJ,CAAC;QACF,CAAC,CAAC,CAAC;QAEH,OAAO,QAAQ,CAAC;IACjB,CAAC;IAES,WAAW;QACpB,MAAM,KAAK,GAAG,IAAI,CAAC,QAAQ,EAAE,CAAC;QAC9B,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,EAAE,CAAC;QAExC,OAAO;YACN,KAAK;YACL,IAAI,EAAE,QAAQ;YACd,GAAG,EAAE,IAAI,CAAC,GAAG;YACb,YAAY,EAAE,QAAQ,CAAC,MAAM;YAC7B,WAAW,EAAE,4BAA4B,QAAQ,CAAC,MAAM,WAAW;SACnE,CAAC;IACH,CAAC;IAEO,QAAQ;QACf,kCAAkC;QAClC,MAAM,SAAS,GAAG,IAAI,CAAC,QAAQ,CAAC,KAAK,EAAE,IAAI,EAAE,CAAC;QAC9C,IAAI,SAAS,IAAI,SAAS,KAAK,QAAQ,EAAE,CAAC;YACzC,uCAAuC;YACvC,OAAO,SAAS,CAAC,
OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,CAAC;QAC5C,CAAC;QAED,+BAA+B;QAC/B,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,sBAAsB,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;QAC7F,IAAI,WAAW,EAAE,CAAC;YACjB,OAAO,WAAW,CAAC;QACpB,CAAC;QAED,kCAAkC;QAClC,MAAM,gBAAgB,GAAG,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC,CAAC,EAAE,aAAa,CAAC,8BAA8B,CAAC,CAAC;QAC/F,IAAI,gBAAgB,EAAE,CAAC;YACtB,MAAM,IAAI,GAAG,gBAAgB,CAAC,WAAW,IAAI,EAAE,CAAC;YAChD,4CAA4C;YAC5C,OAAO,IAAI,CAAC,MAAM,GAAG,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC;QAC5D,CAAC;QAED,OAAO,qBAAqB,CAAC;IAC9B,CAAC;CACD;AA/FD,0CA+FC"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import { ConversationExtractor } from './_conversation';
|
|
2
|
+
import { ConversationMessage, ConversationMetadata, Footnote } from '../types/extractors';
|
|
3
|
+
/**
 * Conversation extractor for Gemini pages.
 *
 * Works on the `div.conversation-container` elements found in the document.
 */
export declare class GeminiExtractor extends ConversationExtractor {
    /** Conversation container elements selected from the document in the constructor. */
    private conversationContainers;
    /** Source footnotes gathered from the page's `browse-item` elements. */
    private footnotes;
    /** Number of messages found by the last `extractMessages()` run; `null` before the first run. */
    private messageCount;
    /**
     * @param document Document to extract from.
     * @param url URL of the page.
     */
    constructor(document: Document, url: string);
    /** @returns `true` when at least one conversation container was found. */
    canExtract(): boolean;
    /** Builds the user query and model response messages for each container. */
    protected extractMessages(): ConversationMessage[];
    /** Collects footnotes from `browse-item` links; called by `extractMessages()`. */
    private extractSources;
    /** @returns The footnotes collected so far. */
    protected getFootnotes(): Footnote[];
    /** @returns Metadata describing the conversation. */
    protected getMetadata(): ConversationMetadata;
    /** Resolves the conversation title. */
    private getTitle;
}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.GeminiExtractor = void 0;
|
|
4
|
+
const _conversation_1 = require("./_conversation");
|
|
5
|
+
const dom_1 = require("../utils/dom");
|
|
6
|
+
/**
 * Conversation extractor for Gemini pages.
 *
 * Reads every `div.conversation-container` in the document, turning each
 * `user-query` / `model-response` pair into conversation messages, and
 * collects the `browse-item` links found in the document as footnotes.
 */
class GeminiExtractor extends _conversation_1.ConversationExtractor {
    /**
     * @param document - Document the conversation is read from.
     * @param url - URL echoed back in the metadata.
     */
    constructor(document, url) {
        super(document, url);
        // null means extractMessages() has not run yet; getMetadata() relies on this.
        this.messageCount = null;
        this.conversationContainers = document.querySelectorAll('div.conversation-container');
        this.footnotes = [];
    }
    /**
     * @returns true when at least one conversation container was found.
     */
    canExtract() {
        return !!this.conversationContainers && this.conversationContainers.length > 0;
    }
    /**
     * Builds the ordered message list (user query first, then the model
     * response, per container) and refreshes the footnotes and message count.
     *
     * @returns The extracted messages; empty when there are no containers.
     */
    extractMessages() {
        this.messageCount = 0;
        const messages = [];
        if (!this.conversationContainers)
            return messages;
        this.extractSources();
        this.conversationContainers.forEach((container) => {
            const queryText = container.querySelector('user-query')?.querySelector('.query-text');
            if (queryText) {
                messages.push({
                    author: 'You',
                    content: (0, dom_1.serializeHTML)(queryText).trim(),
                    metadata: { role: 'user' }
                });
            }
            const modelResponse = container.querySelector('model-response');
            if (!modelResponse)
                return;
            // Prefer the extended response markup when present, otherwise the regular markdown body.
            const contentElement = modelResponse.querySelector('#extended-response-markdown-content')
                || modelResponse.querySelector('.model-response-text .markdown');
            if (!contentElement)
                return;
            // Work on a detached copy so the class cleanup below never touches the live page.
            const tempDiv = this.document.createElement('div');
            tempDiv.appendChild((0, dom_1.parseHTML)(this.document, (0, dom_1.serializeHTML)(contentElement)));
            tempDiv.querySelectorAll('.table-content').forEach(el => {
                // `table-content` is a PARTIAL selector in defuddle (table of contents, will be removed), but a real table in Gemini (should be kept).
                el.classList.remove('table-content');
            });
            messages.push({
                author: 'Gemini',
                content: (0, dom_1.serializeHTML)(tempDiv).trim(),
                metadata: { role: 'assistant' }
            });
        });
        this.messageCount = messages.length;
        return messages;
    }
    /**
     * Rebuilds `this.footnotes` from the document's `browse-item` links.
     * Each footnote carries the link URL and a "domain: title" label.
     */
    extractSources() {
        // Start from a fresh list: extractMessages() may run more than once
        // (getMetadata() calls it when no count is cached), and appending to
        // the previous list would duplicate every source.
        this.footnotes = [];
        this.document.querySelectorAll('browse-item').forEach(item => {
            const link = item.querySelector('a');
            // Duck-type the anchor instead of `instanceof HTMLAnchorElement`:
            // that global is not defined outside a browser, where merely
            // referencing it throws a ReferenceError.
            if (!link || typeof link.href !== 'string')
                return;
            const url = link.href;
            const domain = link.querySelector('.domain')?.textContent?.trim() || '';
            const title = link.querySelector('.title')?.textContent?.trim() || '';
            if (url && (domain || title)) {
                this.footnotes.push({
                    url,
                    // Join only the parts that exist so a missing domain does not yield ": title".
                    text: [domain, title].filter(Boolean).join(': ')
                });
            }
        });
    }
    /**
     * @returns The footnotes gathered by the most recent extractMessages() run.
     */
    getFootnotes() {
        return this.footnotes;
    }
    /**
     * @returns Title, site name, URL, message count and a short description.
     */
    getMetadata() {
        const title = this.getTitle();
        // Reuse the cached count; only extract when extractMessages() has never run.
        const messageCount = this.messageCount ?? this.extractMessages().length;
        return {
            title,
            site: 'Gemini',
            url: this.url,
            messageCount,
            description: `Gemini conversation with ${messageCount} messages`
        };
    }
    /**
     * Picks a title, in order of preference: the page title (unless it
     * mentions "Gemini"), the `.title-text` element, the first user query
     * truncated to 50 characters, and finally a fixed default.
     *
     * @returns A non-empty title string.
     */
    getTitle() {
        const pageTitle = this.document.title?.trim();
        // A title containing "Gemini" (which includes the bare "Gemini") is the generic app title.
        if (pageTitle && !pageTitle.includes('Gemini')) {
            return pageTitle;
        }
        const researchTitle = this.document.querySelector('.title-text')?.textContent?.trim();
        if (researchTitle) {
            return researchTitle;
        }
        const firstUserQuery = this.conversationContainers?.item(0)?.querySelector('.query-text');
        // Trim so surrounding markup whitespace neither leaks into the title nor
        // counts toward the 50-character limit; blank text falls through to the default.
        const text = firstUserQuery?.textContent?.trim();
        if (text) {
            return text.length > 50 ? text.slice(0, 50) + '...' : text;
        }
        return 'Gemini Conversation';
    }
}
|
|
110
|
+
exports.GeminiExtractor = GeminiExtractor;
|
|
111
|
+
//# sourceMappingURL=gemini.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"gemini.js","sourceRoot":"","sources":["../../src/extractors/gemini.ts"],"names":[],"mappings":";;;AAAA,mDAAwD;AAExD,sCAAwD;AAExD,MAAa,eAAgB,SAAQ,qCAAqB;IAKzD,YAAY,QAAkB,EAAE,GAAW;QAC1C,KAAK,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;QAHd,iBAAY,GAAkB,IAAI,CAAC;QAI1C,IAAI,CAAC,sBAAsB,GAAG,QAAQ,CAAC,gBAAgB,CAAC,4BAA4B,CAAC,CAAC;QACtF,IAAI,CAAC,SAAS,GAAG,EAAE,CAAC;IACrB,CAAC;IAED,UAAU;QACT,OAAO,CAAC,CAAC,IAAI,CAAC,sBAAsB,IAAI,IAAI,CAAC,sBAAsB,CAAC,MAAM,GAAG,CAAC,CAAC;IAChF,CAAC;IAES,eAAe;QACxB,IAAI,CAAC,YAAY,GAAG,CAAC,CAAC;QACtB,MAAM,QAAQ,GAA0B,EAAE,CAAC;QAE3C,IAAI,CAAC,IAAI,CAAC,sBAAsB;YAAE,OAAO,QAAQ,CAAC;QAElD,IAAI,CAAC,cAAc,EAAE,CAAC;QAEtB,IAAI,CAAC,sBAAsB,CAAC,OAAO,CAAC,CAAC,SAAS,EAAE,EAAE;YACjD,MAAM,SAAS,GAAG,SAAS,CAAC,aAAa,CAAC,YAAY,CAAC,CAAC;YACxD,IAAI,SAAS,EAAE,CAAC;gBACf,MAAM,SAAS,GAAG,SAAS,CAAC,aAAa,CAAC,aAAa,CAAC,CAAC;gBACzD,IAAI,SAAS,EAAE,CAAC;oBACf,MAAM,OAAO,GAAG,IAAA,mBAAa,EAAC,SAAS,CAAC,CAAC;oBACzC,QAAQ,CAAC,IAAI,CAAC;wBACb,MAAM,EAAE,KAAK;wBACb,OAAO,EAAE,OAAO,CAAC,IAAI,EAAE;wBACvB,QAAQ,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE;qBAC1B,CAAC,CAAC;gBACJ,CAAC;YACF,CAAC;YAED,MAAM,aAAa,GAAG,SAAS,CAAC,aAAa,CAAC,gBAAgB,CAAC,CAAC;YAChE,IAAI,aAAa,EAAE,CAAC;gBACnB,MAAM,cAAc,GAAG,aAAa,CAAC,aAAa,CAAC,gCAAgC,CAAC,CAAC;gBACrF,MAAM,eAAe,GAAG,aAAa,CAAC,aAAa,CAAC,qCAAqC,CAAC,CAAC;gBAC3F,MAAM,cAAc,GAAG,eAAe,IAAI,cAAc,CAAC;gBAEzD,IAAI,cAAc,EAAE,CAAC;oBACpB,IAAI,OAAO,GAAG,IAAA,mBAAa,EAAC,cAAc,CAAC,CAAC;oBAE5C,MAAM,OAAO,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;oBACnD,OAAO,CAAC,WAAW,CAAC,IAAA,eAAS,EAAC,IAAI,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC;oBAEvD,OAAO,CAAC,gBAAgB,CAAC,gBAAgB,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE;wBACvD,uIAAuI;wBACvI,EAAE,CAAC,SAAS,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC;oBACtC,CAAC,CAAC,CAAC;oBAEH,OAAO,GAAG,IAAA,mBAAa,EAAC,OAAO,CAAC,CAAC;oBAEjC,QAAQ,CAAC,IAAI,CAAC;wBACb,MAAM,EAAE,QAAQ;wBAChB,OAAO,EAAE,OAAO,CAAC,IAAI,EAAE;wBACvB,QAAQ,EAAE,EAAE,IAAI,EAAE,WAAW,EAAE;qBAC/B,CAAC,CAAC;gBACJ,CAAC;YACF,CAAC;QACF,CAAC,CAAC,CAAC;QACH,IAAI,CAAC,YAAY,GAAG,QAAQ,CAAC,MAAM,CAAC;QACp
C,OAAO,QAAQ,CAAC;IACjB,CAAC;IAEO,cAAc;QACrB,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,aAAa,CAAC,CAAC;QAElE,IAAI,WAAW,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC3C,WAAW,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE;gBAC1B,MAAM,IAAI,GAAG,IAAI,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC;gBACrC,IAAI,IAAI,YAAY,iBAAiB,EAAE,CAAC;oBACvC,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC;oBACtB,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,SAAS,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;oBACxE,MAAM,KAAK,GAAG,IAAI,CAAC,aAAa,CAAC,QAAQ,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;oBAEtE,IAAI,GAAG,IAAI,CAAC,MAAM,IAAI,KAAK,CAAC,EAAE,CAAC;wBAC9B,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC;4BACnB,GAAG;4BACH,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,GAAG,MAAM,KAAK,KAAK,EAAE,CAAC,CAAC,CAAC,MAAM;yBAC5C,CAAC,CAAC;oBACJ,CAAC;gBACF,CAAC;YACF,CAAC,CAAC,CAAC;QACJ,CAAC;IACF,CAAC;IAES,YAAY;QACrB,OAAO,IAAI,CAAC,SAAS,CAAC;IACvB,CAAC;IAES,WAAW;QACpB,MAAM,KAAK,GAAG,IAAI,CAAC,QAAQ,EAAE,CAAC;QAC9B,MAAM,YAAY,GAAG,IAAI,CAAC,YAAY,IAAI,IAAI,CAAC,eAAe,EAAE,CAAC,MAAM,CAAC;QACxE,OAAO;YACN,KAAK;YACL,IAAI,EAAE,QAAQ;YACd,GAAG,EAAE,IAAI,CAAC,GAAG;YACb,YAAY;YACZ,WAAW,EAAE,4BAA4B,YAAY,WAAW;SAChE,CAAC;IACH,CAAC;IAEO,QAAQ;QACf,MAAM,SAAS,GAAG,IAAI,CAAC,QAAQ,CAAC,KAAK,EAAE,IAAI,EAAE,CAAC;QAC9C,IAAI,SAAS,IAAI,SAAS,KAAK,QAAQ,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC1E,OAAO,SAAS,CAAC;QAClB,CAAC;QAED,MAAM,aAAa,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,aAAa,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;QACtF,IAAI,aAAa,EAAE,CAAC;YACnB,OAAO,aAAa,CAAC;QACtB,CAAC;QAED,MAAM,cAAc,GAAG,IAAI,CAAC,sBAAsB,EAAE,IAAI,CAAC,CAAC,CAAC,EAAE,aAAa,CAAC,aAAa,CAAC,CAAC;QAC1F,IAAI,cAAc,EAAE,CAAC;YACpB,MAAM,IAAI,GAAG,cAAc,CAAC,WAAW,IAAI,EAAE,CAAC;YAC9C,OAAO,IAAI,CAAC,MAAM,GAAG,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC;QAC5D,CAAC;QAED,OAAO,qBAAqB,CAAC;IAC9B,CAAC;CACD;AA7HD,0CA6HC"}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import { BaseExtractor } from './_base';
|
|
2
|
+
import { ExtractorResult } from '../types/extractors';
|
|
3
|
+
/**
 * Type declaration for the GitHub extractor, a `BaseExtractor` subclass.
 *
 * Private members are listed by name only; their types and signatures are
 * not part of this declaration.
 */
export declare class GitHubExtractor extends BaseExtractor {
    private isIssue;
    private isPR;
    /**
     * @param document - Document handed to the extractor.
     * @param url - URL associated with that document.
     */
    constructor(document: Document, url: string);
    /**
     * @returns A boolean; presumably whether this extractor applies to the
     * given page (confirm against the implementation).
     */
    canExtract(): boolean;
    /**
     * @returns An `ExtractorResult` as declared in `../types/extractors`.
     */
    extract(): ExtractorResult;
    private createContentHtml;
    private getIssueContent;
    private extractComments;
    private getPRBody;
    private getPRContent;
    private extractPRComments;
    private extractAuthor;
    private cleanBodyContent;
    private extractNumber;
    private extractRepoInfo;
    private createDescription;
}
|