defuddle 0.6.1 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/constants.js +14 -4
- package/dist/constants.js.map +1 -1
- package/dist/defuddle.d.ts +5 -1
- package/dist/defuddle.js +73 -40
- package/dist/defuddle.js.map +1 -1
- package/dist/elements/code.js +4 -1
- package/dist/elements/code.js.map +1 -1
- package/dist/elements/images.js +153 -77
- package/dist/elements/images.js.map +1 -1
- package/dist/extractor-registry.js +7 -0
- package/dist/extractor-registry.js.map +1 -1
- package/dist/extractors/chatgpt.js +12 -21
- package/dist/extractors/chatgpt.js.map +1 -1
- package/dist/extractors/grok.d.ts +15 -0
- package/dist/extractors/grok.js +141 -0
- package/dist/extractors/grok.js.map +1 -0
- package/dist/index.full.js +1 -1
- package/dist/index.js +1 -1
- package/dist/markdown.js +1 -1
- package/dist/markdown.js.map +1 -1
- package/dist/standardize.js +3 -2
- package/dist/standardize.js.map +1 -1
- package/dist/types.d.ts +19 -1
- package/package.json +1 -1
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.GrokExtractor = void 0;
|
|
4
|
+
const _conversation_1 = require("./_conversation");
|
|
5
|
+
/**
 * Conversation extractor for Grok chat pages.
 *
 * Locates message containers through Grok's CSS utility classes, converts each
 * one into an `{ author, content, metadata: { role } }` record, and rewrites
 * external links inside assistant messages into numbered footnote references.
 *
 * NOTE(review): `this.document` and `this.url` are read below but never assigned
 * in this class; they are presumably set by the ConversationExtractor
 * constructor - confirm against the base class.
 */
class GrokExtractor extends _conversation_1.ConversationExtractor {
    /**
     * @param {Document} document - Page to extract the conversation from.
     * @param {string} url - Page URL; forwarded to the base class constructor.
     */
    constructor(document, url) {
        super(document, url);
        // Note: This selector relies heavily on CSS utility classes and may break if Grok's UI changes.
        this.messageContainerSelector = '.relative.group.flex.flex-col.justify-center.w-full';
        // Queried once at construction time; later DOM changes are not picked up.
        this.messageBubbles = document.querySelectorAll(this.messageContainerSelector);
        this.footnotes = [];
        this.footnoteCounter = 0;
    }

    /**
     * @returns {boolean} True when at least one message container was found on the page.
     */
    canExtract() {
        return !!this.messageBubbles && this.messageBubbles.length > 0;
    }

    /**
     * Builds the ordered list of conversation messages.
     *
     * Resets the footnote state first, so `getFootnotes()` afterwards reflects
     * only the links seen during the most recent call.
     *
     * @returns {Array<{author: string, content: string, metadata: {role: string}}>}
     *   User messages carry plain text; assistant messages carry HTML with
     *   links replaced by footnote references. Empty messages are omitted.
     */
    extractMessages() {
        const messages = [];
        this.footnotes = [];
        this.footnoteCounter = 0;
        if (!this.messageBubbles || this.messageBubbles.length === 0) {
            return messages;
        }
        this.messageBubbles.forEach((container) => {
            // Note: Relies on layout classes 'items-end' and 'items-start' which might change.
            const isUserMessage = container.classList.contains('items-end');
            const isGrokMessage = container.classList.contains('items-start');
            if (!isUserMessage && !isGrokMessage) {
                return; // Skip elements that aren't clearly user or Grok messages
            }
            const messageBubble = container.querySelector('.message-bubble');
            if (!messageBubble) {
                return; // Skip if the core message bubble isn't found
            }
            let content = '';
            let role = '';
            let author = '';
            if (isUserMessage) {
                // The bubble's textContent is used as-is; this is less brittle
                // than selecting specific inner spans.
                content = messageBubble.textContent || '';
                role = 'user';
                author = 'You';
            }
            else {
                // The guard above guarantees this is a Grok ('items-start') message.
                role = 'assistant';
                author = 'Grok';
                // Clone the bubble to modify it without affecting the original page
                const clonedBubble = messageBubble.cloneNode(true);
                // Remove the DeepSearch artifact. Only the first matching element is removed.
                clonedBubble.querySelector('.relative.border.border-border-l1.bg-surface-base')?.remove();
                // Add selectors here for any other known elements to remove (e.g., buttons, toolbars within the bubble)
                content = this.processFootnotes(clonedBubble.innerHTML);
            }
            const trimmed = content.trim();
            if (trimmed) {
                messages.push({
                    author: author,
                    content: trimmed,
                    metadata: {
                        role: role
                    }
                });
            }
        });
        return messages;
    }

    /**
     * @returns {Array<{url: string, text: string}>} Footnotes collected by the
     *   most recent `extractMessages()` call; `text` is an HTML link string.
     */
    getFootnotes() {
        return this.footnotes;
    }

    /**
     * Describes the conversation as a whole.
     *
     * `messageCount` is an estimate: it counts every container matched by the
     * selector in the constructor, including containers that
     * `extractMessages()` skips (no role class, no bubble, or empty content).
     *
     * @returns {{title: string, site: string, url: string, messageCount: number, description: string}}
     */
    getMetadata() {
        const title = this.getTitle();
        const messageCount = this.messageBubbles?.length || 0;
        return {
            title,
            site: 'Grok',
            url: this.url,
            messageCount: messageCount, // Use estimated count
            description: `Grok conversation with ${messageCount} messages`
        };
    }

    /**
     * Picks a title for the conversation.
     *
     * Order of preference: the page title (unless it is the generic 'Grok' or
     * starts with 'Grok by '), then the first user message truncated to 50
     * characters, then the fixed string 'Grok Conversation'.
     *
     * @returns {string}
     */
    getTitle() {
        // Try to get the page title first (more reliable)
        const pageTitle = this.document.title?.trim();
        if (pageTitle && pageTitle !== 'Grok' && !pageTitle.startsWith('Grok by ')) {
            // Remove ' - Grok' suffix if present
            return pageTitle.replace(/\s-\s*Grok$/, '').trim();
        }
        // Fallback: Find the first user message bubble and use its text content
        // Note: Still relies on 'items-end' class.
        const firstUserContainer = this.document.querySelector(`${this.messageContainerSelector}.items-end`);
        if (firstUserContainer) {
            const messageBubble = firstUserContainer.querySelector('.message-bubble');
            if (messageBubble) {
                const text = messageBubble.textContent?.trim() || '';
                // Truncate to first 50 characters if longer
                return text.length > 50 ? text.slice(0, 50) + '...' : text;
            }
        }
        return 'Grok Conversation'; // Default fallback
    }

    /**
     * Replaces every external `<a href="http(s)://...">` in an HTML string with
     * its inner HTML followed by a numbered `<sup>` footnote reference, and
     * records each distinct URL once in `this.footnotes`.
     *
     * Links with an empty href, a '#' anchor, or a non-http(s) URL are left
     * untouched. A URL seen again reuses its existing footnote number.
     *
     * @param {string} content - HTML fragment to process.
     * @returns {string} The fragment with qualifying links rewritten.
     */
    processFootnotes(content) {
        // Captures href (group 1) and the link's inner HTML (group 2).
        // The 's' flag lets '.' cross line breaks, so links whose inner HTML
        // spans several lines are converted too instead of being skipped.
        const linkPattern = /<a\s+(?:[^>]*?\s+)?href="([^"]*)"[^>]*>(.*?)<\/a>/gis;
        return content.replace(linkPattern, (match, url, linkText) => {
            // Skip processing for internal anchor links, empty URLs, or non-http(s) protocols
            if (!url || url.startsWith('#') || !url.match(/^https?:\/\//i)) {
                return match;
            }
            // One lookup gives the 1-based number of an existing footnote, or 0 when the URL is new.
            let footnoteIndex = this.footnotes.findIndex((fn) => fn.url === url) + 1;
            if (footnoteIndex === 0) {
                this.footnoteCounter++;
                footnoteIndex = this.footnoteCounter;
                let domainText;
                try {
                    // Label the footnote with the bare hostname (leading 'www.' dropped).
                    const domain = new URL(url).hostname.replace(/^www\./, '');
                    domainText = `<a href="${url}" target="_blank" rel="noopener noreferrer">${domain}</a>`;
                }
                catch {
                    // Fall back to the full URL as the label if it cannot be parsed.
                    domainText = `<a href="${url}" target="_blank" rel="noopener noreferrer">${url}</a>`;
                    console.warn(`GrokExtractor: Could not parse URL for footnote: ${url}`);
                }
                this.footnotes.push({
                    url,
                    text: domainText // Store the link HTML directly
                });
            }
            // Emit the original link text un-linked, followed by the footnote reference.
            return `${linkText}<sup id="fnref:${footnoteIndex}" class="footnote-ref"><a href="#fn:${footnoteIndex}" class="footnote-link">${footnoteIndex}</a></sup>`;
        });
    }
}
|
|
140
|
+
exports.GrokExtractor = GrokExtractor;
|
|
141
|
+
//# sourceMappingURL=grok.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"grok.js","sourceRoot":"","sources":["../../src/extractors/grok.ts"],"names":[],"mappings":";;;AAAA,mDAAwD;AAGxD,MAAa,aAAc,SAAQ,qCAAqB;IAOvD,YAAY,QAAkB,EAAE,GAAW;QAC1C,KAAK,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;QAPtB,gGAAgG;QACxF,6BAAwB,GAAG,qDAAqD,CAAC;QAOxF,IAAI,CAAC,cAAc,GAAG,QAAQ,CAAC,gBAAgB,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAC;QAC/E,IAAI,CAAC,SAAS,GAAG,EAAE,CAAC;QACpB,IAAI,CAAC,eAAe,GAAG,CAAC,CAAC;IAC1B,CAAC;IAED,UAAU;QACT,OAAO,CAAC,CAAC,IAAI,CAAC,cAAc,IAAI,IAAI,CAAC,cAAc,CAAC,MAAM,GAAG,CAAC,CAAC;IAChE,CAAC;IAES,eAAe;QACxB,MAAM,QAAQ,GAA0B,EAAE,CAAC;QAC3C,IAAI,CAAC,SAAS,GAAG,EAAE,CAAC;QACpB,IAAI,CAAC,eAAe,GAAG,CAAC,CAAC;QAEzB,IAAI,CAAC,IAAI,CAAC,cAAc,IAAI,IAAI,CAAC,cAAc,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,QAAQ,CAAC;QAE9E,IAAI,CAAC,cAAc,CAAC,OAAO,CAAC,CAAC,SAAS,EAAE,EAAE;YACzC,mFAAmF;YACnF,MAAM,aAAa,GAAG,SAAS,CAAC,SAAS,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC;YAChE,MAAM,aAAa,GAAG,SAAS,CAAC,SAAS,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC;YAElE,IAAI,CAAC,aAAa,IAAI,CAAC,aAAa;gBAAE,OAAO,CAAC,0DAA0D;YAExG,MAAM,aAAa,GAAG,SAAS,CAAC,aAAa,CAAC,iBAAiB,CAAC,CAAC;YACjE,IAAI,CAAC,aAAa;gBAAE,OAAO,CAAC,8CAA8C;YAE1E,IAAI,OAAO,GAAW,EAAE,CAAC;YACzB,IAAI,IAAI,GAAW,EAAE,CAAC;YACtB,IAAI,MAAM,GAAW,EAAE,CAAC;YAExB,IAAI,aAAa,EAAE,CAAC;gBACnB,mEAAmE;gBACnE,8EAA8E;gBAC9E,OAAO,GAAG,aAAa,CAAC,WAAW,IAAI,EAAE,CAAC;gBAC1C,IAAI,GAAG,MAAM,CAAC;gBACd,MAAM,GAAG,KAAK,CAAC,CAAC,8DAA8D;YAC/E,CAAC;iBAAM,IAAI,aAAa,EAAE,CAAC;gBAC1B,IAAI,GAAG,WAAW,CAAC;gBACnB,MAAM,GAAG,MAAM,CAAC,CAAC,8DAA8D;gBAE/E,oEAAoE;gBACpE,MAAM,YAAY,GAAG,aAAa,CAAC,SAAS,CAAC,IAAI,CAAY,CAAC;gBAE9D,iEAAiE;gBACjE,YAAY,CAAC,aAAa,CAAC,mDAAmD,CAAC,EAAE,MAAM,EAAE,CAAC;gBAC1F,wGAAwG;gBAExG,OAAO,GAAG,YAAY,CAAC,SAAS,CAAC;gBAEjC,iDAAiD;gBACjD,OAAO,GAAG,IAAI,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC;YAC1C,CAAC;YAED,IAAI,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;gBACpB,QAAQ,CAAC,IAAI,CAAC;oBACb,MAAM,EAAE,MAAM;oBACd,OAAO,EAAE,OAAO,CAAC,IAAI,EAAE;oBACvB,QAAQ,EAAE;wBACT,IAAI,EAAE,IAAI;qBACV;iBACD,CAAC,CAAC;YACJ,CAAC;QACF,CAAC,CAAC,CAAC;QAEH,OAAO,QAAQ,CAAC;IACjB,CAAC;IAES,YAAY;QACrB,OAAO,IAAI,C
AAC,SAAS,CAAC;IACvB,CAAC;IAES,WAAW;QACpB,MAAM,KAAK,GAAG,IAAI,CAAC,QAAQ,EAAE,CAAC;QAC9B,MAAM,YAAY,GAAG,IAAI,CAAC,cAAc,EAAE,MAAM,IAAI,CAAC,CAAC;QAEtD,OAAO;YACN,KAAK;YACL,IAAI,EAAE,MAAM;YACZ,GAAG,EAAE,IAAI,CAAC,GAAG;YACb,YAAY,EAAE,YAAY,EAAE,sBAAsB;YAClD,WAAW,EAAE,0BAA0B,YAAY,WAAW;SAC9D,CAAC;IACH,CAAC;IAEO,QAAQ;QACf,kDAAkD;QAClD,MAAM,SAAS,GAAG,IAAI,CAAC,QAAQ,CAAC,KAAK,EAAE,IAAI,EAAE,CAAC;QAC9C,IAAI,SAAS,IAAI,SAAS,KAAK,MAAM,IAAI,CAAC,SAAS,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;YAC5E,qCAAqC;YACrC,OAAO,SAAS,CAAC,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACpD,CAAC;QAED,wEAAwE;QACxE,2CAA2C;QAC3C,MAAM,kBAAkB,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,GAAG,IAAI,CAAC,wBAAwB,YAAY,CAAC,CAAC;QACrG,IAAI,kBAAkB,EAAE,CAAC;YACxB,MAAM,aAAa,GAAG,kBAAkB,CAAC,aAAa,CAAC,iBAAiB,CAAC,CAAC;YAC1E,IAAI,aAAa,EAAE,CAAC;gBACnB,MAAM,IAAI,GAAG,aAAa,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;gBACrD,4CAA4C;gBAC5C,OAAO,IAAI,CAAC,MAAM,GAAG,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC;YAC5D,CAAC;QACF,CAAC;QAED,OAAO,mBAAmB,CAAC,CAAC,mBAAmB;IAChD,CAAC;IAEO,gBAAgB,CAAC,OAAe;QACvC,qDAAqD;QACrD,MAAM,WAAW,GAAG,qDAAqD,CAAC,CAAC,wBAAwB;QAEnG,OAAO,OAAO,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,KAAK,EAAE,GAAG,EAAE,QAAQ,EAAE,EAAE;YAC3D,kFAAkF;YACnF,IAAI,CAAC,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,eAAe,CAAC,EAAE,CAAC;gBAChE,OAAO,KAAK,CAAC;YACd,CAAC;YAED,oDAAoD;YACpD,IAAI,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,KAAK,GAAG,CAAC,CAAC;YACzD,IAAI,aAAqB,CAAC;YAE1B,IAAI,CAAC,QAAQ,EAAE,CAAC;gBACf,6CAA6C;gBAC7C,IAAI,CAAC,eAAe,EAAE,CAAC;gBACvB,aAAa,GAAG,IAAI,CAAC,eAAe,CAAC;gBAErC,IAAI,UAAU,GAAG,GAAG,CAAC,CAAC,uCAAuC;gBAC7D,IAAI,CAAC;oBACJ,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;oBAC3D,UAAU,GAAG,YAAY,GAAG,+CAA+C,MAAM,MAAM,CAAC;gBACzF,CAAC;gBAAC,OAAO,CAAC,EAAE,CAAC;oBACZ,uDAAuD;oBACvD,UAAU,GAAG,YAAY,GAAG,+CAA+C,GAAG,MAAM,CAAC;oBACrF,OAAO,CAAC,IAAI,CAAC,oDAAoD,GAAG,EAAE,CAAC,CAAC;gBAC
zE,CAAC;gBAED,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC;oBACnB,GAAG;oBACH,IAAI,EAAE,UAAU,CAAC,+BAA+B;iBAChD,CAAC,CAAC;YACJ,CAAC;iBAAM,CAAC;gBACP,kDAAkD;gBAClD,aAAa,GAAG,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,KAAK,GAAG,CAAC,GAAG,CAAC,CAAC;YACpE,CAAC;YAED,kEAAkE;YAClE,0FAA0F;YAC1F,OAAO,GAAG,QAAQ,kBAAkB,aAAa,uCAAuC,aAAa,2BAA2B,aAAa,YAAY,CAAC;QAC3J,CAAC,CAAC,CAAC;IACJ,CAAC;CACD;AA/JD,sCA+JC"}
|