defuddle 0.3.8 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +61 -48
- package/dist/constants.js +744 -0
- package/dist/constants.js.map +1 -0
- package/dist/defuddle.d.ts +3 -2
- package/dist/defuddle.js +1676 -0
- package/dist/defuddle.js.map +1 -0
- package/dist/elements/code.js +287 -0
- package/dist/elements/code.js.map +1 -0
- package/dist/elements/headings.js +95 -0
- package/dist/elements/headings.js.map +1 -0
- package/dist/elements/math.base.js +192 -0
- package/dist/elements/math.base.js.map +1 -0
- package/dist/elements/math.full.d.ts +1 -1
- package/dist/elements/math.full.js +121 -0
- package/dist/elements/math.full.js.map +1 -0
- package/dist/extractor-registry.d.ts +15 -0
- package/dist/extractor-registry.js +101 -0
- package/dist/extractor-registry.js.map +1 -0
- package/dist/extractors/_base.d.ts +9 -0
- package/dist/extractors/_base.js +12 -0
- package/dist/extractors/_base.js.map +1 -0
- package/dist/extractors/_conversation.d.ts +9 -0
- package/dist/extractors/_conversation.js +77 -0
- package/dist/extractors/_conversation.js.map +1 -0
- package/dist/extractors/chatgpt.d.ts +13 -0
- package/dist/extractors/chatgpt.js +142 -0
- package/dist/extractors/chatgpt.js.map +1 -0
- package/dist/extractors/claude.d.ts +10 -0
- package/dist/extractors/claude.js +87 -0
- package/dist/extractors/claude.js.map +1 -0
- package/dist/extractors/hackernews.d.ts +21 -0
- package/dist/extractors/hackernews.js +206 -0
- package/dist/extractors/hackernews.js.map +1 -0
- package/dist/extractors/reddit.d.ts +16 -0
- package/dist/extractors/reddit.js +143 -0
- package/dist/extractors/reddit.js.map +1 -0
- package/dist/extractors/twitter.d.ts +16 -0
- package/dist/extractors/twitter.js +199 -0
- package/dist/extractors/twitter.js.map +1 -0
- package/dist/extractors/youtube.d.ts +12 -0
- package/dist/extractors/youtube.js +53 -0
- package/dist/extractors/youtube.js.map +1 -0
- package/dist/index.full.js +19181 -1
- package/dist/index.full.js.map +1 -0
- package/dist/index.js +6 -1
- package/dist/index.js.map +1 -0
- package/dist/markdown.d.ts +1 -0
- package/dist/markdown.js +545 -0
- package/dist/markdown.js.map +1 -0
- package/dist/metadata.js +268 -0
- package/dist/metadata.js.map +1 -0
- package/dist/node.d.ts +12 -0
- package/dist/node.js +50 -0
- package/dist/node.js.map +1 -0
- package/dist/scoring.d.ts +8 -0
- package/dist/scoring.js +95 -0
- package/dist/scoring.js.map +1 -0
- package/dist/types/extractors.d.ts +41 -0
- package/dist/types/extractors.js +3 -0
- package/dist/types/extractors.js.map +1 -0
- package/dist/types.d.ts +14 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/package.json +19 -5
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.mathSelectors = exports.isBlockDisplay = exports.getBasicLatexFromElement = exports.getMathMLFromElement = void 0;
|
|
4
|
+
const getMathMLFromElement = (el) => {
|
|
5
|
+
// 1. Direct MathML content
|
|
6
|
+
if (el.tagName.toLowerCase() === 'math') {
|
|
7
|
+
const isBlock = el.getAttribute('display') === 'block';
|
|
8
|
+
return {
|
|
9
|
+
mathml: el.outerHTML,
|
|
10
|
+
latex: el.getAttribute('alttext') || null,
|
|
11
|
+
isBlock
|
|
12
|
+
};
|
|
13
|
+
}
|
|
14
|
+
// 2. MathML in data-mathml attribute
|
|
15
|
+
const mathmlStr = el.getAttribute('data-mathml');
|
|
16
|
+
if (mathmlStr) {
|
|
17
|
+
const tempDiv = document.createElement('div');
|
|
18
|
+
tempDiv.innerHTML = mathmlStr;
|
|
19
|
+
const mathElement = tempDiv.querySelector('math');
|
|
20
|
+
if (mathElement) {
|
|
21
|
+
const isBlock = mathElement.getAttribute('display') === 'block';
|
|
22
|
+
return {
|
|
23
|
+
mathml: mathElement.outerHTML,
|
|
24
|
+
latex: mathElement.getAttribute('alttext') || null,
|
|
25
|
+
isBlock
|
|
26
|
+
};
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
// 3. MathJax assistive MathML
|
|
30
|
+
const assistiveMmlContainer = el.querySelector('.MJX_Assistive_MathML, mjx-assistive-mml');
|
|
31
|
+
if (assistiveMmlContainer) {
|
|
32
|
+
const mathElement = assistiveMmlContainer.querySelector('math');
|
|
33
|
+
if (mathElement) {
|
|
34
|
+
// Check both the math element and container for display mode
|
|
35
|
+
const mathDisplayAttr = mathElement.getAttribute('display');
|
|
36
|
+
const containerDisplayAttr = assistiveMmlContainer.getAttribute('display');
|
|
37
|
+
const isBlock = mathDisplayAttr === 'block' || containerDisplayAttr === 'block';
|
|
38
|
+
return {
|
|
39
|
+
mathml: mathElement.outerHTML,
|
|
40
|
+
latex: mathElement.getAttribute('alttext') || null,
|
|
41
|
+
isBlock
|
|
42
|
+
};
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
// 4. KaTeX MathML
|
|
46
|
+
const katexMathml = el.querySelector('.katex-mathml math');
|
|
47
|
+
if (katexMathml) {
|
|
48
|
+
return {
|
|
49
|
+
mathml: katexMathml.outerHTML,
|
|
50
|
+
latex: null, // We'll get LaTeX separately for KaTeX
|
|
51
|
+
isBlock: false // We'll determine this from container
|
|
52
|
+
};
|
|
53
|
+
}
|
|
54
|
+
return null;
|
|
55
|
+
};
|
|
56
|
+
exports.getMathMLFromElement = getMathMLFromElement;
|
|
57
|
+
const getBasicLatexFromElement = (el) => {
|
|
58
|
+
// Direct data-latex attribute
|
|
59
|
+
const dataLatex = el.getAttribute('data-latex');
|
|
60
|
+
if (dataLatex) {
|
|
61
|
+
return dataLatex;
|
|
62
|
+
}
|
|
63
|
+
// WordPress LaTeX images
|
|
64
|
+
if (el.tagName.toLowerCase() === 'img' && el.classList.contains('latex')) {
|
|
65
|
+
// Try alt text first as it's cleaner
|
|
66
|
+
const altLatex = el.getAttribute('alt');
|
|
67
|
+
if (altLatex) {
|
|
68
|
+
return altLatex;
|
|
69
|
+
}
|
|
70
|
+
// Fallback to extracting from URL
|
|
71
|
+
const src = el.getAttribute('src');
|
|
72
|
+
if (src) {
|
|
73
|
+
const match = src.match(/latex\.php\?latex=([^&]+)/);
|
|
74
|
+
if (match) {
|
|
75
|
+
return decodeURIComponent(match[1])
|
|
76
|
+
.replace(/\+/g, ' ') // Replace + with spaces
|
|
77
|
+
.replace(/%5C/g, '\\'); // Fix escaped backslashes
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
// LaTeX in annotation
|
|
82
|
+
const annotation = el.querySelector('annotation[encoding="application/x-tex"]');
|
|
83
|
+
if (annotation?.textContent) {
|
|
84
|
+
return annotation.textContent.trim();
|
|
85
|
+
}
|
|
86
|
+
// KaTeX formats
|
|
87
|
+
if (el.matches('.katex')) {
|
|
88
|
+
const katexAnnotation = el.querySelector('.katex-mathml annotation[encoding="application/x-tex"]');
|
|
89
|
+
if (katexAnnotation?.textContent) {
|
|
90
|
+
return katexAnnotation.textContent.trim();
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
// MathJax scripts
|
|
94
|
+
if (el.matches('script[type="math/tex"]') || el.matches('script[type="math/tex; mode=display"]')) {
|
|
95
|
+
return el.textContent?.trim() || null;
|
|
96
|
+
}
|
|
97
|
+
// Check for sibling script element
|
|
98
|
+
if (el.parentElement) {
|
|
99
|
+
const siblingScript = el.parentElement.querySelector('script[type="math/tex"], script[type="math/tex; mode=display"]');
|
|
100
|
+
if (siblingScript) {
|
|
101
|
+
return siblingScript.textContent?.trim() || null;
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
// Fallback to alt text or text content
|
|
105
|
+
return el.getAttribute('alt') || el.textContent?.trim() || null;
|
|
106
|
+
};
|
|
107
|
+
exports.getBasicLatexFromElement = getBasicLatexFromElement;
|
|
108
|
+
const isBlockDisplay = (el) => {
|
|
109
|
+
// Check explicit display attribute
|
|
110
|
+
const displayAttr = el.getAttribute('display');
|
|
111
|
+
if (displayAttr === 'block') {
|
|
112
|
+
return true;
|
|
113
|
+
}
|
|
114
|
+
// Check common class names
|
|
115
|
+
const classNames = el.className.toLowerCase();
|
|
116
|
+
if (classNames.includes('display') || classNames.includes('block')) {
|
|
117
|
+
return true;
|
|
118
|
+
}
|
|
119
|
+
// Check container classes
|
|
120
|
+
const container = el.closest('.katex-display, .MathJax_Display, [data-display="block"]');
|
|
121
|
+
if (container) {
|
|
122
|
+
return true;
|
|
123
|
+
}
|
|
124
|
+
// Check if preceded by block element
|
|
125
|
+
const prevElement = el.previousElementSibling;
|
|
126
|
+
if (prevElement?.tagName.toLowerCase() === 'p') {
|
|
127
|
+
return true;
|
|
128
|
+
}
|
|
129
|
+
// Check specific formats
|
|
130
|
+
if (el.matches('.mwe-math-fallback-image-display')) {
|
|
131
|
+
return true;
|
|
132
|
+
}
|
|
133
|
+
// Check KaTeX display mode
|
|
134
|
+
if (el.matches('.katex')) {
|
|
135
|
+
// KaTeX elements are inline by default
|
|
136
|
+
// Only block if explicitly marked as display
|
|
137
|
+
return el.closest('.katex-display') !== null;
|
|
138
|
+
}
|
|
139
|
+
// Check MathJax v3 display attribute
|
|
140
|
+
if (el.hasAttribute('display')) {
|
|
141
|
+
return el.getAttribute('display') === 'true';
|
|
142
|
+
}
|
|
143
|
+
// Check MathJax script display attribute
|
|
144
|
+
if (el.matches('script[type="math/tex; mode=display"]')) {
|
|
145
|
+
return true;
|
|
146
|
+
}
|
|
147
|
+
if (el.hasAttribute('display')) {
|
|
148
|
+
return el.getAttribute('display') === 'true';
|
|
149
|
+
}
|
|
150
|
+
// Check parent container display attribute
|
|
151
|
+
const parentContainer = el.closest('[display]');
|
|
152
|
+
if (parentContainer) {
|
|
153
|
+
return parentContainer.getAttribute('display') === 'true';
|
|
154
|
+
}
|
|
155
|
+
return false;
|
|
156
|
+
};
|
|
157
|
+
exports.isBlockDisplay = isBlockDisplay;
|
|
158
|
+
// Shared selector for math elements
|
|
159
|
+
exports.mathSelectors = [
|
|
160
|
+
// WordPress LaTeX images
|
|
161
|
+
'img.latex[src*="latex.php"]',
|
|
162
|
+
// MathJax elements (v2 and v3)
|
|
163
|
+
'span.MathJax',
|
|
164
|
+
'mjx-container',
|
|
165
|
+
'script[type="math/tex"]',
|
|
166
|
+
'script[type="math/tex; mode=display"]',
|
|
167
|
+
'.MathJax_Preview + script[type="math/tex"]',
|
|
168
|
+
'.MathJax_Display',
|
|
169
|
+
'.MathJax_SVG',
|
|
170
|
+
'.MathJax_MathML',
|
|
171
|
+
// MediaWiki math elements
|
|
172
|
+
'.mwe-math-element',
|
|
173
|
+
'.mwe-math-fallback-image-inline',
|
|
174
|
+
'.mwe-math-fallback-image-display',
|
|
175
|
+
'.mwe-math-mathml-inline',
|
|
176
|
+
'.mwe-math-mathml-display',
|
|
177
|
+
// KaTeX elements
|
|
178
|
+
'.katex',
|
|
179
|
+
'.katex-display',
|
|
180
|
+
'.katex-mathml',
|
|
181
|
+
'.katex-html',
|
|
182
|
+
'[data-katex]',
|
|
183
|
+
'script[type="math/katex"]',
|
|
184
|
+
// Generic math elements and other formats
|
|
185
|
+
'math',
|
|
186
|
+
'[data-math]',
|
|
187
|
+
'[data-latex]',
|
|
188
|
+
'[data-tex]',
|
|
189
|
+
'script[type^="math/"]',
|
|
190
|
+
'annotation[encoding="application/x-tex"]'
|
|
191
|
+
].join(',');
|
|
192
|
+
//# sourceMappingURL=math.base.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"math.base.js","sourceRoot":"","sources":["../../src/elements/math.base.ts"],"names":[],"mappings":";;;AAMO,MAAM,oBAAoB,GAAG,CAAC,EAAW,EAAmB,EAAE;IACpE,2BAA2B;IAC3B,IAAI,EAAE,CAAC,OAAO,CAAC,WAAW,EAAE,KAAK,MAAM,EAAE,CAAC;QACzC,MAAM,OAAO,GAAG,EAAE,CAAC,YAAY,CAAC,SAAS,CAAC,KAAK,OAAO,CAAC;QACvD,OAAO;YACN,MAAM,EAAE,EAAE,CAAC,SAAS;YACpB,KAAK,EAAE,EAAE,CAAC,YAAY,CAAC,SAAS,CAAC,IAAI,IAAI;YACzC,OAAO;SACP,CAAC;IACH,CAAC;IAED,qCAAqC;IACrC,MAAM,SAAS,GAAG,EAAE,CAAC,YAAY,CAAC,aAAa,CAAC,CAAC;IACjD,IAAI,SAAS,EAAE,CAAC;QACf,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;QAC9C,OAAO,CAAC,SAAS,GAAG,SAAS,CAAC;QAC9B,MAAM,WAAW,GAAG,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;QAClD,IAAI,WAAW,EAAE,CAAC;YACjB,MAAM,OAAO,GAAG,WAAW,CAAC,YAAY,CAAC,SAAS,CAAC,KAAK,OAAO,CAAC;YAChE,OAAO;gBACN,MAAM,EAAE,WAAW,CAAC,SAAS;gBAC7B,KAAK,EAAE,WAAW,CAAC,YAAY,CAAC,SAAS,CAAC,IAAI,IAAI;gBAClD,OAAO;aACP,CAAC;QACH,CAAC;IACF,CAAC;IAED,8BAA8B;IAC9B,MAAM,qBAAqB,GAAG,EAAE,CAAC,aAAa,CAAC,0CAA0C,CAAC,CAAC;IAE3F,IAAI,qBAAqB,EAAE,CAAC;QAC3B,MAAM,WAAW,GAAG,qBAAqB,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;QAEhE,IAAI,WAAW,EAAE,CAAC;YACjB,6DAA6D;YAC7D,MAAM,eAAe,GAAG,WAAW,CAAC,YAAY,CAAC,SAAS,CAAC,CAAC;YAC5D,MAAM,oBAAoB,GAAG,qBAAqB,CAAC,YAAY,CAAC,SAAS,CAAC,CAAC;YAC3E,MAAM,OAAO,GAAG,eAAe,KAAK,OAAO,IAAI,oBAAoB,KAAK,OAAO,CAAC;YAEhF,OAAO;gBACN,MAAM,EAAE,WAAW,CAAC,SAAS;gBAC7B,KAAK,EAAE,WAAW,CAAC,YAAY,CAAC,SAAS,CAAC,IAAI,IAAI;gBAClD,OAAO;aACP,CAAC;QACH,CAAC;IACF,CAAC;IAED,kBAAkB;IAClB,MAAM,WAAW,GAAG,EAAE,CAAC,aAAa,CAAC,oBAAoB,CAAC,CAAC;IAC3D,IAAI,WAAW,EAAE,CAAC;QACjB,OAAO;YACN,MAAM,EAAE,WAAW,CAAC,SAAS;YAC7B,KAAK,EAAE,IAAI,EAAE,uCAAuC;YACpD,OAAO,EAAE,KAAK,CAAC,sCAAsC;SACrD,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACb,CAAC,CAAC;AA1DW,QAAA,oBAAoB,wBA0D/B;AAEK,MAAM,wBAAwB,GAAG,CAAC,EAAW,EAAiB,EAAE;IACtE,8BAA8B;IAC9B,MAAM,SAAS,GAAG,EAAE,CAAC,YAAY,CAAC,YAAY,CAAC,CAAC;IAChD,IAAI,SAAS,EAAE,CAAC;QACf,OAAO,SAAS,CAAC;IAClB,CAAC;IAED,yBAAyB;IACzB,IAAI,EAAE,CAAC,OAAO,CAAC,WAAW,EAAE,KAAK,KAAK,IAAI,EAAE,CAAC,SAAS,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;QAC
1E,qCAAqC;QACrC,MAAM,QAAQ,GAAG,EAAE,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC;QACxC,IAAI,QAAQ,EAAE,CAAC;YACd,OAAO,QAAQ,CAAC;QACjB,CAAC;QAED,kCAAkC;QAClC,MAAM,GAAG,GAAG,EAAE,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC;QACnC,IAAI,GAAG,EAAE,CAAC;YACT,MAAM,KAAK,GAAG,GAAG,CAAC,KAAK,CAAC,2BAA2B,CAAC,CAAC;YACrD,IAAI,KAAK,EAAE,CAAC;gBACX,OAAO,kBAAkB,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;qBACjC,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,wBAAwB;qBAC5C,OAAO,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,0BAA0B;YACpD,CAAC;QACF,CAAC;IACF,CAAC;IAED,sBAAsB;IACtB,MAAM,UAAU,GAAG,EAAE,CAAC,aAAa,CAAC,0CAA0C,CAAC,CAAC;IAChF,IAAI,UAAU,EAAE,WAAW,EAAE,CAAC;QAC7B,OAAO,UAAU,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC;IACtC,CAAC;IAED,gBAAgB;IAChB,IAAI,EAAE,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC;QAC1B,MAAM,eAAe,GAAG,EAAE,CAAC,aAAa,CAAC,wDAAwD,CAAC,CAAC;QACnG,IAAI,eAAe,EAAE,WAAW,EAAE,CAAC;YAClC,OAAO,eAAe,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC;QAC3C,CAAC;IACF,CAAC;IAED,kBAAkB;IAClB,IAAI,EAAE,CAAC,OAAO,CAAC,yBAAyB,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,uCAAuC,CAAC,EAAE,CAAC;QAClG,OAAO,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,IAAI,CAAC;IACvC,CAAC;IAED,mCAAmC;IACnC,IAAI,EAAE,CAAC,aAAa,EAAE,CAAC;QACtB,MAAM,aAAa,GAAG,EAAE,CAAC,aAAa,CAAC,aAAa,CAAC,gEAAgE,CAAC,CAAC;QACvH,IAAI,aAAa,EAAE,CAAC;YACnB,OAAO,aAAa,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,IAAI,CAAC;QAClD,CAAC;IACF,CAAC;IAED,uCAAuC;IACvC,OAAO,EAAE,CAAC,YAAY,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,IAAI,CAAC;AACjE,CAAC,CAAC;AAxDW,QAAA,wBAAwB,4BAwDnC;AAEK,MAAM,cAAc,GAAG,CAAC,EAAW,EAAW,EAAE;IACtD,mCAAmC;IACnC,MAAM,WAAW,GAAG,EAAE,CAAC,YAAY,CAAC,SAAS,CAAC,CAAC;IAC/C,IAAI,WAAW,KAAK,OAAO,EAAE,CAAC;QAC7B,OAAO,IAAI,CAAC;IACb,CAAC;IAED,2BAA2B;IAC3B,MAAM,UAAU,GAAG,EAAE,CAAC,SAAS,CAAC,WAAW,EAAE,CAAC;IAC9C,IAAI,UAAU,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,UAAU,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;QACpE,OAAO,IAAI,CAAC;IACb,CAAC;IAED,0BAA0B;IAC1B,MAAM,SAAS,GAAG,EAAE,CAAC,OAAO,CAAC,0DAA0D,CAAC,CAAC;IACzF,IAAI,SAAS,EAAE,CAAC;QACf,OAAO,IAAI,CAAC;IACb,CAAC;IAED,qCAAqC;IACrC,MAAM,WAAW,GAAG,EAAE,CAAC,sBAAsB,CAAC;IAC9C,IAAI,WAAW,EAAE,OAAO,CAAC,WAAW,EAAE,KAAK,GAAG,EAAE,
CAAC;QAChD,OAAO,IAAI,CAAC;IACb,CAAC;IAED,yBAAyB;IACzB,IAAI,EAAE,CAAC,OAAO,CAAC,kCAAkC,CAAC,EAAE,CAAC;QACpD,OAAO,IAAI,CAAC;IACb,CAAC;IAED,2BAA2B;IAC3B,IAAI,EAAE,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC;QAC1B,uCAAuC;QACvC,6CAA6C;QAC7C,OAAO,EAAE,CAAC,OAAO,CAAC,gBAAgB,CAAC,KAAK,IAAI,CAAC;IAC9C,CAAC;IAED,qCAAqC;IACrC,IAAI,EAAE,CAAC,YAAY,CAAC,SAAS,CAAC,EAAE,CAAC;QAChC,OAAO,EAAE,CAAC,YAAY,CAAC,SAAS,CAAC,KAAK,MAAM,CAAC;IAC9C,CAAC;IAED,yCAAyC;IACzC,IAAI,EAAE,CAAC,OAAO,CAAC,uCAAuC,CAAC,EAAE,CAAC;QACzD,OAAO,IAAI,CAAC;IACb,CAAC;IAED,IAAI,EAAE,CAAC,YAAY,CAAC,SAAS,CAAC,EAAE,CAAC;QAChC,OAAO,EAAE,CAAC,YAAY,CAAC,SAAS,CAAC,KAAK,MAAM,CAAC;IAC9C,CAAC;IAED,2CAA2C;IAC3C,MAAM,eAAe,GAAG,EAAE,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC;IAChD,IAAI,eAAe,EAAE,CAAC;QACrB,OAAO,eAAe,CAAC,YAAY,CAAC,SAAS,CAAC,KAAK,MAAM,CAAC;IAC3D,CAAC;IAED,OAAO,KAAK,CAAC;AACd,CAAC,CAAC;AA1DW,QAAA,cAAc,kBA0DzB;AAEF,oCAAoC;AACvB,QAAA,aAAa,GAAG;IAC5B,yBAAyB;IACzB,6BAA6B;IAE7B,+BAA+B;IAC/B,cAAc;IACd,eAAe;IACf,yBAAyB;IACzB,uCAAuC;IACvC,4CAA4C;IAC5C,kBAAkB;IAClB,cAAc;IACd,iBAAiB;IAEjB,0BAA0B;IAC1B,mBAAmB;IACnB,iCAAiC;IACjC,kCAAkC;IAClC,yBAAyB;IACzB,0BAA0B;IAE1B,iBAAiB;IACjB,QAAQ;IACR,gBAAgB;IAChB,eAAe;IACf,aAAa;IACb,cAAc;IACd,2BAA2B;IAE3B,0CAA0C;IAC1C,MAAM;IACN,aAAa;IACb,cAAc;IACd,YAAY;IACZ,uBAAuB;IACvB,0CAA0C;CAC1C,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC"}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { MathData } from './math.base';
|
|
2
2
|
export declare const getLatexFromElement: (el: Element) => string | null;
|
|
3
|
-
export declare const createCleanMathEl: (mathData: MathData | null, latex: string | null, isBlock: boolean) => Element;
|
|
3
|
+
export declare const createCleanMathEl: (mathData: MathData | null, latex: string | null, isBlock: boolean, doc: Document) => Element;
|
|
4
4
|
export declare const mathRules: {
|
|
5
5
|
selector: string;
|
|
6
6
|
element: string;
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.mathRules = exports.createCleanMathEl = exports.getLatexFromElement = void 0;
|
|
37
|
+
const mathml_to_latex_1 = require("mathml-to-latex");
|
|
38
|
+
const temml = __importStar(require("temml"));
|
|
39
|
+
const math_base_1 = require("./math.base");
|
|
40
|
+
/**
 * Returns LaTeX for a math element.
 *
 * The basic extractor's result is used when it is non-empty. Otherwise any
 * MathML found for the element is converted with MathMLToLaTeX; a conversion
 * failure is logged as a warning and yields null.
 *
 * @param el Element to inspect.
 * @returns The LaTeX string, or null when none could be obtained.
 */
const getLatexFromElement = (el) => {
    const directLatex = (0, math_base_1.getBasicLatexFromElement)(el);
    if (!directLatex) {
        // No LaTeX on the element itself: fall back to converting its MathML.
        const extracted = (0, math_base_1.getMathMLFromElement)(el);
        const markup = extracted?.mathml;
        if (markup) {
            try {
                return mathml_to_latex_1.MathMLToLaTeX.convert(markup);
            }
            catch (e) {
                console.warn('Failed to convert MathML to LaTeX:', e);
            }
        }
        return null;
    }
    return directLatex;
};
|
|
58
|
+
exports.getLatexFromElement = getLatexFromElement;
|
|
59
|
+
/**
 * Creates a fresh `<math>` element in `doc` carrying the formula.
 *
 * The element always gets `xmlns`, `display` ("block" or "inline") and
 * `data-latex` (empty string when `latex` is null) attributes. Its content
 * comes from `mathData.mathml` when present; otherwise `latex` is rendered
 * with temml. If rendering yields no `<math>` or throws, the LaTeX source is
 * used as plain text content.
 *
 * @param mathData Result of MathML extraction, or null.
 * @param latex LaTeX source, or null.
 * @param isBlock Whether the formula is block-level.
 * @param doc Document used to create elements.
 * @returns The new `<math>` element.
 */
const createCleanMathEl = (mathData, latex, isBlock, doc) => {
    // Parses markup inside a detached <div> and returns its first <math>, if any.
    const findMath = (markup) => {
        const holder = doc.createElement('div');
        holder.innerHTML = markup;
        return holder.querySelector('math');
    };
    const result = doc.createElement('math');
    result.setAttribute('xmlns', 'http://www.w3.org/1998/Math/MathML');
    result.setAttribute('display', isBlock ? 'block' : 'inline');
    result.setAttribute('data-latex', latex || '');
    // Existing MathML takes precedence over LaTeX.
    if (mathData?.mathml) {
        const source = findMath(mathData.mathml);
        if (source) {
            result.innerHTML = source.innerHTML;
        }
        return result;
    }
    if (!latex) {
        return result;
    }
    // Only LaTeX is available: render it to MathML.
    try {
        const rendered = temml.renderToString(latex, {
            displayMode: isBlock,
            throwOnError: false
        });
        const source = findMath(rendered);
        if (source) {
            result.innerHTML = source.innerHTML;
        }
        else {
            result.textContent = latex; // Fallback to LaTeX as text
        }
    }
    catch (e) {
        console.warn('Failed to convert LaTeX to MathML:', e);
        result.textContent = latex; // Fallback to LaTeX as text
    }
    return result;
};
|
|
97
|
+
exports.createCleanMathEl = createCleanMathEl;
|
|
98
|
+
exports.mathRules = [
|
|
99
|
+
{
|
|
100
|
+
selector: math_base_1.mathSelectors,
|
|
101
|
+
element: 'math',
|
|
102
|
+
transform: (el) => {
|
|
103
|
+
// Check if element is an HTMLElement by checking for common properties
|
|
104
|
+
if (!('style' in el) || !('className' in el)) {
|
|
105
|
+
return el;
|
|
106
|
+
}
|
|
107
|
+
const mathData = (0, math_base_1.getMathMLFromElement)(el);
|
|
108
|
+
const latex = (0, exports.getLatexFromElement)(el);
|
|
109
|
+
const isBlock = (0, math_base_1.isBlockDisplay)(el);
|
|
110
|
+
const cleanMathEl = (0, exports.createCleanMathEl)(mathData, latex, isBlock, el.ownerDocument);
|
|
111
|
+
// Clean up any associated math scripts after we've extracted their content
|
|
112
|
+
if (el.parentElement) {
|
|
113
|
+
// Remove all math-related scripts and previews
|
|
114
|
+
const mathElements = el.parentElement.querySelectorAll('script[type^="math/"], .MathJax_Preview, script[type="text/javascript"][src*="mathjax"], script[type="text/javascript"][src*="katex"]');
|
|
115
|
+
mathElements.forEach(el => el.remove());
|
|
116
|
+
}
|
|
117
|
+
return cleanMathEl;
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
];
|
|
121
|
+
//# sourceMappingURL=math.full.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"math.full.js","sourceRoot":"","sources":["../../src/elements/math.full.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,qDAAgD;AAChD,6CAA+B;AAC/B,2CAMqB;AAEd,MAAM,mBAAmB,GAAG,CAAC,EAAW,EAAiB,EAAE;IACjE,mCAAmC;IACnC,MAAM,UAAU,GAAG,IAAA,oCAAwB,EAAC,EAAE,CAAC,CAAC;IAChD,IAAI,UAAU,EAAE,CAAC;QAChB,OAAO,UAAU,CAAC;IACnB,CAAC;IAED,mDAAmD;IACnD,MAAM,QAAQ,GAAG,IAAA,gCAAoB,EAAC,EAAE,CAAC,CAAC;IAC1C,IAAI,QAAQ,EAAE,MAAM,EAAE,CAAC;QACtB,IAAI,CAAC;YACJ,OAAO,+BAAa,CAAC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QAC/C,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACZ,OAAO,CAAC,IAAI,CAAC,oCAAoC,EAAE,CAAC,CAAC,CAAC;QACvD,CAAC;IACF,CAAC;IAED,OAAO,IAAI,CAAC;AACb,CAAC,CAAC;AAlBW,QAAA,mBAAmB,uBAkB9B;AAEK,MAAM,iBAAiB,GAAG,CAAC,QAAyB,EAAE,KAAoB,EAAE,OAAgB,EAAE,GAAa,EAAW,EAAE;IAC9H,MAAM,WAAW,GAAG,GAAG,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;IAE9C,WAAW,CAAC,YAAY,CAAC,OAAO,EAAE,oCAAoC,CAAC,CAAC;IACxE,WAAW,CAAC,YAAY,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;IAClE,WAAW,CAAC,YAAY,CAAC,YAAY,EAAE,KAAK,IAAI,EAAE,CAAC,CAAC;IAEpD,2CAA2C;IAC3C,IAAI,QAAQ,EAAE,MAAM,EAAE,CAAC;QACtB,MAAM,OAAO,GAAG,GAAG,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;QACzC,OAAO,CAAC,SAAS,GAAG,QAAQ,CAAC,MAAM,CAAC;QACpC,MAAM,WAAW,GAAG,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;QAClD,IAAI,WAAW,EAAE,CAAC;YACjB,WAAW,CAAC,SAAS,GAAG,WAAW,CAAC,SAAS,CAAC;QAC/C,CAAC;IACF,CAAC;IACD,6CAA6C;SACxC,IAAI,KAAK,EAAE,CAAC;QAChB,IAAI,CAAC;YACJ,MAAM,MAAM,GAAG,KAAK,CAAC,cAAc,CAAC,KAAK,EAAE;gBAC1C,WAAW,EAAE,OAAO;gBACpB,YAAY,EAAE,KAAK;aACnB,CAAC,CAAC;YACH,MAAM,OAAO,GAAG,GAAG,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;YACzC,OAAO,CAAC,SAAS,GAAG,MAAM,CAAC;YAC3B,MAAM,WAAW,GAAG,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;YAClD,IAAI,WAAW,EAAE,CAAC;gBACjB,WAAW,CAAC,SAAS,GAAG,WAAW,CAAC,SAAS,CAAC;YAC/C,CAAC;iBAAM,CAAC;gBACP,WAAW,CAAC,WAAW,GAAG,KAAK,CAAC,CAAC,4BAA4B;YAC9D,CAAC;QACF,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACZ,OAAO,CAAC,IAAI,CAAC,oCAAoC,EAAE,CAAC,CAAC,CAAC;YACtD,WAAW,CAAC,WAAW,GAAG,KAAK,CAAC,CAAC,4BAA4B;QAC9D,CAAC;IACF,CAAC;IAED,OAAO,WAAW,CAAC;AACpB,CAAC,CAAC;AAt
CW,QAAA,iBAAiB,qBAsC5B;AAEW,QAAA,SAAS,GAAG;IACxB;QACC,QAAQ,EAAE,yBAAa;QACvB,OAAO,EAAE,MAAM;QACf,SAAS,EAAE,CAAC,EAAW,EAAW,EAAE;YACnC,uEAAuE;YACvE,IAAI,CAAC,CAAC,OAAO,IAAI,EAAE,CAAC,IAAI,CAAC,CAAC,WAAW,IAAI,EAAE,CAAC,EAAE,CAAC;gBAC9C,OAAO,EAAE,CAAC;YACX,CAAC;YAED,MAAM,QAAQ,GAAG,IAAA,gCAAoB,EAAC,EAAE,CAAC,CAAC;YAC1C,MAAM,KAAK,GAAG,IAAA,2BAAmB,EAAC,EAAE,CAAC,CAAC;YACtC,MAAM,OAAO,GAAG,IAAA,0BAAc,EAAC,EAAE,CAAC,CAAC;YACnC,MAAM,WAAW,GAAG,IAAA,yBAAiB,EAAC,QAAQ,EAAE,KAAK,EAAE,OAAO,EAAE,EAAE,CAAC,aAAa,CAAC,CAAC;YAElF,2EAA2E;YAC3E,IAAI,EAAE,CAAC,aAAa,EAAE,CAAC;gBACtB,+CAA+C;gBAC/C,MAAM,YAAY,GAAG,EAAE,CAAC,aAAa,CAAC,gBAAgB,CACrD,uIAAuI,CACvI,CAAC;gBACF,YAAY,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;YACzC,CAAC;YAED,OAAO,WAAW,CAAC;QACpB,CAAC;KACD;CACD,CAAC"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import { BaseExtractor } from './extractors/_base';
|
|
2
|
+
/** Constructor signature for an extractor: takes the document, its URL and optional schema.org data, and yields a BaseExtractor. */
type ExtractorConstructor = new (document: Document, url: string, schemaOrgData?: any) => BaseExtractor;
|
|
3
|
+
/** Associates a set of URL patterns with the extractor class to instantiate when one of them matches. */
interface ExtractorMapping {
|
|
4
|
+
/** String entries are matched as substrings of the URL's hostname; RegExp entries are tested against the full URL. */
patterns: (string | RegExp)[];
|
|
5
|
+
/** Extractor class constructed when any pattern matches. */
extractor: ExtractorConstructor;
|
|
6
|
+
}
|
|
7
|
+
/** Static registry that maps URLs to site-specific extractor classes. */
export declare class ExtractorRegistry {
|
|
8
|
+
/** Registered pattern-to-extractor mappings, in registration order. */
private static mappings;
|
|
9
|
+
/** Memoized lookup results (extractor class, or null for "no match"). */
private static domainCache;
|
|
10
|
+
/** Registers the built-in extractors; invoked once when the module is loaded. */
static initialize(): void;
|
|
11
|
+
/** Appends a mapping to the registry. */
static register(mapping: ExtractorMapping): void;
|
|
12
|
+
/** Returns a new extractor instance for the first mapping that matches `url`, or null when none matches or the lookup throws (e.g. `url` cannot be parsed). */
static findExtractor(document: Document, url: string, schemaOrgData?: any): BaseExtractor | null;
|
|
13
|
+
/** Discards all memoized lookup results. */
static clearCache(): void;
|
|
14
|
+
}
|
|
15
|
+
export {};
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.ExtractorRegistry = void 0;
|
|
4
|
+
// Extractors
|
|
5
|
+
const reddit_1 = require("./extractors/reddit");
|
|
6
|
+
const twitter_1 = require("./extractors/twitter");
|
|
7
|
+
const youtube_1 = require("./extractors/youtube");
|
|
8
|
+
const hackernews_1 = require("./extractors/hackernews");
|
|
9
|
+
const chatgpt_1 = require("./extractors/chatgpt");
|
|
10
|
+
const claude_1 = require("./extractors/claude");
|
|
11
|
+
/**
 * Static registry that maps URLs to site-specific extractor classes.
 *
 * Mappings are consulted in registration order; the first mapping with a
 * matching pattern wins. String patterns are matched as substrings of the
 * URL's hostname, RegExp patterns are tested against the full URL.
 */
class ExtractorRegistry {
    /** Registers the built-in extractors with their URL patterns. */
    static initialize() {
        this.register({
            patterns: [
                'twitter.com',
                /\/x\.com\/.*/,
            ],
            extractor: twitter_1.TwitterExtractor
        });
        this.register({
            patterns: [
                'reddit.com',
                'old.reddit.com',
                'new.reddit.com',
                /^https:\/\/[^\/]+\.reddit\.com/
            ],
            extractor: reddit_1.RedditExtractor
        });
        this.register({
            patterns: [
                'youtube.com',
                'youtu.be',
                /youtube\.com\/watch\?v=.*/,
                /youtu\.be\/.*/
            ],
            extractor: youtube_1.YoutubeExtractor
        });
        this.register({
            patterns: [
                /news\.ycombinator\.com\/item\?id=.*/
            ],
            extractor: hackernews_1.HackerNewsExtractor
        });
        this.register({
            patterns: [
                /^https?:\/\/chatgpt\.com\/(c|share)\/.*/
            ],
            extractor: chatgpt_1.ChatGPTExtractor
        });
        this.register({
            patterns: [
                /^https?:\/\/claude\.ai\/(chat|share)\/.*/
            ],
            extractor: claude_1.ClaudeExtractor
        });
    }
    /**
     * Appends a mapping to the registry.
     *
     * @param mapping `{ patterns, extractor }` pair to register.
     */
    static register(mapping) {
        this.mappings.push(mapping);
    }
    /**
     * Finds the extractor responsible for `url` and instantiates it.
     *
     * @param document Document handed to the extractor constructor.
     * @param url Absolute URL of the page.
     * @param schemaOrgData Optional schema.org data handed to the constructor.
     * @returns A new extractor instance, or null when no mapping matches or
     *   when anything throws (e.g. `url` cannot be parsed); the error is logged.
     */
    static findExtractor(document, url, schemaOrgData) {
        // Upper bound on memoized lookups so a long-lived process that sees
        // many distinct URLs does not grow the cache without limit.
        const MAX_CACHED_LOOKUPS = 1000;
        try {
            const domain = new URL(url).hostname;
            // The cache is keyed by the full URL, not the hostname: RegExp
            // patterns depend on the path/query, so two URLs on the same host
            // can legitimately resolve to different results.
            if (this.domainCache.has(url)) {
                const cachedExtractor = this.domainCache.get(url);
                return cachedExtractor ? new cachedExtractor(document, url, schemaOrgData) : null;
            }
            // Find matching extractor
            let matched = null;
            for (const { patterns, extractor } of this.mappings) {
                const matches = patterns.some(pattern => {
                    if (pattern instanceof RegExp) {
                        return pattern.test(url);
                    }
                    return domain.includes(pattern);
                });
                if (matches) {
                    matched = extractor;
                    break;
                }
            }
            // Cache the result (positive or negative)
            if (this.domainCache.size >= MAX_CACHED_LOOKUPS) {
                this.domainCache.clear();
            }
            this.domainCache.set(url, matched);
            return matched ? new matched(document, url, schemaOrgData) : null;
        }
        catch (error) {
            console.error('Error in findExtractor:', error);
            return null;
        }
    }
    /** Discards all memoized lookup results. */
    static clearCache() {
        this.domainCache.clear();
    }
}
|
|
96
|
+
exports.ExtractorRegistry = ExtractorRegistry;
|
|
97
|
+
ExtractorRegistry.mappings = [];
|
|
98
|
+
ExtractorRegistry.domainCache = new Map();
|
|
99
|
+
// Initialize extractors
|
|
100
|
+
ExtractorRegistry.initialize();
|
|
101
|
+
//# sourceMappingURL=extractor-registry.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"extractor-registry.js","sourceRoot":"","sources":["../src/extractor-registry.ts"],"names":[],"mappings":";;;AAEA,aAAa;AACb,gDAAsD;AACtD,kDAAwD;AACxD,kDAAwD;AACxD,wDAA8D;AAC9D,kDAAwD;AACxD,gDAAsD;AAStD,MAAa,iBAAiB;IAI7B,MAAM,CAAC,UAAU;QAChB,kDAAkD;QAClD,IAAI,CAAC,QAAQ,CAAC;YACb,QAAQ,EAAE;gBACT,aAAa;gBACb,cAAc;aACd;YACD,SAAS,EAAE,0BAAgB;SAC3B,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC;YACb,QAAQ,EAAE;gBACT,YAAY;gBACZ,gBAAgB;gBAChB,gBAAgB;gBAChB,gCAAgC;aAChC;YACD,SAAS,EAAE,wBAAe;SAC1B,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC;YACb,QAAQ,EAAE;gBACT,aAAa;gBACb,UAAU;gBACV,2BAA2B;gBAC3B,eAAe;aACf;YACD,SAAS,EAAE,0BAAgB;SAC3B,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC;YACb,QAAQ,EAAE;gBACT,qCAAqC;aACrC;YACD,SAAS,EAAE,gCAAmB;SAC9B,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC;YACb,QAAQ,EAAE;gBACT,yCAAyC;aACzC;YACD,SAAS,EAAE,0BAAgB;SAC3B,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC;YACb,QAAQ,EAAE;gBACT,0CAA0C;aAC1C;YACD,SAAS,EAAE,wBAAe;SAC1B,CAAC,CAAC;IACJ,CAAC;IAED,MAAM,CAAC,QAAQ,CAAC,OAAyB;QACxC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAC7B,CAAC;IAED,MAAM,CAAC,aAAa,CAAC,QAAkB,EAAE,GAAW,EAAE,aAAmB;QACxE,IAAI,CAAC;YACJ,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YAErC,oBAAoB;YACpB,IAAI,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC;gBAClC,MAAM,eAAe,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;gBACrD,OAAO,eAAe,CAAC,CAAC,CAAC,IAAI,eAAe,CAAC,QAAQ,EAAE,GAAG,EAAE,aAAa,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;YACnF,CAAC;YAED,0BAA0B;YAC1B,KAAK,MAAM,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;gBACrD,MAAM,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE;oBACvC,IAAI,OAAO,YAAY,MAAM,EAAE,CAAC;wBAC/B,OAAO,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;oBAC1B,CAAC;oBACD,OAAO,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;gBACjC,CAAC,CAAC,CAAC;gBAEH,IAAI,OAAO,EAAE,CAAC;oBACb,mBAAmB;oBACnB,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;oBACxC,OAAO,IAAI,SAAS,CAAC,QAAQ,EAAE,GAAG,EAAE,aAAa,CAAC,CAAC;gBACpD,CAAC;YACF,CAAC;YAED,4BAA4B;YAC5B,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;YACnC,OAAO,IAAI,CAAC;QAE
b,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YAChB,OAAO,CAAC,KAAK,CAAC,yBAAyB,EAAE,KAAK,CAAC,CAAC;YAChD,OAAO,IAAI,CAAC;QACb,CAAC;IACF,CAAC;IAED,MAAM,CAAC,UAAU;QAChB,IAAI,CAAC,WAAW,CAAC,KAAK,EAAE,CAAC;IAC1B,CAAC;;AAlGF,8CAmGC;AAlGe,0BAAQ,GAAuB,EAAE,CAAC;AAClC,6BAAW,GAA6C,IAAI,GAAG,EAAE,CAAC;AAmGlF,wBAAwB;AACxB,iBAAiB,CAAC,UAAU,EAAE,CAAC"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { ExtractorResult } from '../types/extractors';
|
|
2
|
+
/**
 * Abstract base class for extractors.
 *
 * The constructor stores the document, its URL and optional schema.org data
 * on the instance; subclasses supply `canExtract()` and `extract()`.
 */
export declare abstract class BaseExtractor {
    /** Document passed to the constructor. */
    protected document: Document;
    /** URL passed to the constructor. */
    protected url: string;
    /** Optional schema.org data passed to the constructor (shape not declared here). */
    protected schemaOrgData?: any;
    /**
     * @param document - Document the extractor works on.
     * @param url - URL associated with the document.
     * @param schemaOrgData - Optional schema.org data; stored as-is.
     */
    constructor(document: Document, url: string, schemaOrgData?: any);
    /**
     * Implemented by subclasses.
     * NOTE(review): presumably reports whether this extractor applies to the
     * stored document - confirm against the concrete extractors.
     */
    abstract canExtract(): boolean;
    /**
     * Implemented by subclasses.
     * @returns The extraction result.
     */
    abstract extract(): ExtractorResult;
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.BaseExtractor = void 0;
|
|
4
|
+
class BaseExtractor {
|
|
5
|
+
constructor(document, url, schemaOrgData) {
|
|
6
|
+
this.document = document;
|
|
7
|
+
this.url = url;
|
|
8
|
+
this.schemaOrgData = schemaOrgData;
|
|
9
|
+
}
|
|
10
|
+
}
|
|
11
|
+
exports.BaseExtractor = BaseExtractor;
|
|
12
|
+
//# sourceMappingURL=_base.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"_base.js","sourceRoot":"","sources":["../../src/extractors/_base.ts"],"names":[],"mappings":";;;AAEA,MAAsB,aAAa;IAKlC,YAAY,QAAkB,EAAE,GAAW,EAAE,aAAmB;QAC/D,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;QACzB,IAAI,CAAC,GAAG,GAAG,GAAG,CAAC;QACf,IAAI,CAAC,aAAa,GAAG,aAAa,CAAC;IACpC,CAAC;CAID;AAbD,sCAaC"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { BaseExtractor } from './_base';
|
|
2
|
+
import { ConversationMessage, ConversationMetadata, Footnote, ExtractorResult } from '../types/extractors';
|
|
3
|
+
/**
 * Abstract extractor for conversation-style content.
 *
 * Subclasses provide the messages and metadata; `extract()` renders them
 * (plus any footnotes) to HTML and returns an `ExtractorResult`.
 */
export declare abstract class ConversationExtractor extends BaseExtractor {
    /** Implemented by subclasses: returns the conversation's messages. */
    protected abstract extractMessages(): ConversationMessage[];
    /** Implemented by subclasses: returns the conversation's metadata. */
    protected abstract getMetadata(): ConversationMetadata;
    /** Returns the footnotes to render; the base implementation returns an empty array. */
    protected getFootnotes(): Footnote[];
    /**
     * Builds the conversation HTML from the messages and footnotes and
     * returns it together with values derived from the metadata.
     */
    extract(): ExtractorResult;
    /**
     * Renders the given messages, followed by a footnotes list when
     * `footnotes` is non-empty, as an HTML string.
     *
     * @param messages - Messages to render, in order.
     * @param footnotes - Footnotes to append after the messages.
     */
    protected createContentHtml(messages: ConversationMessage[], footnotes: Footnote[]): string;
}
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.ConversationExtractor = void 0;
|
|
4
|
+
const _base_1 = require("./_base");
|
|
5
|
+
const defuddle_1 = require("../defuddle");
|
|
6
|
+
/**
 * Escapes a value for interpolation inside a double-quoted HTML attribute.
 * `&` is replaced first so the entities produced for the other characters
 * are not escaped a second time.
 */
function escapeHtmlAttribute(value) {
    return String(value)
        .replace(/&/g, '&amp;')
        .replace(/"/g, '&quot;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;');
}
/**
 * Base class for conversation extractors. Subclasses supply
 * `extractMessages()` and `getMetadata()` (and may override
 * `getFootnotes()`); `extract()` turns those into the extractor result.
 */
class ConversationExtractor extends _base_1.BaseExtractor {
    /** Footnotes to render after the messages; none by default. */
    getFootnotes() {
        return [];
    }
    /**
     * Renders the conversation to HTML, runs Defuddle over it and returns the
     * processed content together with title/site/description/word-count
     * variables.
     */
    extract() {
        const messages = this.extractMessages();
        const metadata = this.getMetadata();
        const footnotes = this.getFootnotes();
        const rawContentHtml = this.createContentHtml(messages, footnotes);
        // Create a temporary document to run Defuddle on our content.
        // Prefer the document this extractor was constructed with, so this
        // also works where no global `document` exists; fall back to the
        // global only when the supplied document has no `implementation`.
        const ownerDocument = this.document?.implementation ? this.document : document;
        const tempDoc = ownerDocument.implementation.createHTMLDocument();
        const container = tempDoc.createElement('article');
        container.innerHTML = rawContentHtml;
        tempDoc.body.appendChild(container);
        // Run Defuddle on our formatted content
        const defuddled = new defuddle_1.Defuddle(tempDoc).parse();
        const contentHtml = defuddled.content;
        return {
            content: contentHtml,
            contentHtml: contentHtml,
            extractedContent: {
                messageCount: messages.length.toString(),
            },
            variables: {
                title: metadata.title || 'Conversation',
                site: metadata.site,
                description: metadata.description || `${metadata.site} conversation with ${messages.length} messages`,
                wordCount: defuddled.wordCount?.toString() || '',
            }
        };
    }
    /**
     * Builds the raw HTML for the conversation: one `.message` block per
     * message, separated by `<hr>`, followed by a `#footnotes` list when
     * `footnotes` is non-empty.
     *
     * Values placed inside attributes (author-derived class name, metadata
     * `data-*` values, footnote URL) are attribute-escaped so a `"` in the
     * value cannot terminate the attribute. `message.content`,
     * `message.author`, `message.timestamp` and `footnote.text` are inserted
     * as markup exactly as given.
     * NOTE(review): attribute escaping assumes those values are raw
     * (not already entity-encoded) - confirm against the concrete extractors.
     *
     * @param messages Messages to render, in order.
     * @param footnotes Footnotes to append after the messages.
     * @returns The trimmed HTML string.
     */
    createContentHtml(messages, footnotes) {
        const messagesHtml = messages.map((message, index) => {
            const timestampHtml = message.timestamp ?
                `<div class="message-timestamp">${message.timestamp}</div>` : '';
            // Check if content already has paragraph tags
            const hasParagraphs = /<p[^>]*>[\s\S]*?<\/p>/i.test(message.content);
            const contentHtml = hasParagraphs ? message.content : `<p>${message.content}</p>`;
            // Add metadata to data attributes
            const dataAttributes = message.metadata ?
                Object.entries(message.metadata)
                    .map(([key, value]) => `data-${key}="${escapeHtmlAttribute(value)}"`)
                    .join(' ') : '';
            return `
			<div class="message message-${escapeHtmlAttribute(message.author.toLowerCase())}" ${dataAttributes}>
				<div class="message-header">
					<p class="message-author"><strong>${message.author}</strong></p>
					${timestampHtml}
				</div>
				<div class="message-content">
					${contentHtml}
				</div>
			</div>${index < messages.length - 1 ? '\n<hr>' : ''}`;
        }).join('\n').trim();
        // Add footnotes section if we have any
        const footnotesHtml = footnotes.length > 0 ? `
			<div id="footnotes">
				<ol>
					${footnotes.map((footnote, index) => `
						<li class="footnote" id="fn:${index + 1}">
							<p>
								<a href="${escapeHtmlAttribute(footnote.url)}" target="_blank">${footnote.text}</a> <a href="#fnref:${index + 1}" class="footnote-backref">↩</a>
							</p>
						</li>
					`).join('')}
				</ol>
			</div>` : '';
        return `${messagesHtml}\n${footnotesHtml}`.trim();
    }
}
|
|
76
|
+
exports.ConversationExtractor = ConversationExtractor;
|
|
77
|
+
//# sourceMappingURL=_conversation.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"_conversation.js","sourceRoot":"","sources":["../../src/extractors/_conversation.ts"],"names":[],"mappings":";;;AAAA,mCAAwC;AAExC,0CAAuC;AAEvC,MAAsB,qBAAsB,SAAQ,qBAAa;IAGtD,YAAY;QACrB,OAAO,EAAE,CAAC;IACX,CAAC;IAED,OAAO;QACN,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,EAAE,CAAC;QACxC,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC;QACpC,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC;QACtC,MAAM,cAAc,GAAG,IAAI,CAAC,iBAAiB,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;QAEnE,6DAA6D;QAC7D,MAAM,OAAO,GAAG,QAAQ,CAAC,cAAc,CAAC,kBAAkB,EAAE,CAAC;QAC7D,MAAM,SAAS,GAAG,OAAO,CAAC,aAAa,CAAC,SAAS,CAAC,CAAC;QACnD,SAAS,CAAC,SAAS,GAAG,cAAc,CAAC;QACrC,OAAO,CAAC,IAAI,CAAC,WAAW,CAAC,SAAS,CAAC,CAAC;QAEpC,wCAAwC;QACxC,MAAM,SAAS,GAAG,IAAI,mBAAQ,CAAC,OAAO,CAAC,CAAC,KAAK,EAAE,CAAC;QAChD,MAAM,WAAW,GAAG,SAAS,CAAC,OAAO,CAAC;QAEtC,OAAO;YACN,OAAO,EAAE,WAAW;YACpB,WAAW,EAAE,WAAW;YACxB,gBAAgB,EAAE;gBACjB,YAAY,EAAE,QAAQ,CAAC,MAAM,CAAC,QAAQ,EAAE;aACxC;YACD,SAAS,EAAE;gBACV,KAAK,EAAE,QAAQ,CAAC,KAAK,IAAI,cAAc;gBACvC,IAAI,EAAE,QAAQ,CAAC,IAAI;gBACnB,WAAW,EAAE,QAAQ,CAAC,WAAW,IAAI,GAAG,QAAQ,CAAC,IAAI,sBAAsB,QAAQ,CAAC,MAAM,WAAW;gBACrG,SAAS,EAAE,SAAS,CAAC,SAAS,EAAE,QAAQ,EAAE,IAAI,EAAE;aAChD;SACD,CAAC;IACH,CAAC;IAES,iBAAiB,CAAC,QAA+B,EAAE,SAAqB;QACjF,MAAM,YAAY,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,KAAK,EAAE,EAAE;YACpD,MAAM,aAAa,GAAG,OAAO,CAAC,SAAS,CAAC,CAAC;gBACxC,kCAAkC,OAAO,CAAC,SAAS,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC;YAElE,8CAA8C;YAC9C,MAAM,aAAa,GAAG,wBAAwB,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;YACrE,MAAM,WAAW,GAAG,aAAa,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,OAAO,CAAC,OAAO,MAAM,CAAC;YAElF,kCAAkC;YAClC,MAAM,cAAc,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;gBACxC,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,QAAQ,CAAC;qBAC9B,GAAG,CAAC,CAAC,CAAC,GAAG,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,QAAQ,GAAG,KAAK,KAAK,GAAG,CAAC;qBAC/C,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAElB,OAAO;iCACuB,OAAO,CAAC,MAAM,CAAC,WAAW,EAAE,KAAK,cAAc;;yCAEvC,OAAO,CAAC,MAAM;OAChD,aAAa;;;OAGb,WAAW;;WAEP,KAAK,GAAG,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;QACvD
,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;QAErB,uCAAuC;QACvC,MAAM,aAAa,GAAG,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC;;;OAGxC,SAAS,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,KAAK,EAAE,EAAE,CAAC;oCACN,KAAK,GAAG,CAAC;;mBAE1B,QAAQ,CAAC,GAAG,qBAAqB,QAAQ,CAAC,IAAI,6BAA6B,KAAK,GAAG,CAAC;;;MAGjG,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC;;UAEN,CAAC,CAAC,CAAC,EAAE,CAAC;QAEd,OAAO,GAAG,YAAY,KAAK,aAAa,EAAE,CAAC,IAAI,EAAE,CAAC;IACnD,CAAC;CACD;AAjFD,sDAiFC"}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { ConversationExtractor } from './_conversation';
|
|
2
|
+
import { ConversationMessage, ConversationMetadata, Footnote } from '../types/extractors';
|
|
3
|
+
/**
 * Concrete `ConversationExtractor` for ChatGPT.
 *
 * Only the declaration is visible here; it implements the abstract members
 * of `ConversationExtractor` and overrides `getFootnotes()`.
 */
export declare class ChatGPTExtractor extends ConversationExtractor {
    // Private state; types are erased in the declaration output.
    // NOTE(review): `articles` presumably holds the per-message elements found
    // in the page - confirm in the implementation.
    private articles;
    private footnotes;
    private footnoteCounter;
    /**
     * @param document - Document to extract from.
     * @param url - URL associated with the document.
     */
    constructor(document: Document, url: string);
    /** Implements `BaseExtractor.canExtract()`. */
    canExtract(): boolean;
    /** Implements `ConversationExtractor.extractMessages()`. */
    protected extractMessages(): ConversationMessage[];
    /** Overrides `ConversationExtractor.getFootnotes()`. */
    protected getFootnotes(): Footnote[];
    /** Implements `ConversationExtractor.getMetadata()`. */
    protected getMetadata(): ConversationMetadata;
    // Private helper; signature erased in the declaration output.
    private getTitle;
}
|