defuddle 0.3.8 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/README.md +61 -48
  2. package/dist/constants.js +744 -0
  3. package/dist/constants.js.map +1 -0
  4. package/dist/defuddle.d.ts +3 -2
  5. package/dist/defuddle.js +1676 -0
  6. package/dist/defuddle.js.map +1 -0
  7. package/dist/elements/code.js +287 -0
  8. package/dist/elements/code.js.map +1 -0
  9. package/dist/elements/headings.js +95 -0
  10. package/dist/elements/headings.js.map +1 -0
  11. package/dist/elements/math.base.js +192 -0
  12. package/dist/elements/math.base.js.map +1 -0
  13. package/dist/elements/math.full.d.ts +1 -1
  14. package/dist/elements/math.full.js +121 -0
  15. package/dist/elements/math.full.js.map +1 -0
  16. package/dist/extractor-registry.d.ts +15 -0
  17. package/dist/extractor-registry.js +101 -0
  18. package/dist/extractor-registry.js.map +1 -0
  19. package/dist/extractors/_base.d.ts +9 -0
  20. package/dist/extractors/_base.js +12 -0
  21. package/dist/extractors/_base.js.map +1 -0
  22. package/dist/extractors/_conversation.d.ts +9 -0
  23. package/dist/extractors/_conversation.js +77 -0
  24. package/dist/extractors/_conversation.js.map +1 -0
  25. package/dist/extractors/chatgpt.d.ts +13 -0
  26. package/dist/extractors/chatgpt.js +142 -0
  27. package/dist/extractors/chatgpt.js.map +1 -0
  28. package/dist/extractors/claude.d.ts +10 -0
  29. package/dist/extractors/claude.js +87 -0
  30. package/dist/extractors/claude.js.map +1 -0
  31. package/dist/extractors/hackernews.d.ts +21 -0
  32. package/dist/extractors/hackernews.js +206 -0
  33. package/dist/extractors/hackernews.js.map +1 -0
  34. package/dist/extractors/reddit.d.ts +16 -0
  35. package/dist/extractors/reddit.js +143 -0
  36. package/dist/extractors/reddit.js.map +1 -0
  37. package/dist/extractors/twitter.d.ts +16 -0
  38. package/dist/extractors/twitter.js +199 -0
  39. package/dist/extractors/twitter.js.map +1 -0
  40. package/dist/extractors/youtube.d.ts +12 -0
  41. package/dist/extractors/youtube.js +53 -0
  42. package/dist/extractors/youtube.js.map +1 -0
  43. package/dist/index.full.js +19181 -1
  44. package/dist/index.full.js.map +1 -0
  45. package/dist/index.js +6 -1
  46. package/dist/index.js.map +1 -0
  47. package/dist/markdown.d.ts +1 -0
  48. package/dist/markdown.js +545 -0
  49. package/dist/markdown.js.map +1 -0
  50. package/dist/metadata.js +268 -0
  51. package/dist/metadata.js.map +1 -0
  52. package/dist/node.d.ts +12 -0
  53. package/dist/node.js +50 -0
  54. package/dist/node.js.map +1 -0
  55. package/dist/scoring.d.ts +8 -0
  56. package/dist/scoring.js +95 -0
  57. package/dist/scoring.js.map +1 -0
  58. package/dist/types/extractors.d.ts +41 -0
  59. package/dist/types/extractors.js +3 -0
  60. package/dist/types/extractors.js.map +1 -0
  61. package/dist/types.d.ts +14 -0
  62. package/dist/types.js +3 -0
  63. package/dist/types.js.map +1 -0
  64. package/package.json +19 -5
@@ -0,0 +1,192 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.mathSelectors = exports.isBlockDisplay = exports.getBasicLatexFromElement = exports.getMathMLFromElement = void 0;
4
+ const getMathMLFromElement = (el) => {
5
+ // 1. Direct MathML content
6
+ if (el.tagName.toLowerCase() === 'math') {
7
+ const isBlock = el.getAttribute('display') === 'block';
8
+ return {
9
+ mathml: el.outerHTML,
10
+ latex: el.getAttribute('alttext') || null,
11
+ isBlock
12
+ };
13
+ }
14
+ // 2. MathML in data-mathml attribute
15
+ const mathmlStr = el.getAttribute('data-mathml');
16
+ if (mathmlStr) {
17
+ const tempDiv = document.createElement('div');
18
+ tempDiv.innerHTML = mathmlStr;
19
+ const mathElement = tempDiv.querySelector('math');
20
+ if (mathElement) {
21
+ const isBlock = mathElement.getAttribute('display') === 'block';
22
+ return {
23
+ mathml: mathElement.outerHTML,
24
+ latex: mathElement.getAttribute('alttext') || null,
25
+ isBlock
26
+ };
27
+ }
28
+ }
29
+ // 3. MathJax assistive MathML
30
+ const assistiveMmlContainer = el.querySelector('.MJX_Assistive_MathML, mjx-assistive-mml');
31
+ if (assistiveMmlContainer) {
32
+ const mathElement = assistiveMmlContainer.querySelector('math');
33
+ if (mathElement) {
34
+ // Check both the math element and container for display mode
35
+ const mathDisplayAttr = mathElement.getAttribute('display');
36
+ const containerDisplayAttr = assistiveMmlContainer.getAttribute('display');
37
+ const isBlock = mathDisplayAttr === 'block' || containerDisplayAttr === 'block';
38
+ return {
39
+ mathml: mathElement.outerHTML,
40
+ latex: mathElement.getAttribute('alttext') || null,
41
+ isBlock
42
+ };
43
+ }
44
+ }
45
+ // 4. KaTeX MathML
46
+ const katexMathml = el.querySelector('.katex-mathml math');
47
+ if (katexMathml) {
48
+ return {
49
+ mathml: katexMathml.outerHTML,
50
+ latex: null, // We'll get LaTeX separately for KaTeX
51
+ isBlock: false // We'll determine this from container
52
+ };
53
+ }
54
+ return null;
55
+ };
56
+ exports.getMathMLFromElement = getMathMLFromElement;
57
+ const getBasicLatexFromElement = (el) => {
58
+ // Direct data-latex attribute
59
+ const dataLatex = el.getAttribute('data-latex');
60
+ if (dataLatex) {
61
+ return dataLatex;
62
+ }
63
+ // WordPress LaTeX images
64
+ if (el.tagName.toLowerCase() === 'img' && el.classList.contains('latex')) {
65
+ // Try alt text first as it's cleaner
66
+ const altLatex = el.getAttribute('alt');
67
+ if (altLatex) {
68
+ return altLatex;
69
+ }
70
+ // Fallback to extracting from URL
71
+ const src = el.getAttribute('src');
72
+ if (src) {
73
+ const match = src.match(/latex\.php\?latex=([^&]+)/);
74
+ if (match) {
75
+ return decodeURIComponent(match[1])
76
+ .replace(/\+/g, ' ') // Replace + with spaces
77
+ .replace(/%5C/g, '\\'); // Fix escaped backslashes
78
+ }
79
+ }
80
+ }
81
+ // LaTeX in annotation
82
+ const annotation = el.querySelector('annotation[encoding="application/x-tex"]');
83
+ if (annotation?.textContent) {
84
+ return annotation.textContent.trim();
85
+ }
86
+ // KaTeX formats
87
+ if (el.matches('.katex')) {
88
+ const katexAnnotation = el.querySelector('.katex-mathml annotation[encoding="application/x-tex"]');
89
+ if (katexAnnotation?.textContent) {
90
+ return katexAnnotation.textContent.trim();
91
+ }
92
+ }
93
+ // MathJax scripts
94
+ if (el.matches('script[type="math/tex"]') || el.matches('script[type="math/tex; mode=display"]')) {
95
+ return el.textContent?.trim() || null;
96
+ }
97
+ // Check for sibling script element
98
+ if (el.parentElement) {
99
+ const siblingScript = el.parentElement.querySelector('script[type="math/tex"], script[type="math/tex; mode=display"]');
100
+ if (siblingScript) {
101
+ return siblingScript.textContent?.trim() || null;
102
+ }
103
+ }
104
+ // Fallback to alt text or text content
105
+ return el.getAttribute('alt') || el.textContent?.trim() || null;
106
+ };
107
+ exports.getBasicLatexFromElement = getBasicLatexFromElement;
108
+ const isBlockDisplay = (el) => {
109
+ // Check explicit display attribute
110
+ const displayAttr = el.getAttribute('display');
111
+ if (displayAttr === 'block') {
112
+ return true;
113
+ }
114
+ // Check common class names
115
+ const classNames = el.className.toLowerCase();
116
+ if (classNames.includes('display') || classNames.includes('block')) {
117
+ return true;
118
+ }
119
+ // Check container classes
120
+ const container = el.closest('.katex-display, .MathJax_Display, [data-display="block"]');
121
+ if (container) {
122
+ return true;
123
+ }
124
+ // Check if preceded by block element
125
+ const prevElement = el.previousElementSibling;
126
+ if (prevElement?.tagName.toLowerCase() === 'p') {
127
+ return true;
128
+ }
129
+ // Check specific formats
130
+ if (el.matches('.mwe-math-fallback-image-display')) {
131
+ return true;
132
+ }
133
+ // Check KaTeX display mode
134
+ if (el.matches('.katex')) {
135
+ // KaTeX elements are inline by default
136
+ // Only block if explicitly marked as display
137
+ return el.closest('.katex-display') !== null;
138
+ }
139
+ // Check MathJax v3 display attribute
140
+ if (el.hasAttribute('display')) {
141
+ return el.getAttribute('display') === 'true';
142
+ }
143
+ // Check MathJax script display attribute
144
+ if (el.matches('script[type="math/tex; mode=display"]')) {
145
+ return true;
146
+ }
147
+ if (el.hasAttribute('display')) {
148
+ return el.getAttribute('display') === 'true';
149
+ }
150
+ // Check parent container display attribute
151
+ const parentContainer = el.closest('[display]');
152
+ if (parentContainer) {
153
+ return parentContainer.getAttribute('display') === 'true';
154
+ }
155
+ return false;
156
+ };
157
+ exports.isBlockDisplay = isBlockDisplay;
158
+ // Shared selector for math elements
159
+ exports.mathSelectors = [
160
+ // WordPress LaTeX images
161
+ 'img.latex[src*="latex.php"]',
162
+ // MathJax elements (v2 and v3)
163
+ 'span.MathJax',
164
+ 'mjx-container',
165
+ 'script[type="math/tex"]',
166
+ 'script[type="math/tex; mode=display"]',
167
+ '.MathJax_Preview + script[type="math/tex"]',
168
+ '.MathJax_Display',
169
+ '.MathJax_SVG',
170
+ '.MathJax_MathML',
171
+ // MediaWiki math elements
172
+ '.mwe-math-element',
173
+ '.mwe-math-fallback-image-inline',
174
+ '.mwe-math-fallback-image-display',
175
+ '.mwe-math-mathml-inline',
176
+ '.mwe-math-mathml-display',
177
+ // KaTeX elements
178
+ '.katex',
179
+ '.katex-display',
180
+ '.katex-mathml',
181
+ '.katex-html',
182
+ '[data-katex]',
183
+ 'script[type="math/katex"]',
184
+ // Generic math elements and other formats
185
+ 'math',
186
+ '[data-math]',
187
+ '[data-latex]',
188
+ '[data-tex]',
189
+ 'script[type^="math/"]',
190
+ 'annotation[encoding="application/x-tex"]'
191
+ ].join(',');
192
+ //# sourceMappingURL=math.base.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"math.base.js","sourceRoot":"","sources":["../../src/elements/math.base.ts"],"names":[],"mappings":";;;AAMO,MAAM,oBAAoB,GAAG,CAAC,EAAW,EAAmB,EAAE;IACpE,2BAA2B;IAC3B,IAAI,EAAE,CAAC,OAAO,CAAC,WAAW,EAAE,KAAK,MAAM,EAAE,CAAC;QACzC,MAAM,OAAO,GAAG,EAAE,CAAC,YAAY,CAAC,SAAS,CAAC,KAAK,OAAO,CAAC;QACvD,OAAO;YACN,MAAM,EAAE,EAAE,CAAC,SAAS;YACpB,KAAK,EAAE,EAAE,CAAC,YAAY,CAAC,SAAS,CAAC,IAAI,IAAI;YACzC,OAAO;SACP,CAAC;IACH,CAAC;IAED,qCAAqC;IACrC,MAAM,SAAS,GAAG,EAAE,CAAC,YAAY,CAAC,aAAa,CAAC,CAAC;IACjD,IAAI,SAAS,EAAE,CAAC;QACf,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;QAC9C,OAAO,CAAC,SAAS,GAAG,SAAS,CAAC;QAC9B,MAAM,WAAW,GAAG,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;QAClD,IAAI,WAAW,EAAE,CAAC;YACjB,MAAM,OAAO,GAAG,WAAW,CAAC,YAAY,CAAC,SAAS,CAAC,KAAK,OAAO,CAAC;YAChE,OAAO;gBACN,MAAM,EAAE,WAAW,CAAC,SAAS;gBAC7B,KAAK,EAAE,WAAW,CAAC,YAAY,CAAC,SAAS,CAAC,IAAI,IAAI;gBAClD,OAAO;aACP,CAAC;QACH,CAAC;IACF,CAAC;IAED,8BAA8B;IAC9B,MAAM,qBAAqB,GAAG,EAAE,CAAC,aAAa,CAAC,0CAA0C,CAAC,CAAC;IAE3F,IAAI,qBAAqB,EAAE,CAAC;QAC3B,MAAM,WAAW,GAAG,qBAAqB,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;QAEhE,IAAI,WAAW,EAAE,CAAC;YACjB,6DAA6D;YAC7D,MAAM,eAAe,GAAG,WAAW,CAAC,YAAY,CAAC,SAAS,CAAC,CAAC;YAC5D,MAAM,oBAAoB,GAAG,qBAAqB,CAAC,YAAY,CAAC,SAAS,CAAC,CAAC;YAC3E,MAAM,OAAO,GAAG,eAAe,KAAK,OAAO,IAAI,oBAAoB,KAAK,OAAO,CAAC;YAEhF,OAAO;gBACN,MAAM,EAAE,WAAW,CAAC,SAAS;gBAC7B,KAAK,EAAE,WAAW,CAAC,YAAY,CAAC,SAAS,CAAC,IAAI,IAAI;gBAClD,OAAO;aACP,CAAC;QACH,CAAC;IACF,CAAC;IAED,kBAAkB;IAClB,MAAM,WAAW,GAAG,EAAE,CAAC,aAAa,CAAC,oBAAoB,CAAC,CAAC;IAC3D,IAAI,WAAW,EAAE,CAAC;QACjB,OAAO;YACN,MAAM,EAAE,WAAW,CAAC,SAAS;YAC7B,KAAK,EAAE,IAAI,EAAE,uCAAuC;YACpD,OAAO,EAAE,KAAK,CAAC,sCAAsC;SACrD,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACb,CAAC,CAAC;AA1DW,QAAA,oBAAoB,wBA0D/B;AAEK,MAAM,wBAAwB,GAAG,CAAC,EAAW,EAAiB,EAAE;IACtE,8BAA8B;IAC9B,MAAM,SAAS,GAAG,EAAE,CAAC,YAAY,CAAC,YAAY,CAAC,CAAC;IAChD,IAAI,SAAS,EAAE,CAAC;QACf,OAAO,SAAS,CAAC;IAClB,CAAC;IAED,yBAAyB;IACzB,IAAI,EAAE,CAAC,OAAO,CAAC,WAAW,EAAE,KAAK,KAAK,IAAI,EAAE,CAAC,SAAS,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;Q
AC1E,qCAAqC;QACrC,MAAM,QAAQ,GAAG,EAAE,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC;QACxC,IAAI,QAAQ,EAAE,CAAC;YACd,OAAO,QAAQ,CAAC;QACjB,CAAC;QAED,kCAAkC;QAClC,MAAM,GAAG,GAAG,EAAE,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC;QACnC,IAAI,GAAG,EAAE,CAAC;YACT,MAAM,KAAK,GAAG,GAAG,CAAC,KAAK,CAAC,2BAA2B,CAAC,CAAC;YACrD,IAAI,KAAK,EAAE,CAAC;gBACX,OAAO,kBAAkB,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;qBACjC,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,wBAAwB;qBAC5C,OAAO,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,0BAA0B;YACpD,CAAC;QACF,CAAC;IACF,CAAC;IAED,sBAAsB;IACtB,MAAM,UAAU,GAAG,EAAE,CAAC,aAAa,CAAC,0CAA0C,CAAC,CAAC;IAChF,IAAI,UAAU,EAAE,WAAW,EAAE,CAAC;QAC7B,OAAO,UAAU,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC;IACtC,CAAC;IAED,gBAAgB;IAChB,IAAI,EAAE,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC;QAC1B,MAAM,eAAe,GAAG,EAAE,CAAC,aAAa,CAAC,wDAAwD,CAAC,CAAC;QACnG,IAAI,eAAe,EAAE,WAAW,EAAE,CAAC;YAClC,OAAO,eAAe,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC;QAC3C,CAAC;IACF,CAAC;IAED,kBAAkB;IAClB,IAAI,EAAE,CAAC,OAAO,CAAC,yBAAyB,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,uCAAuC,CAAC,EAAE,CAAC;QAClG,OAAO,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,IAAI,CAAC;IACvC,CAAC;IAED,mCAAmC;IACnC,IAAI,EAAE,CAAC,aAAa,EAAE,CAAC;QACtB,MAAM,aAAa,GAAG,EAAE,CAAC,aAAa,CAAC,aAAa,CAAC,gEAAgE,CAAC,CAAC;QACvH,IAAI,aAAa,EAAE,CAAC;YACnB,OAAO,aAAa,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,IAAI,CAAC;QAClD,CAAC;IACF,CAAC;IAED,uCAAuC;IACvC,OAAO,EAAE,CAAC,YAAY,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,IAAI,CAAC;AACjE,CAAC,CAAC;AAxDW,QAAA,wBAAwB,4BAwDnC;AAEK,MAAM,cAAc,GAAG,CAAC,EAAW,EAAW,EAAE;IACtD,mCAAmC;IACnC,MAAM,WAAW,GAAG,EAAE,CAAC,YAAY,CAAC,SAAS,CAAC,CAAC;IAC/C,IAAI,WAAW,KAAK,OAAO,EAAE,CAAC;QAC7B,OAAO,IAAI,CAAC;IACb,CAAC;IAED,2BAA2B;IAC3B,MAAM,UAAU,GAAG,EAAE,CAAC,SAAS,CAAC,WAAW,EAAE,CAAC;IAC9C,IAAI,UAAU,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,UAAU,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;QACpE,OAAO,IAAI,CAAC;IACb,CAAC;IAED,0BAA0B;IAC1B,MAAM,SAAS,GAAG,EAAE,CAAC,OAAO,CAAC,0DAA0D,CAAC,CAAC;IACzF,IAAI,SAAS,EAAE,CAAC;QACf,OAAO,IAAI,CAAC;IACb,CAAC;IAED,qCAAqC;IACrC,MAAM,WAAW,GAAG,EAAE,CAAC,sBAAsB,CAAC;IAC9C,IAAI,WAAW,EAAE,OAAO,CAAC,WAAW,EAAE,KAAK,GAAG,EAA
E,CAAC;QAChD,OAAO,IAAI,CAAC;IACb,CAAC;IAED,yBAAyB;IACzB,IAAI,EAAE,CAAC,OAAO,CAAC,kCAAkC,CAAC,EAAE,CAAC;QACpD,OAAO,IAAI,CAAC;IACb,CAAC;IAED,2BAA2B;IAC3B,IAAI,EAAE,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC;QAC1B,uCAAuC;QACvC,6CAA6C;QAC7C,OAAO,EAAE,CAAC,OAAO,CAAC,gBAAgB,CAAC,KAAK,IAAI,CAAC;IAC9C,CAAC;IAED,qCAAqC;IACrC,IAAI,EAAE,CAAC,YAAY,CAAC,SAAS,CAAC,EAAE,CAAC;QAChC,OAAO,EAAE,CAAC,YAAY,CAAC,SAAS,CAAC,KAAK,MAAM,CAAC;IAC9C,CAAC;IAED,yCAAyC;IACzC,IAAI,EAAE,CAAC,OAAO,CAAC,uCAAuC,CAAC,EAAE,CAAC;QACzD,OAAO,IAAI,CAAC;IACb,CAAC;IAED,IAAI,EAAE,CAAC,YAAY,CAAC,SAAS,CAAC,EAAE,CAAC;QAChC,OAAO,EAAE,CAAC,YAAY,CAAC,SAAS,CAAC,KAAK,MAAM,CAAC;IAC9C,CAAC;IAED,2CAA2C;IAC3C,MAAM,eAAe,GAAG,EAAE,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC;IAChD,IAAI,eAAe,EAAE,CAAC;QACrB,OAAO,eAAe,CAAC,YAAY,CAAC,SAAS,CAAC,KAAK,MAAM,CAAC;IAC3D,CAAC;IAED,OAAO,KAAK,CAAC;AACd,CAAC,CAAC;AA1DW,QAAA,cAAc,kBA0DzB;AAEF,oCAAoC;AACvB,QAAA,aAAa,GAAG;IAC5B,yBAAyB;IACzB,6BAA6B;IAE7B,+BAA+B;IAC/B,cAAc;IACd,eAAe;IACf,yBAAyB;IACzB,uCAAuC;IACvC,4CAA4C;IAC5C,kBAAkB;IAClB,cAAc;IACd,iBAAiB;IAEjB,0BAA0B;IAC1B,mBAAmB;IACnB,iCAAiC;IACjC,kCAAkC;IAClC,yBAAyB;IACzB,0BAA0B;IAE1B,iBAAiB;IACjB,QAAQ;IACR,gBAAgB;IAChB,eAAe;IACf,aAAa;IACb,cAAc;IACd,2BAA2B;IAE3B,0CAA0C;IAC1C,MAAM;IACN,aAAa;IACb,cAAc;IACd,YAAY;IACZ,uBAAuB;IACvB,0CAA0C;CAC1C,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC"}
@@ -1,6 +1,6 @@
1
1
  import { MathData } from './math.base';
2
2
  export declare const getLatexFromElement: (el: Element) => string | null;
3
- export declare const createCleanMathEl: (mathData: MathData | null, latex: string | null, isBlock: boolean) => Element;
3
+ export declare const createCleanMathEl: (mathData: MathData | null, latex: string | null, isBlock: boolean, doc: Document) => Element;
4
4
  export declare const mathRules: {
5
5
  selector: string;
6
6
  element: string;
@@ -0,0 +1,121 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.mathRules = exports.createCleanMathEl = exports.getLatexFromElement = void 0;
37
+ const mathml_to_latex_1 = require("mathml-to-latex");
38
+ const temml = __importStar(require("temml"));
39
+ const math_base_1 = require("./math.base");
40
/**
 * Resolve the LaTeX source for a math element.
 *
 * The basic extraction is tried first; when it yields nothing and the element
 * carries MathML, that MathML is converted to LaTeX instead.
 *
 * @param el Element to inspect.
 * @returns The LaTeX string, or `null` when none is found or conversion fails
 *          (a failed conversion is logged as a warning).
 */
const getLatexFromElement = (el) => {
    const direct = (0, math_base_1.getBasicLatexFromElement)(el);
    if (direct) {
        return direct;
    }
    // No LaTeX available directly: fall back to converting MathML, if any.
    const found = (0, math_base_1.getMathMLFromElement)(el);
    if (!found?.mathml) {
        return null;
    }
    try {
        return mathml_to_latex_1.MathMLToLaTeX.convert(found.mathml);
    }
    catch (e) {
        console.warn('Failed to convert MathML to LaTeX:', e);
        return null;
    }
};
58
+ exports.getLatexFromElement = getLatexFromElement;
59
+ const createCleanMathEl = (mathData, latex, isBlock, doc) => {
60
+ const cleanMathEl = doc.createElement('math');
61
+ cleanMathEl.setAttribute('xmlns', 'http://www.w3.org/1998/Math/MathML');
62
+ cleanMathEl.setAttribute('display', isBlock ? 'block' : 'inline');
63
+ cleanMathEl.setAttribute('data-latex', latex || '');
64
+ // First try to use existing MathML content
65
+ if (mathData?.mathml) {
66
+ const tempDiv = doc.createElement('div');
67
+ tempDiv.innerHTML = mathData.mathml;
68
+ const mathContent = tempDiv.querySelector('math');
69
+ if (mathContent) {
70
+ cleanMathEl.innerHTML = mathContent.innerHTML;
71
+ }
72
+ }
73
+ // If no MathML but we have LaTeX, convert it
74
+ else if (latex) {
75
+ try {
76
+ const mathml = temml.renderToString(latex, {
77
+ displayMode: isBlock,
78
+ throwOnError: false
79
+ });
80
+ const tempDiv = doc.createElement('div');
81
+ tempDiv.innerHTML = mathml;
82
+ const mathContent = tempDiv.querySelector('math');
83
+ if (mathContent) {
84
+ cleanMathEl.innerHTML = mathContent.innerHTML;
85
+ }
86
+ else {
87
+ cleanMathEl.textContent = latex; // Fallback to LaTeX as text
88
+ }
89
+ }
90
+ catch (e) {
91
+ console.warn('Failed to convert LaTeX to MathML:', e);
92
+ cleanMathEl.textContent = latex; // Fallback to LaTeX as text
93
+ }
94
+ }
95
+ return cleanMathEl;
96
+ };
97
+ exports.createCleanMathEl = createCleanMathEl;
98
+ exports.mathRules = [
99
+ {
100
+ selector: math_base_1.mathSelectors,
101
+ element: 'math',
102
+ transform: (el) => {
103
+ // Check if element is an HTMLElement by checking for common properties
104
+ if (!('style' in el) || !('className' in el)) {
105
+ return el;
106
+ }
107
+ const mathData = (0, math_base_1.getMathMLFromElement)(el);
108
+ const latex = (0, exports.getLatexFromElement)(el);
109
+ const isBlock = (0, math_base_1.isBlockDisplay)(el);
110
+ const cleanMathEl = (0, exports.createCleanMathEl)(mathData, latex, isBlock, el.ownerDocument);
111
+ // Clean up any associated math scripts after we've extracted their content
112
+ if (el.parentElement) {
113
+ // Remove all math-related scripts and previews
114
+ const mathElements = el.parentElement.querySelectorAll('script[type^="math/"], .MathJax_Preview, script[type="text/javascript"][src*="mathjax"], script[type="text/javascript"][src*="katex"]');
115
+ mathElements.forEach(el => el.remove());
116
+ }
117
+ return cleanMathEl;
118
+ }
119
+ }
120
+ ];
121
+ //# sourceMappingURL=math.full.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"math.full.js","sourceRoot":"","sources":["../../src/elements/math.full.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,qDAAgD;AAChD,6CAA+B;AAC/B,2CAMqB;AAEd,MAAM,mBAAmB,GAAG,CAAC,EAAW,EAAiB,EAAE;IACjE,mCAAmC;IACnC,MAAM,UAAU,GAAG,IAAA,oCAAwB,EAAC,EAAE,CAAC,CAAC;IAChD,IAAI,UAAU,EAAE,CAAC;QAChB,OAAO,UAAU,CAAC;IACnB,CAAC;IAED,mDAAmD;IACnD,MAAM,QAAQ,GAAG,IAAA,gCAAoB,EAAC,EAAE,CAAC,CAAC;IAC1C,IAAI,QAAQ,EAAE,MAAM,EAAE,CAAC;QACtB,IAAI,CAAC;YACJ,OAAO,+BAAa,CAAC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QAC/C,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACZ,OAAO,CAAC,IAAI,CAAC,oCAAoC,EAAE,CAAC,CAAC,CAAC;QACvD,CAAC;IACF,CAAC;IAED,OAAO,IAAI,CAAC;AACb,CAAC,CAAC;AAlBW,QAAA,mBAAmB,uBAkB9B;AAEK,MAAM,iBAAiB,GAAG,CAAC,QAAyB,EAAE,KAAoB,EAAE,OAAgB,EAAE,GAAa,EAAW,EAAE;IAC9H,MAAM,WAAW,GAAG,GAAG,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;IAE9C,WAAW,CAAC,YAAY,CAAC,OAAO,EAAE,oCAAoC,CAAC,CAAC;IACxE,WAAW,CAAC,YAAY,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;IAClE,WAAW,CAAC,YAAY,CAAC,YAAY,EAAE,KAAK,IAAI,EAAE,CAAC,CAAC;IAEpD,2CAA2C;IAC3C,IAAI,QAAQ,EAAE,MAAM,EAAE,CAAC;QACtB,MAAM,OAAO,GAAG,GAAG,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;QACzC,OAAO,CAAC,SAAS,GAAG,QAAQ,CAAC,MAAM,CAAC;QACpC,MAAM,WAAW,GAAG,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;QAClD,IAAI,WAAW,EAAE,CAAC;YACjB,WAAW,CAAC,SAAS,GAAG,WAAW,CAAC,SAAS,CAAC;QAC/C,CAAC;IACF,CAAC;IACD,6CAA6C;SACxC,IAAI,KAAK,EAAE,CAAC;QAChB,IAAI,CAAC;YACJ,MAAM,MAAM,GAAG,KAAK,CAAC,cAAc,CAAC,KAAK,EAAE;gBAC1C,WAAW,EAAE,OAAO;gBACpB,YAAY,EAAE,KAAK;aACnB,CAAC,CAAC;YACH,MAAM,OAAO,GAAG,GAAG,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;YACzC,OAAO,CAAC,SAAS,GAAG,MAAM,CAAC;YAC3B,MAAM,WAAW,GAAG,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;YAClD,IAAI,WAAW,EAAE,CAAC;gBACjB,WAAW,CAAC,SAAS,GAAG,WAAW,CAAC,SAAS,CAAC;YAC/C,CAAC;iBAAM,CAAC;gBACP,WAAW,CAAC,WAAW,GAAG,KAAK,CAAC,CAAC,4BAA4B;YAC9D,CAAC;QACF,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACZ,OAAO,CAAC,IAAI,CAAC,oCAAoC,EAAE,CAAC,CAAC,CAAC;YACtD,WAAW,CAAC,WAAW,GAAG,KAAK,CAAC,CAAC,4BAA4B;QAC9D,CAAC;IACF,CAAC;IAED,OAAO,WAAW,CAAC;AACpB,CAAC,CAAC;A
AtCW,QAAA,iBAAiB,qBAsC5B;AAEW,QAAA,SAAS,GAAG;IACxB;QACC,QAAQ,EAAE,yBAAa;QACvB,OAAO,EAAE,MAAM;QACf,SAAS,EAAE,CAAC,EAAW,EAAW,EAAE;YACnC,uEAAuE;YACvE,IAAI,CAAC,CAAC,OAAO,IAAI,EAAE,CAAC,IAAI,CAAC,CAAC,WAAW,IAAI,EAAE,CAAC,EAAE,CAAC;gBAC9C,OAAO,EAAE,CAAC;YACX,CAAC;YAED,MAAM,QAAQ,GAAG,IAAA,gCAAoB,EAAC,EAAE,CAAC,CAAC;YAC1C,MAAM,KAAK,GAAG,IAAA,2BAAmB,EAAC,EAAE,CAAC,CAAC;YACtC,MAAM,OAAO,GAAG,IAAA,0BAAc,EAAC,EAAE,CAAC,CAAC;YACnC,MAAM,WAAW,GAAG,IAAA,yBAAiB,EAAC,QAAQ,EAAE,KAAK,EAAE,OAAO,EAAE,EAAE,CAAC,aAAa,CAAC,CAAC;YAElF,2EAA2E;YAC3E,IAAI,EAAE,CAAC,aAAa,EAAE,CAAC;gBACtB,+CAA+C;gBAC/C,MAAM,YAAY,GAAG,EAAE,CAAC,aAAa,CAAC,gBAAgB,CACrD,uIAAuI,CACvI,CAAC;gBACF,YAAY,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;YACzC,CAAC;YAED,OAAO,WAAW,CAAC;QACpB,CAAC;KACD;CACD,CAAC"}
@@ -0,0 +1,15 @@
1
+ import { BaseExtractor } from './extractors/_base';
2
+ type ExtractorConstructor = new (document: Document, url: string, schemaOrgData?: any) => BaseExtractor;
3
+ interface ExtractorMapping {
4
+ patterns: (string | RegExp)[];
5
+ extractor: ExtractorConstructor;
6
+ }
7
+ export declare class ExtractorRegistry {
8
+ private static mappings;
9
+ private static domainCache;
10
+ static initialize(): void;
11
+ static register(mapping: ExtractorMapping): void;
12
+ static findExtractor(document: Document, url: string, schemaOrgData?: any): BaseExtractor | null;
13
+ static clearCache(): void;
14
+ }
15
+ export {};
@@ -0,0 +1,101 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.ExtractorRegistry = void 0;
4
+ // Extractors
5
+ const reddit_1 = require("./extractors/reddit");
6
+ const twitter_1 = require("./extractors/twitter");
7
+ const youtube_1 = require("./extractors/youtube");
8
+ const hackernews_1 = require("./extractors/hackernews");
9
+ const chatgpt_1 = require("./extractors/chatgpt");
10
+ const claude_1 = require("./extractors/claude");
11
+ class ExtractorRegistry {
12
+ static initialize() {
13
+ // Register all extractors with their URL patterns
14
+ this.register({
15
+ patterns: [
16
+ 'twitter.com',
17
+ /\/x\.com\/.*/,
18
+ ],
19
+ extractor: twitter_1.TwitterExtractor
20
+ });
21
+ this.register({
22
+ patterns: [
23
+ 'reddit.com',
24
+ 'old.reddit.com',
25
+ 'new.reddit.com',
26
+ /^https:\/\/[^\/]+\.reddit\.com/
27
+ ],
28
+ extractor: reddit_1.RedditExtractor
29
+ });
30
+ this.register({
31
+ patterns: [
32
+ 'youtube.com',
33
+ 'youtu.be',
34
+ /youtube\.com\/watch\?v=.*/,
35
+ /youtu\.be\/.*/
36
+ ],
37
+ extractor: youtube_1.YoutubeExtractor
38
+ });
39
+ this.register({
40
+ patterns: [
41
+ /news\.ycombinator\.com\/item\?id=.*/
42
+ ],
43
+ extractor: hackernews_1.HackerNewsExtractor
44
+ });
45
+ this.register({
46
+ patterns: [
47
+ /^https?:\/\/chatgpt\.com\/(c|share)\/.*/
48
+ ],
49
+ extractor: chatgpt_1.ChatGPTExtractor
50
+ });
51
+ this.register({
52
+ patterns: [
53
+ /^https?:\/\/claude\.ai\/(chat|share)\/.*/
54
+ ],
55
+ extractor: claude_1.ClaudeExtractor
56
+ });
57
+ }
58
+ static register(mapping) {
59
+ this.mappings.push(mapping);
60
+ }
61
+ static findExtractor(document, url, schemaOrgData) {
62
+ try {
63
+ const domain = new URL(url).hostname;
64
+ // Check cache first
65
+ if (this.domainCache.has(domain)) {
66
+ const cachedExtractor = this.domainCache.get(domain);
67
+ return cachedExtractor ? new cachedExtractor(document, url, schemaOrgData) : null;
68
+ }
69
+ // Find matching extractor
70
+ for (const { patterns, extractor } of this.mappings) {
71
+ const matches = patterns.some(pattern => {
72
+ if (pattern instanceof RegExp) {
73
+ return pattern.test(url);
74
+ }
75
+ return domain.includes(pattern);
76
+ });
77
+ if (matches) {
78
+ // Cache the result
79
+ this.domainCache.set(domain, extractor);
80
+ return new extractor(document, url, schemaOrgData);
81
+ }
82
+ }
83
+ // Cache the negative result
84
+ this.domainCache.set(domain, null);
85
+ return null;
86
+ }
87
+ catch (error) {
88
+ console.error('Error in findExtractor:', error);
89
+ return null;
90
+ }
91
+ }
92
+ static clearCache() {
93
+ this.domainCache.clear();
94
+ }
95
+ }
96
+ exports.ExtractorRegistry = ExtractorRegistry;
97
+ ExtractorRegistry.mappings = [];
98
+ ExtractorRegistry.domainCache = new Map();
99
+ // Initialize extractors
100
+ ExtractorRegistry.initialize();
101
+ //# sourceMappingURL=extractor-registry.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extractor-registry.js","sourceRoot":"","sources":["../src/extractor-registry.ts"],"names":[],"mappings":";;;AAEA,aAAa;AACb,gDAAsD;AACtD,kDAAwD;AACxD,kDAAwD;AACxD,wDAA8D;AAC9D,kDAAwD;AACxD,gDAAsD;AAStD,MAAa,iBAAiB;IAI7B,MAAM,CAAC,UAAU;QAChB,kDAAkD;QAClD,IAAI,CAAC,QAAQ,CAAC;YACb,QAAQ,EAAE;gBACT,aAAa;gBACb,cAAc;aACd;YACD,SAAS,EAAE,0BAAgB;SAC3B,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC;YACb,QAAQ,EAAE;gBACT,YAAY;gBACZ,gBAAgB;gBAChB,gBAAgB;gBAChB,gCAAgC;aAChC;YACD,SAAS,EAAE,wBAAe;SAC1B,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC;YACb,QAAQ,EAAE;gBACT,aAAa;gBACb,UAAU;gBACV,2BAA2B;gBAC3B,eAAe;aACf;YACD,SAAS,EAAE,0BAAgB;SAC3B,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC;YACb,QAAQ,EAAE;gBACT,qCAAqC;aACrC;YACD,SAAS,EAAE,gCAAmB;SAC9B,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC;YACb,QAAQ,EAAE;gBACT,yCAAyC;aACzC;YACD,SAAS,EAAE,0BAAgB;SAC3B,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC;YACb,QAAQ,EAAE;gBACT,0CAA0C;aAC1C;YACD,SAAS,EAAE,wBAAe;SAC1B,CAAC,CAAC;IACJ,CAAC;IAED,MAAM,CAAC,QAAQ,CAAC,OAAyB;QACxC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAC7B,CAAC;IAED,MAAM,CAAC,aAAa,CAAC,QAAkB,EAAE,GAAW,EAAE,aAAmB;QACxE,IAAI,CAAC;YACJ,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YAErC,oBAAoB;YACpB,IAAI,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC;gBAClC,MAAM,eAAe,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;gBACrD,OAAO,eAAe,CAAC,CAAC,CAAC,IAAI,eAAe,CAAC,QAAQ,EAAE,GAAG,EAAE,aAAa,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;YACnF,CAAC;YAED,0BAA0B;YAC1B,KAAK,MAAM,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;gBACrD,MAAM,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE;oBACvC,IAAI,OAAO,YAAY,MAAM,EAAE,CAAC;wBAC/B,OAAO,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;oBAC1B,CAAC;oBACD,OAAO,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;gBACjC,CAAC,CAAC,CAAC;gBAEH,IAAI,OAAO,EAAE,CAAC;oBACb,mBAAmB;oBACnB,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;oBACxC,OAAO,IAAI,SAAS,CAAC,QAAQ,EAAE,GAAG,EAAE,aAAa,CAAC,CAAC;gBACpD,CAAC;YACF,CAAC;YAED,4BAA4B;YAC5B,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;YACnC,OAAO,IAAI,CAAC;Q
AEb,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YAChB,OAAO,CAAC,KAAK,CAAC,yBAAyB,EAAE,KAAK,CAAC,CAAC;YAChD,OAAO,IAAI,CAAC;QACb,CAAC;IACF,CAAC;IAED,MAAM,CAAC,UAAU;QAChB,IAAI,CAAC,WAAW,CAAC,KAAK,EAAE,CAAC;IAC1B,CAAC;;AAlGF,8CAmGC;AAlGe,0BAAQ,GAAuB,EAAE,CAAC;AAClC,6BAAW,GAA6C,IAAI,GAAG,EAAE,CAAC;AAmGlF,wBAAwB;AACxB,iBAAiB,CAAC,UAAU,EAAE,CAAC"}
@@ -0,0 +1,9 @@
1
+ import { ExtractorResult } from '../types/extractors';
2
+ export declare abstract class BaseExtractor {
3
+ protected document: Document;
4
+ protected url: string;
5
+ protected schemaOrgData?: any;
6
+ constructor(document: Document, url: string, schemaOrgData?: any);
7
+ abstract canExtract(): boolean;
8
+ abstract extract(): ExtractorResult;
9
+ }
@@ -0,0 +1,12 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.BaseExtractor = void 0;
4
+ class BaseExtractor {
5
+ constructor(document, url, schemaOrgData) {
6
+ this.document = document;
7
+ this.url = url;
8
+ this.schemaOrgData = schemaOrgData;
9
+ }
10
+ }
11
+ exports.BaseExtractor = BaseExtractor;
12
+ //# sourceMappingURL=_base.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"_base.js","sourceRoot":"","sources":["../../src/extractors/_base.ts"],"names":[],"mappings":";;;AAEA,MAAsB,aAAa;IAKlC,YAAY,QAAkB,EAAE,GAAW,EAAE,aAAmB;QAC/D,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;QACzB,IAAI,CAAC,GAAG,GAAG,GAAG,CAAC;QACf,IAAI,CAAC,aAAa,GAAG,aAAa,CAAC;IACpC,CAAC;CAID;AAbD,sCAaC"}
@@ -0,0 +1,9 @@
1
+ import { BaseExtractor } from './_base';
2
+ import { ConversationMessage, ConversationMetadata, Footnote, ExtractorResult } from '../types/extractors';
3
+ export declare abstract class ConversationExtractor extends BaseExtractor {
4
+ protected abstract extractMessages(): ConversationMessage[];
5
+ protected abstract getMetadata(): ConversationMetadata;
6
+ protected getFootnotes(): Footnote[];
7
+ extract(): ExtractorResult;
8
+ protected createContentHtml(messages: ConversationMessage[], footnotes: Footnote[]): string;
9
+ }
@@ -0,0 +1,77 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.ConversationExtractor = void 0;
4
+ const _base_1 = require("./_base");
5
+ const defuddle_1 = require("../defuddle");
6
/**
 * Base class for extractors that turn a chat transcript into article-style HTML.
 *
 * Subclasses provide `extractMessages()` and `getMetadata()` (and may override
 * `getFootnotes()`); this class renders the messages and footnotes to HTML and
 * runs that HTML through Defuddle to produce the final content.
 */
class ConversationExtractor extends _base_1.BaseExtractor {
    /**
     * Footnotes to render after the messages.
     * @returns An empty list by default; subclasses override this to supply footnotes.
     */
    getFootnotes() {
        return [];
    }
    /**
     * Builds the extraction result for the conversation.
     * @returns The Defuddle-processed HTML (as both `content` and `contentHtml`),
     *          the message count, and title/site/description/wordCount variables.
     */
    extract() {
        const messages = this.extractMessages();
        const metadata = this.getMetadata();
        const footnotes = this.getFootnotes();
        const rawContentHtml = this.createContentHtml(messages, footnotes);
        // Create a temporary document to run Defuddle on our content.
        // Prefer the document under extraction over the global `document`: the
        // global does not exist outside a browser (e.g. a JSDOM document in Node),
        // while the constructor always receives a document. The global is only
        // consulted as a fallback, which preserves the previous behavior.
        const domImplementation = this.document?.implementation ?? document.implementation;
        const tempDoc = domImplementation.createHTMLDocument();
        const container = tempDoc.createElement('article');
        container.innerHTML = rawContentHtml;
        tempDoc.body.appendChild(container);
        // Run Defuddle on our formatted content
        const defuddled = new defuddle_1.Defuddle(tempDoc).parse();
        const contentHtml = defuddled.content;
        return {
            content: contentHtml,
            contentHtml: contentHtml,
            extractedContent: {
                messageCount: messages.length.toString(),
            },
            variables: {
                title: metadata.title || 'Conversation',
                site: metadata.site,
                description: metadata.description || `${metadata.site} conversation with ${messages.length} messages`,
                wordCount: defuddled.wordCount?.toString() || '',
            }
        };
    }
    /**
     * Renders messages and footnotes to an HTML string.
     *
     * Message content, author text and timestamps are inserted as markup
     * unchanged; values placed inside attributes are escaped.
     *
     * @param messages Messages to render, in order; consecutive messages are separated by `<hr>`.
     * @param footnotes Footnotes to list after the messages; omitted entirely when empty.
     * @returns The trimmed HTML string.
     */
    createContentHtml(messages, footnotes) {
        // Escape a value for use inside a double-quoted HTML attribute so that a
        // quote or angle bracket in the data cannot terminate the attribute.
        const escapeAttribute = (value) => String(value)
            .replace(/&/g, '&amp;')
            .replace(/"/g, '&quot;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;');
        const messagesHtml = messages.map((message, index) => {
            const timestampHtml = message.timestamp ?
                `<div class="message-timestamp">${message.timestamp}</div>` : '';
            // Check if content already has paragraph tags
            const hasParagraphs = /<p[^>]*>[\s\S]*?<\/p>/i.test(message.content);
            const contentHtml = hasParagraphs ? message.content : `<p>${message.content}</p>`;
            // Add metadata to data attributes
            const dataAttributes = message.metadata ?
                Object.entries(message.metadata)
                    .map(([key, value]) => `data-${key}="${escapeAttribute(value)}"`)
                    .join(' ') : '';
            return `
			<div class="message message-${escapeAttribute(message.author.toLowerCase())}" ${dataAttributes}>
				<div class="message-header">
					<p class="message-author"><strong>${message.author}</strong></p>
					${timestampHtml}
				</div>
				<div class="message-content">
					${contentHtml}
				</div>
			</div>${index < messages.length - 1 ? '\n<hr>' : ''}`;
        }).join('\n').trim();
        // Add footnotes section if we have any
        const footnotesHtml = footnotes.length > 0 ? `
			<div id="footnotes">
				<ol>
					${footnotes.map((footnote, index) => `
						<li class="footnote" id="fn:${index + 1}">
							<p>
								<a href="${escapeAttribute(footnote.url)}" target="_blank">${footnote.text}</a>&nbsp;<a href="#fnref:${index + 1}" class="footnote-backref">↩</a>
							</p>
						</li>
					`).join('')}
				</ol>
			</div>` : '';
        return `${messagesHtml}\n${footnotesHtml}`.trim();
    }
}
76
+ exports.ConversationExtractor = ConversationExtractor;
77
+ //# sourceMappingURL=_conversation.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"_conversation.js","sourceRoot":"","sources":["../../src/extractors/_conversation.ts"],"names":[],"mappings":";;;AAAA,mCAAwC;AAExC,0CAAuC;AAEvC,MAAsB,qBAAsB,SAAQ,qBAAa;IAGtD,YAAY;QACrB,OAAO,EAAE,CAAC;IACX,CAAC;IAED,OAAO;QACN,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,EAAE,CAAC;QACxC,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC;QACpC,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC;QACtC,MAAM,cAAc,GAAG,IAAI,CAAC,iBAAiB,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;QAEnE,6DAA6D;QAC7D,MAAM,OAAO,GAAG,QAAQ,CAAC,cAAc,CAAC,kBAAkB,EAAE,CAAC;QAC7D,MAAM,SAAS,GAAG,OAAO,CAAC,aAAa,CAAC,SAAS,CAAC,CAAC;QACnD,SAAS,CAAC,SAAS,GAAG,cAAc,CAAC;QACrC,OAAO,CAAC,IAAI,CAAC,WAAW,CAAC,SAAS,CAAC,CAAC;QAEpC,wCAAwC;QACxC,MAAM,SAAS,GAAG,IAAI,mBAAQ,CAAC,OAAO,CAAC,CAAC,KAAK,EAAE,CAAC;QAChD,MAAM,WAAW,GAAG,SAAS,CAAC,OAAO,CAAC;QAEtC,OAAO;YACN,OAAO,EAAE,WAAW;YACpB,WAAW,EAAE,WAAW;YACxB,gBAAgB,EAAE;gBACjB,YAAY,EAAE,QAAQ,CAAC,MAAM,CAAC,QAAQ,EAAE;aACxC;YACD,SAAS,EAAE;gBACV,KAAK,EAAE,QAAQ,CAAC,KAAK,IAAI,cAAc;gBACvC,IAAI,EAAE,QAAQ,CAAC,IAAI;gBACnB,WAAW,EAAE,QAAQ,CAAC,WAAW,IAAI,GAAG,QAAQ,CAAC,IAAI,sBAAsB,QAAQ,CAAC,MAAM,WAAW;gBACrG,SAAS,EAAE,SAAS,CAAC,SAAS,EAAE,QAAQ,EAAE,IAAI,EAAE;aAChD;SACD,CAAC;IACH,CAAC;IAES,iBAAiB,CAAC,QAA+B,EAAE,SAAqB;QACjF,MAAM,YAAY,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,KAAK,EAAE,EAAE;YACpD,MAAM,aAAa,GAAG,OAAO,CAAC,SAAS,CAAC,CAAC;gBACxC,kCAAkC,OAAO,CAAC,SAAS,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC;YAElE,8CAA8C;YAC9C,MAAM,aAAa,GAAG,wBAAwB,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;YACrE,MAAM,WAAW,GAAG,aAAa,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,OAAO,CAAC,OAAO,MAAM,CAAC;YAElF,kCAAkC;YAClC,MAAM,cAAc,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;gBACxC,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,QAAQ,CAAC;qBAC9B,GAAG,CAAC,CAAC,CAAC,GAAG,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,QAAQ,GAAG,KAAK,KAAK,GAAG,CAAC;qBAC/C,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAElB,OAAO;iCACuB,OAAO,CAAC,MAAM,CAAC,WAAW,EAAE,KAAK,cAAc;;yCAEvC,OAAO,CAAC,MAAM;OAChD,aAAa;;;OAGb,WAAW;;WAEP,KAAK,GAAG,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;QAC
vD,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;QAErB,uCAAuC;QACvC,MAAM,aAAa,GAAG,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC;;;OAGxC,SAAS,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,KAAK,EAAE,EAAE,CAAC;oCACN,KAAK,GAAG,CAAC;;mBAE1B,QAAQ,CAAC,GAAG,qBAAqB,QAAQ,CAAC,IAAI,6BAA6B,KAAK,GAAG,CAAC;;;MAGjG,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC;;UAEN,CAAC,CAAC,CAAC,EAAE,CAAC;QAEd,OAAO,GAAG,YAAY,KAAK,aAAa,EAAE,CAAC,IAAI,EAAE,CAAC;IACnD,CAAC;CACD;AAjFD,sDAiFC"}
@@ -0,0 +1,13 @@
1
+ import { ConversationExtractor } from './_conversation';
2
+ import { ConversationMessage, ConversationMetadata, Footnote } from '../types/extractors';
3
+ export declare class ChatGPTExtractor extends ConversationExtractor { // Concrete ConversationExtractor; NOTE(review): implementation is not visible here - presumably targets ChatGPT conversation pages.
4
+ private articles; // Private state; its type is not emitted in this declaration.
5
+ private footnotes; // Private state; presumably backs getFootnotes() - confirm in chatgpt.js.
6
+ private footnoteCounter; // Private state; its type is not emitted in this declaration.
7
+ constructor(document: Document, url: string); // Takes only document and url (no schemaOrgData parameter).
8
+ canExtract(): boolean; // Concrete implementation of the abstract BaseExtractor.canExtract().
9
+ protected extractMessages(): ConversationMessage[]; // Concrete implementation of the abstract ConversationExtractor.extractMessages().
10
+ protected getFootnotes(): Footnote[]; // Redeclares the ConversationExtractor method of the same name.
11
+ protected getMetadata(): ConversationMetadata; // Concrete implementation of the abstract ConversationExtractor.getMetadata().
12
+ private getTitle; // Private member; signature is not emitted in this declaration.
13
+ }