@mz1999/defuddle 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +371 -0
  3. package/dist/cli.d.ts +2 -0
  4. package/dist/cli.js +145 -0
  5. package/dist/cli.js.map +1 -0
  6. package/dist/constants.d.ts +24 -0
  7. package/dist/constants.js +950 -0
  8. package/dist/constants.js.map +1 -0
  9. package/dist/defuddle.d.ts +136 -0
  10. package/dist/defuddle.js +1816 -0
  11. package/dist/defuddle.js.map +1 -0
  12. package/dist/elements/callouts.d.ts +6 -0
  13. package/dist/elements/callouts.js +74 -0
  14. package/dist/elements/callouts.js.map +1 -0
  15. package/dist/elements/code.d.ts +5 -0
  16. package/dist/elements/code.js +346 -0
  17. package/dist/elements/code.js.map +1 -0
  18. package/dist/elements/footnotes.d.ts +5 -0
  19. package/dist/elements/footnotes.js +619 -0
  20. package/dist/elements/footnotes.js.map +1 -0
  21. package/dist/elements/headings.d.ts +11 -0
  22. package/dist/elements/headings.js +100 -0
  23. package/dist/elements/headings.js.map +1 -0
  24. package/dist/elements/images.d.ts +8 -0
  25. package/dist/elements/images.js +877 -0
  26. package/dist/elements/images.js.map +1 -0
  27. package/dist/elements/math.base.d.ts +9 -0
  28. package/dist/elements/math.base.js +195 -0
  29. package/dist/elements/math.base.js.map +1 -0
  30. package/dist/elements/math.core.d.ts +7 -0
  31. package/dist/elements/math.core.js +52 -0
  32. package/dist/elements/math.core.js.map +1 -0
  33. package/dist/elements/math.d.ts +2 -0
  34. package/dist/elements/math.full.d.ts +8 -0
  35. package/dist/elements/math.js +7 -0
  36. package/dist/elements/math.js.map +1 -0
  37. package/dist/extractor-registry.d.ts +16 -0
  38. package/dist/extractor-registry.js +140 -0
  39. package/dist/extractor-registry.js.map +1 -0
  40. package/dist/extractors/_base.d.ts +22 -0
  41. package/dist/extractors/_base.js +27 -0
  42. package/dist/extractors/_base.js.map +1 -0
  43. package/dist/extractors/_conversation.d.ts +9 -0
  44. package/dist/extractors/_conversation.js +78 -0
  45. package/dist/extractors/_conversation.js.map +1 -0
  46. package/dist/extractors/chatgpt.d.ts +14 -0
  47. package/dist/extractors/chatgpt.js +138 -0
  48. package/dist/extractors/chatgpt.js.map +1 -0
  49. package/dist/extractors/claude.d.ts +10 -0
  50. package/dist/extractors/claude.js +91 -0
  51. package/dist/extractors/claude.js.map +1 -0
  52. package/dist/extractors/gemini.d.ts +14 -0
  53. package/dist/extractors/gemini.js +111 -0
  54. package/dist/extractors/gemini.js.map +1 -0
  55. package/dist/extractors/github.d.ts +20 -0
  56. package/dist/extractors/github.js +251 -0
  57. package/dist/extractors/github.js.map +1 -0
  58. package/dist/extractors/grok.d.ts +15 -0
  59. package/dist/extractors/grok.js +142 -0
  60. package/dist/extractors/grok.js.map +1 -0
  61. package/dist/extractors/hackernews.d.ts +21 -0
  62. package/dist/extractors/hackernews.js +155 -0
  63. package/dist/extractors/hackernews.js.map +1 -0
  64. package/dist/extractors/reddit.d.ts +22 -0
  65. package/dist/extractors/reddit.js +197 -0
  66. package/dist/extractors/reddit.js.map +1 -0
  67. package/dist/extractors/twitter.d.ts +16 -0
  68. package/dist/extractors/twitter.js +204 -0
  69. package/dist/extractors/twitter.js.map +1 -0
  70. package/dist/extractors/x-article.d.ts +24 -0
  71. package/dist/extractors/x-article.js +267 -0
  72. package/dist/extractors/x-article.js.map +1 -0
  73. package/dist/extractors/x-oembed.d.ts +20 -0
  74. package/dist/extractors/x-oembed.js +350 -0
  75. package/dist/extractors/x-oembed.js.map +1 -0
  76. package/dist/extractors/youtube.d.ts +87 -0
  77. package/dist/extractors/youtube.js +869 -0
  78. package/dist/extractors/youtube.js.map +1 -0
  79. package/dist/fetch.d.ts +18 -0
  80. package/dist/fetch.js +265 -0
  81. package/dist/fetch.js.map +1 -0
  82. package/dist/index.d.ts +3 -0
  83. package/dist/index.full.d.ts +12 -0
  84. package/dist/index.full.js +1 -0
  85. package/dist/index.js +1 -0
  86. package/dist/index.js.map +1 -0
  87. package/dist/markdown.d.ts +30 -0
  88. package/dist/markdown.js +661 -0
  89. package/dist/markdown.js.map +1 -0
  90. package/dist/metadata.d.ts +25 -0
  91. package/dist/metadata.js +426 -0
  92. package/dist/metadata.js.map +1 -0
  93. package/dist/node.d.ts +19 -0
  94. package/dist/node.js +78 -0
  95. package/dist/node.js.map +1 -0
  96. package/dist/scoring.d.ts +31 -0
  97. package/dist/scoring.js +472 -0
  98. package/dist/scoring.js.map +1 -0
  99. package/dist/standardize.d.ts +2 -0
  100. package/dist/standardize.js +1101 -0
  101. package/dist/standardize.js.map +1 -0
  102. package/dist/types/extractors.d.ts +41 -0
  103. package/dist/types/extractors.js +3 -0
  104. package/dist/types/extractors.js.map +1 -0
  105. package/dist/types.d.ts +135 -0
  106. package/dist/types.js +3 -0
  107. package/dist/types.js.map +1 -0
  108. package/dist/utils/comments.d.ts +44 -0
  109. package/dist/utils/comments.js +103 -0
  110. package/dist/utils/comments.js.map +1 -0
  111. package/dist/utils/dom.d.ts +42 -0
  112. package/dist/utils/dom.js +104 -0
  113. package/dist/utils/dom.js.map +1 -0
  114. package/dist/utils/linkedom-compat.d.ts +5 -0
  115. package/dist/utils/linkedom-compat.js +23 -0
  116. package/dist/utils/linkedom-compat.js.map +1 -0
  117. package/dist/utils/transcript.d.ts +37 -0
  118. package/dist/utils/transcript.js +61 -0
  119. package/dist/utils/transcript.js.map +1 -0
  120. package/dist/utils.d.ts +13 -0
  121. package/dist/utils.js +98 -0
  122. package/dist/utils.js.map +1 -0
  123. package/package.json +107 -0
@@ -0,0 +1,11 @@
1
/**
 * Removes permalink anchors nested inside the h1–h6 headings under `element`.
 * A link is removed when `isPermalinkAnchor` reports `true` for it.
 */
export declare function removeHeadingAnchors(element: Element): void;
/**
 * Reports whether `node` is an `<a>` element that looks like a heading
 * permalink, judged by its href, title, class, or symbol-only text.
 */
export declare function isPermalinkAnchor(node: Element): boolean;
/**
 * Rules describing how heading elements are rewritten.
 */
export declare const headingRules: Array<{
    /** CSS selector for the elements the rule applies to. */
    selector: string;
    /** Element setting read by the rule runner (not shown here); the heading rule uses 'keep'. */
    element: string;
    /** Produces the element that stands in for `el`. */
    transform: (el: Element) => Element;
}>;
@@ -0,0 +1,100 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.headingRules = void 0;
4
+ exports.removeHeadingAnchors = removeHeadingAnchors;
5
+ exports.isPermalinkAnchor = isPermalinkAnchor;
6
+ const constants_1 = require("../constants");
7
/**
 * Strip permalink anchors out of every heading (h1–h6) found under `element`.
 *
 * Each `<a>` nested in a heading is checked with `isPermalinkAnchor`; links
 * that qualify are detached from the DOM, all others are left untouched.
 */
function removeHeadingAnchors(element) {
    // Snapshot the matches into an array before removing anything.
    const headingLinks = Array.from(element.querySelectorAll('h1 a, h2 a, h3 a, h4 a, h5 a, h6 a'));
    for (const anchor of headingLinks) {
        if (!isPermalinkAnchor(anchor)) {
            continue;
        }
        anchor.remove();
    }
}
18
+ function isPermalinkAnchor(node) {
19
+ if (node.tagName.toLowerCase() !== 'a')
20
+ return false;
21
+ const href = node.getAttribute('href') || '';
22
+ const title = (node.getAttribute('title') || '').toLowerCase();
23
+ const className = (node.getAttribute('class') || '').toLowerCase();
24
+ const text = (node.textContent || '').trim();
25
+ if (href.startsWith('#') || href.includes('#'))
26
+ return true;
27
+ if (title.includes('permalink'))
28
+ return true;
29
+ if (className.includes('permalink') || className.includes('heading-anchor') || className.includes('anchor-link'))
30
+ return true;
31
+ if (/^[#¶§🔗]$/.test(text))
32
+ return true;
33
+ return false;
34
+ }
35
/**
 * Decide whether `node` is navigation chrome that should be dropped from a heading.
 *
 * A node qualifies when it is:
 *  - a `<button>`,
 *  - an `<a>` that `isPermalinkAnchor` accepts,
 *  - any element carrying the class `anchor` or `permalink-widget`, or
 *  - a `<span>`/`<div>` with at least one descendant `<a>` that
 *    `isPermalinkAnchor` accepts.
 *
 * @param node Element to classify.
 * @returns `true` for navigation elements, `false` otherwise.
 */
function isHeadingNavElement(node) {
    const tagName = node.tagName.toLowerCase();
    switch (tagName) {
        case 'button':
            return true;
        case 'a':
            if (isPermalinkAnchor(node)) {
                return true;
            }
            break;
    }
    const { classList } = node;
    if (classList.contains('anchor') || classList.contains('permalink-widget')) {
        return true;
    }
    // Only span/div wrappers are inspected for nested permalink anchors.
    const isWrapperTag = tagName === 'span' || tagName === 'div';
    if (!isWrapperTag) {
        return false;
    }
    for (const nestedLink of Array.from(node.querySelectorAll('a'))) {
        if (isPermalinkAnchor(nestedLink)) {
            return true;
        }
    }
    return false;
}
48
+ exports.headingRules = [
49
+ // Simplify headings by removing internal navigation elements
50
+ {
51
+ selector: 'h1, h2, h3, h4, h5, h6',
52
+ element: 'keep',
53
+ transform: (el) => {
54
+ // Get document from element's owner document
55
+ const doc = el.ownerDocument;
56
+ if (!doc) {
57
+ console.warn('No document available');
58
+ return el;
59
+ }
60
+ // Create new heading of same level
61
+ const newHeading = doc.createElement(el.tagName);
62
+ // Copy allowed attributes from original heading
63
+ Array.from(el.attributes).forEach(attr => {
64
+ if (constants_1.ALLOWED_ATTRIBUTES.has(attr.name)) {
65
+ newHeading.setAttribute(attr.name, attr.value);
66
+ }
67
+ });
68
+ // Clone the element so we can modify it without affecting the original
69
+ const clone = el.cloneNode(true);
70
+ // Single pass: collect navigation text and build removal list
71
+ const navigationText = new Map();
72
+ const toRemove = [];
73
+ Array.from(clone.querySelectorAll('*')).forEach(child => {
74
+ if (!isHeadingNavElement(child))
75
+ return;
76
+ navigationText.set(child, child.textContent?.trim() || '');
77
+ // If this element contains the only text content of its parent,
78
+ // store its text to be used for the parent
79
+ const parent = child.parentElement;
80
+ if (parent && parent !== clone &&
81
+ parent.textContent?.trim() === child.textContent?.trim()) {
82
+ navigationText.set(parent, child.textContent?.trim() || '');
83
+ }
84
+ toRemove.push(child);
85
+ });
86
+ // Remove navigation elements
87
+ toRemove.forEach(element => element.remove());
88
+ // Get the text content after removing navigation elements
89
+ let textContent = clone.textContent?.trim() || '';
90
+ // If we lost all text content but had navigation text, use that instead
91
+ if (!textContent && navigationText.size > 0) {
92
+ textContent = Array.from(navigationText.values())[0];
93
+ }
94
+ // Set the clean text content
95
+ newHeading.textContent = textContent;
96
+ return newHeading;
97
+ }
98
+ }
99
+ ];
100
+ //# sourceMappingURL=headings.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"headings.js","sourceRoot":"","sources":["../../src/elements/headings.ts"],"names":[],"mappings":";;;AAMA,oDAMC;AAED,8CAaC;AA3BD,4CAAkD;AAElD;;;GAGG;AACH,SAAgB,oBAAoB,CAAC,OAAgB;IACpD,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,gBAAgB,CAAC,oCAAoC,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE;QACzF,IAAI,iBAAiB,CAAC,IAAI,CAAC,EAAE,CAAC;YAC7B,IAAI,CAAC,MAAM,EAAE,CAAC;QACf,CAAC;IACF,CAAC,CAAC,CAAC;AACJ,CAAC;AAED,SAAgB,iBAAiB,CAAC,IAAa;IAC9C,IAAI,IAAI,CAAC,OAAO,CAAC,WAAW,EAAE,KAAK,GAAG;QAAE,OAAO,KAAK,CAAC;IACrD,MAAM,IAAI,GAAG,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;IAC7C,MAAM,KAAK,GAAG,CAAC,IAAI,CAAC,YAAY,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC,WAAW,EAAE,CAAC;IAC/D,MAAM,SAAS,GAAG,CAAC,IAAI,CAAC,YAAY,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC,WAAW,EAAE,CAAC;IACnE,MAAM,IAAI,GAAG,CAAC,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAE7C,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC;QAAE,OAAO,IAAI,CAAC;IAC5D,IAAI,KAAK,CAAC,QAAQ,CAAC,WAAW,CAAC;QAAE,OAAO,IAAI,CAAC;IAC7C,IAAI,SAAS,CAAC,QAAQ,CAAC,WAAW,CAAC,IAAI,SAAS,CAAC,QAAQ,CAAC,gBAAgB,CAAC,IAAI,SAAS,CAAC,QAAQ,CAAC,aAAa,CAAC;QAAE,OAAO,IAAI,CAAC;IAC9H,IAAI,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,IAAI,CAAC;IAExC,OAAO,KAAK,CAAC;AACd,CAAC;AAED,SAAS,mBAAmB,CAAC,IAAa;IACzC,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;IACvC,IAAI,GAAG,KAAK,QAAQ;QAAE,OAAO,IAAI,CAAC;IAClC,IAAI,GAAG,KAAK,GAAG,IAAI,iBAAiB,CAAC,IAAI,CAAC;QAAE,OAAO,IAAI,CAAC;IACxD,IAAI,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,IAAI,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,kBAAkB,CAAC;QAAE,OAAO,IAAI,CAAC;IAClG,IAAI,CAAC,GAAG,KAAK,MAAM,IAAI,GAAG,KAAK,KAAK,CAAC,IAAI,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,iBAAiB,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QACjH,OAAO,IAAI,CAAC;IACb,CAAC;IACD,OAAO,KAAK,CAAC;AACd,CAAC;AAEY,QAAA,YAAY,GAAG;IACxB,6DAA6D;IAChE;QACC,QAAQ,EAAE,wBAAwB;QAClC,OAAO,EAAE,MAAM;QACf,SAAS,EAAE,CAAC,EAAW,EAAW,EAAE;YACnC,6CAA6C;YAC7C,MAAM,GAAG,GAAG,EAAE,CAAC,aAAa,CAAC;YAC7B,IAAI,CAAC,GAAG,EAAE,CAAC;gBACV,OAAO,CA
AC,IAAI,CAAC,uBAAuB,CAAC,CAAC;gBACtC,OAAO,EAAE,CAAC;YACX,CAAC;YAED,mCAAmC;YACnC,MAAM,UAAU,GAAG,GAAG,CAAC,aAAa,CAAC,EAAE,CAAC,OAAO,CAAC,CAAC;YAEjD,gDAAgD;YAChD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE;gBACxC,IAAI,8BAAkB,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;oBACvC,UAAU,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC;gBAChD,CAAC;YACF,CAAC,CAAC,CAAC;YAEH,uEAAuE;YACvE,MAAM,KAAK,GAAG,EAAE,CAAC,SAAS,CAAC,IAAI,CAAY,CAAC;YAE5C,8DAA8D;YAC9D,MAAM,cAAc,GAAG,IAAI,GAAG,EAAmB,CAAC;YAClD,MAAM,QAAQ,GAAc,EAAE,CAAC;YAE/B,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE;gBACvD,IAAI,CAAC,mBAAmB,CAAC,KAAK,CAAC;oBAAE,OAAO;gBAExC,cAAc,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;gBAE3D,gEAAgE;gBAChE,2CAA2C;gBAC3C,MAAM,MAAM,GAAG,KAAK,CAAC,aAAa,CAAC;gBACnC,IAAI,MAAM,IAAI,MAAM,KAAK,KAAK;oBAC7B,MAAM,CAAC,WAAW,EAAE,IAAI,EAAE,KAAK,KAAK,CAAC,WAAW,EAAE,IAAI,EAAE,EAAE,CAAC;oBAC3D,cAAc,CAAC,GAAG,CAAC,MAAM,EAAE,KAAK,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;gBAC7D,CAAC;gBAED,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACtB,CAAC,CAAC,CAAC;YAEH,6BAA6B;YAC7B,QAAQ,CAAC,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;YAE9C,0DAA0D;YAC1D,IAAI,WAAW,GAAG,KAAK,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAElD,wEAAwE;YACxE,IAAI,CAAC,WAAW,IAAI,cAAc,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC;gBAC7C,WAAW,GAAG,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;YACtD,CAAC;YAED,6BAA6B;YAC7B,UAAU,CAAC,WAAW,GAAG,WAAW,CAAC;YAErC,OAAO,UAAU,CAAC;QACnB,CAAC;KACD;CACD,CAAC"}
@@ -0,0 +1,8 @@
1
/**
 * Standardization rules for handling images.
 */
export declare const imageRules: Array<{
    /** Selector string for the rule — presumably a CSS selector; the implementation is not shown here. */
    selector: string;
    /** Element setting for the rule; its accepted values are not visible from this declaration. */
    element: string;
    /** Receives the matched element and a document, and returns an element. */
    transform: (el: Element, doc: Document) => Element;
}>;