defuddle 0.12.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/README.md +59 -21
  2. package/dist/cli.js +54 -49
  3. package/dist/cli.js.map +1 -1
  4. package/dist/constants.d.ts +9 -0
  5. package/dist/constants.js +50 -10
  6. package/dist/constants.js.map +1 -1
  7. package/dist/defuddle.d.ts +8 -1
  8. package/dist/defuddle.js +404 -86
  9. package/dist/defuddle.js.map +1 -1
  10. package/dist/elements/callouts.d.ts +6 -0
  11. package/dist/elements/callouts.js +74 -0
  12. package/dist/elements/callouts.js.map +1 -0
  13. package/dist/elements/code.js +31 -9
  14. package/dist/elements/code.js.map +1 -1
  15. package/dist/elements/headings.d.ts +6 -0
  16. package/dist/elements/headings.js +55 -50
  17. package/dist/elements/headings.js.map +1 -1
  18. package/dist/elements/images.js +10 -1
  19. package/dist/elements/images.js.map +1 -1
  20. package/dist/elements/math.base.js +1 -4
  21. package/dist/elements/math.base.js.map +1 -1
  22. package/dist/extractor-registry.d.ts +5 -5
  23. package/dist/extractor-registry.js +8 -8
  24. package/dist/extractor-registry.js.map +1 -1
  25. package/dist/extractors/_base.d.ts +6 -1
  26. package/dist/extractors/_base.js +2 -1
  27. package/dist/extractors/_base.js.map +1 -1
  28. package/dist/extractors/github.js +3 -3
  29. package/dist/extractors/github.js.map +1 -1
  30. package/dist/extractors/hackernews.js +1 -1
  31. package/dist/extractors/hackernews.js.map +1 -1
  32. package/dist/extractors/reddit.js +7 -4
  33. package/dist/extractors/reddit.js.map +1 -1
  34. package/dist/extractors/twitter.js +3 -1
  35. package/dist/extractors/twitter.js.map +1 -1
  36. package/dist/extractors/youtube.d.ts +35 -2
  37. package/dist/extractors/youtube.js +359 -30
  38. package/dist/extractors/youtube.js.map +1 -1
  39. package/dist/fetch.d.ts +13 -0
  40. package/dist/fetch.js +181 -0
  41. package/dist/fetch.js.map +1 -0
  42. package/dist/index.full.js +1 -1
  43. package/dist/index.js +1 -1
  44. package/dist/markdown.js +81 -33
  45. package/dist/markdown.js.map +1 -1
  46. package/dist/metadata.js +1 -1
  47. package/dist/metadata.js.map +1 -1
  48. package/dist/node.d.ts +12 -5
  49. package/dist/node.js +53 -17
  50. package/dist/node.js.map +1 -1
  51. package/dist/scoring.js +15 -10
  52. package/dist/scoring.js.map +1 -1
  53. package/dist/standardize.js +112 -60
  54. package/dist/standardize.js.map +1 -1
  55. package/dist/types.d.ts +14 -0
  56. package/dist/utils/dom.d.ts +5 -0
  57. package/dist/utils/dom.js +8 -0
  58. package/dist/utils/dom.js.map +1 -1
  59. package/dist/utils/linkedom-compat.d.ts +5 -0
  60. package/dist/utils/linkedom-compat.js +23 -0
  61. package/dist/utils/linkedom-compat.js.map +1 -0
  62. package/dist/utils.d.ts +6 -0
  63. package/dist/utils.js +36 -0
  64. package/dist/utils.js.map +1 -1
  65. package/package.json +3 -4
package/dist/utils.js CHANGED
@@ -7,6 +7,7 @@ exports.getComputedStyle = getComputedStyle;
7
7
  exports.getWindow = getWindow;
8
8
  exports.textPreview = textPreview;
9
9
  exports.logDebug = logDebug;
10
+ exports.countWords = countWords;
10
11
  const NODE_TYPE = {
11
12
  ELEMENT_NODE: 1,
12
13
  ATTRIBUTE_NODE: 2,
@@ -59,4 +60,39 @@ function logDebug(debug, message, ...args) {
59
60
  console.log('Defuddle:', message, ...args);
60
61
  }
61
62
  }
63
/**
 * Report whether a code point is a CJK character that should be counted
 * as a word on its own.
 *
 * @param {number} code - Unicode code point.
 * @returns {boolean} True for kana, Hangul syllables and CJK ideographs.
 */
function isCjkCodePoint(code) {
    return ((code >= 0x3040 && code <= 0x309f) || // Hiragana
        (code >= 0x30a0 && code <= 0x30ff) || // Katakana
        (code >= 0x3400 && code <= 0x4dbf) || // CJK Extension A
        (code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs
        (code >= 0xf900 && code <= 0xfaff) || // CJK Compatibility Ideographs
        (code >= 0xac00 && code <= 0xd7af) || // Korean Hangul
        (code >= 0x20000 && code <= 0x3ffff) // Supplementary/Tertiary Ideographic Planes (Extension B and later)
    );
}
/**
 * Report whether a code point ends the current word.
 *
 * Covers everything the previous `code <= 32` test covered (ASCII
 * whitespace and C0 controls) plus the non-ASCII characters matched by
 * the regular-expression class `\s`, so that e.g. a no-break space or an
 * ideographic space separates words instead of being counted as text.
 *
 * @param {number} code - Unicode code point.
 * @returns {boolean} True when the code point is a word separator.
 */
function isWordSeparator(code) {
    return (code <= 32 || // ASCII whitespace and control characters
        code === 0x00a0 || // no-break space
        code === 0x1680 || // Ogham space mark
        (code >= 0x2000 && code <= 0x200a) || // en quad .. hair space
        code === 0x2028 || // line separator
        code === 0x2029 || // paragraph separator
        code === 0x202f || // narrow no-break space
        code === 0x205f || // medium mathematical space
        code === 0x3000 || // ideographic space
        code === 0xfeff // zero width no-break space (BOM)
    );
}
/**
 * Count words in text, handling CJK characters (Chinese, Japanese, Korean).
 * CJK characters are counted individually since they don't use spaces between words.
 * Non-CJK text is counted as runs of characters delimited by whitespace
 * (including Unicode whitespace such as no-break and ideographic spaces).
 *
 * @param {string} text - Text to measure; any falsy value counts as empty.
 * @returns {number} Number of CJK characters plus number of non-CJK words.
 */
function countWords(text) {
    if (!text)
        return 0;
    let cjkCount = 0;
    let wordCount = 0;
    let inWord = false;
    for (let i = 0; i < text.length; i++) {
        // Read a full code point so astral ideographs (surrogate pairs)
        // are classified as one character rather than two opaque halves.
        const code = text.codePointAt(i);
        if (code > 0xffff) {
            // Skip the low surrogate that belongs to this code point.
            i++;
        }
        if (isCjkCodePoint(code)) {
            cjkCount++;
            inWord = false;
        }
        else if (isWordSeparator(code)) {
            inWord = false;
        }
        else if (!inWord) {
            // First character of a new non-CJK word.
            wordCount++;
            inWord = true;
        }
    }
    return cjkCount + wordCount;
}
62
98
  //# sourceMappingURL=utils.js.map
package/dist/utils.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"utils.js","sourceRoot":"","sources":["../src/utils.ts"],"names":[],"mappings":";;AAeA,8BAEC;AAED,gCAEC;AAED,sCAEC;AAED,4CAIC;AAED,8BAiBC;AAED,kCAEC;AAED,4BAIC;AA5DD,MAAM,SAAS,GAAG;IACjB,YAAY,EAAE,CAAC;IACf,cAAc,EAAE,CAAC;IACjB,SAAS,EAAE,CAAC;IACZ,kBAAkB,EAAE,CAAC;IACrB,qBAAqB,EAAE,CAAC;IACxB,WAAW,EAAE,CAAC;IACd,2BAA2B,EAAE,CAAC;IAC9B,YAAY,EAAE,CAAC;IACf,aAAa,EAAE,CAAC;IAChB,kBAAkB,EAAE,EAAE;IACtB,sBAAsB,EAAE,EAAE;IAC1B,aAAa,EAAE,EAAE;CACjB,CAAC;AAEF,SAAgB,SAAS,CAAC,IAAU;IACnC,OAAO,IAAI,CAAC,QAAQ,KAAK,SAAS,CAAC,YAAY,CAAC;AACjD,CAAC;AAED,SAAgB,UAAU,CAAC,IAAU;IACpC,OAAO,IAAI,CAAC,QAAQ,KAAK,SAAS,CAAC,SAAS,CAAC;AAC9C,CAAC;AAED,SAAgB,aAAa,CAAC,IAAU;IACvC,OAAO,IAAI,CAAC,QAAQ,KAAK,SAAS,CAAC,YAAY,CAAC;AACjD,CAAC;AAED,SAAgB,gBAAgB,CAAC,OAAgB;IAChD,MAAM,GAAG,GAAG,SAAS,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC;IAC7C,IAAI,CAAC,GAAG;QAAE,OAAO,IAAI,CAAC;IACtB,OAAO,GAAG,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC;AACtC,CAAC;AAED,SAAgB,SAAS,CAAC,GAAa;IACtC,wBAAwB;IACxB,IAAI,GAAG,CAAC,WAAW,EAAE,CAAC;QACrB,OAAO,GAAG,CAAC,WAAW,CAAC;IACxB,CAAC;IAED,uBAAuB;IACvB,IAAK,GAAW,CAAC,WAAW,EAAE,CAAC;QAC9B,OAAQ,GAAW,CAAC,WAAW,CAAC;IACjC,CAAC;IAED,0CAA0C;IAC1C,IAAK,GAAW,CAAC,MAAM,EAAE,CAAC;QACzB,OAAQ,GAAW,CAAC,MAAM,CAAC;IAC5B,CAAC;IAED,OAAO,IAAI,CAAC;AACb,CAAC;AAED,SAAgB,WAAW,CAAC,EAAW;IACtC,OAAO,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;AACxD,CAAC;AAED,SAAgB,QAAQ,CAAC,KAAc,EAAE,OAAe,EAAE,GAAG,IAAW;IACvE,IAAI,KAAK,EAAE,CAAC;QACX,OAAO,CAAC,GAAG,CAAC,WAAW,EAAE,OAAO,EAAE,GAAG,IAAI,CAAC,CAAC;IAC5C,CAAC;AACF,CAAC"}
1
+ {"version":3,"file":"utils.js","sourceRoot":"","sources":["../src/utils.ts"],"names":[],"mappings":";;AAeA,8BAEC;AAED,gCAEC;AAED,sCAEC;AAED,4CAIC;AAED,8BAiBC;AAED,kCAEC;AAED,4BAIC;AAOD,gCA+BC;AAlGD,MAAM,SAAS,GAAG;IACjB,YAAY,EAAE,CAAC;IACf,cAAc,EAAE,CAAC;IACjB,SAAS,EAAE,CAAC;IACZ,kBAAkB,EAAE,CAAC;IACrB,qBAAqB,EAAE,CAAC;IACxB,WAAW,EAAE,CAAC;IACd,2BAA2B,EAAE,CAAC;IAC9B,YAAY,EAAE,CAAC;IACf,aAAa,EAAE,CAAC;IAChB,kBAAkB,EAAE,EAAE;IACtB,sBAAsB,EAAE,EAAE;IAC1B,aAAa,EAAE,EAAE;CACjB,CAAC;AAEF,SAAgB,SAAS,CAAC,IAAU;IACnC,OAAO,IAAI,CAAC,QAAQ,KAAK,SAAS,CAAC,YAAY,CAAC;AACjD,CAAC;AAED,SAAgB,UAAU,CAAC,IAAU;IACpC,OAAO,IAAI,CAAC,QAAQ,KAAK,SAAS,CAAC,SAAS,CAAC;AAC9C,CAAC;AAED,SAAgB,aAAa,CAAC,IAAU;IACvC,OAAO,IAAI,CAAC,QAAQ,KAAK,SAAS,CAAC,YAAY,CAAC;AACjD,CAAC;AAED,SAAgB,gBAAgB,CAAC,OAAgB;IAChD,MAAM,GAAG,GAAG,SAAS,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC;IAC7C,IAAI,CAAC,GAAG;QAAE,OAAO,IAAI,CAAC;IACtB,OAAO,GAAG,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC;AACtC,CAAC;AAED,SAAgB,SAAS,CAAC,GAAa;IACtC,wBAAwB;IACxB,IAAI,GAAG,CAAC,WAAW,EAAE,CAAC;QACrB,OAAO,GAAG,CAAC,WAAW,CAAC;IACxB,CAAC;IAED,uBAAuB;IACvB,IAAK,GAAW,CAAC,WAAW,EAAE,CAAC;QAC9B,OAAQ,GAAW,CAAC,WAAW,CAAC;IACjC,CAAC;IAED,0CAA0C;IAC1C,IAAK,GAAW,CAAC,MAAM,EAAE,CAAC;QACzB,OAAQ,GAAW,CAAC,MAAM,CAAC;IAC5B,CAAC;IAED,OAAO,IAAI,CAAC;AACb,CAAC;AAED,SAAgB,WAAW,CAAC,EAAW;IACtC,OAAO,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;AACxD,CAAC;AAED,SAAgB,QAAQ,CAAC,KAAc,EAAE,OAAe,EAAE,GAAG,IAAW;IACvE,IAAI,KAAK,EAAE,CAAC;QACX,OAAO,CAAC,GAAG,CAAC,WAAW,EAAE,OAAO,EAAE,GAAG,IAAI,CAAC,CAAC;IAC5C,CAAC;AACF,CAAC;AAED;;;;GAIG;AACH,SAAgB,UAAU,CAAC,IAAY;IACtC,IAAI,CAAC,IAAI;QAAE,OAAO,CAAC,CAAC;IAEpB,IAAI,QAAQ,GAAG,CAAC,CAAC;IACjB,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,IAAI,MAAM,GAAG,KAAK,CAAC;IAEnB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QAEhC,8DAA8D;QAC9D,gEAAgE;QAChE,IACC,CAAC,IAAI,IAAI,MAAM,IAAI,IAAI,IAAI,MAAM,CAAC,IAAI,WAAW;YACjD,CAAC,IAAI,IAAI,MAAM,IAAI,I
AAI,IAAI,MAAM,CAAC,IAAI,WAAW;YACjD,CAAC,IAAI,IAAI,MAAM,IAAI,IAAI,IAAI,MAAM,CAAC,IAAI,kBAAkB;YACxD,CAAC,IAAI,IAAI,MAAM,IAAI,IAAI,IAAI,MAAM,CAAC,IAAI,yBAAyB;YAC/D,CAAC,IAAI,IAAI,MAAM,IAAI,IAAI,IAAI,MAAM,CAAC,IAAI,+BAA+B;YACrE,CAAC,IAAI,IAAI,MAAM,IAAI,IAAI,IAAI,MAAM,CAAC,CAAI,gBAAgB;UACrD,CAAC;YACF,QAAQ,EAAE,CAAC;YACX,MAAM,GAAG,KAAK,CAAC;QAChB,CAAC;aAAM,IAAI,IAAI,IAAI,EAAE,EAAE,CAAC;YACvB,MAAM,GAAG,KAAK,CAAC;QAChB,CAAC;aAAM,IAAI,CAAC,MAAM,EAAE,CAAC;YACpB,SAAS,EAAE,CAAC;YACZ,MAAM,GAAG,IAAI,CAAC;QACf,CAAC;IACF,CAAC;IAED,OAAO,QAAQ,GAAG,SAAS,CAAC;AAC7B,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "defuddle",
3
- "version": "0.12.0",
3
+ "version": "0.14.0",
4
4
  "description": "Extract article content and metadata from web pages.",
5
5
  "main": "dist/index.js",
6
6
  "bin": {
@@ -49,6 +49,7 @@
49
49
  "dev:node": "tsc --project tsconfig.node.json --watch",
50
50
  "dev": "npm run clean && npm run build:types && concurrently \"npm run dev:types\" \"npm run dev:node\" \"npm run dev:js\"",
51
51
  "test": "TZ=UTC vitest run",
52
+ "test:jsdom": "TZ=UTC DOM=jsdom vitest run",
52
53
  "playground": "node playground/server.js"
53
54
  },
54
55
  "keywords": [
@@ -75,13 +76,11 @@
75
76
  "commander": "^12.1.0"
76
77
  },
77
78
  "optionalDependencies": {
79
+ "linkedom": "^0.18.12",
78
80
  "mathml-to-latex": "^1.5.0",
79
81
  "temml": "^0.13.1",
80
82
  "turndown": "^7.2.0"
81
83
  },
82
- "peerDependencies": {
83
- "jsdom": "^24.0.0"
84
- },
85
84
  "devDependencies": {
86
85
  "@types/jsdom": "^21.1.6",
87
86
  "@types/node": "^20.19.0",