@pseolint/core 0.7.0 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/algorithms/authority/commoncrawl.d.ts +13 -0
- package/dist/algorithms/authority/commoncrawl.d.ts.map +1 -0
- package/dist/algorithms/authority/commoncrawl.js +17 -0
- package/dist/algorithms/authority/commoncrawl.js.map +1 -0
- package/dist/algorithms/authority/openpagerank.d.ts +19 -0
- package/dist/algorithms/authority/openpagerank.d.ts.map +1 -0
- package/dist/algorithms/authority/openpagerank.js +42 -0
- package/dist/algorithms/authority/openpagerank.js.map +1 -0
- package/dist/algorithms/authority/provider.d.ts +16 -0
- package/dist/algorithms/authority/provider.d.ts.map +1 -0
- package/dist/algorithms/authority/provider.js +24 -0
- package/dist/algorithms/authority/provider.js.map +1 -0
- package/dist/algorithms/auto-entity-mask.d.ts +19 -0
- package/dist/algorithms/auto-entity-mask.d.ts.map +1 -0
- package/dist/algorithms/auto-entity-mask.js +102 -0
- package/dist/algorithms/auto-entity-mask.js.map +1 -0
- package/dist/algorithms/example-regions.d.ts +22 -0
- package/dist/algorithms/example-regions.d.ts.map +1 -0
- package/dist/algorithms/example-regions.js +32 -0
- package/dist/algorithms/example-regions.js.map +1 -0
- package/dist/algorithms/fact-extraction.d.ts.map +1 -1
- package/dist/algorithms/fact-extraction.js +6 -0
- package/dist/algorithms/fact-extraction.js.map +1 -1
- package/dist/auditor.d.ts.map +1 -1
- package/dist/auditor.js +39 -9
- package/dist/auditor.js.map +1 -1
- package/dist/enrich-findings.d.ts.map +1 -1
- package/dist/enrich-findings.js +9 -8
- package/dist/enrich-findings.js.map +1 -1
- package/dist/index.d.ts +7 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +6 -0
- package/dist/index.js.map +1 -1
- package/dist/rules/aeo/crawler-access.d.ts +14 -0
- package/dist/rules/aeo/crawler-access.d.ts.map +1 -1
- package/dist/rules/aeo/crawler-access.js +96 -15
- package/dist/rules/aeo/crawler-access.js.map +1 -1
- package/dist/rules/aeo/summary-bait.d.ts.map +1 -1
- package/dist/rules/aeo/summary-bait.js +4 -3
- package/dist/rules/aeo/summary-bait.js.map +1 -1
- package/dist/rules/content/common-phrase-reuse.d.ts.map +1 -1
- package/dist/rules/content/common-phrase-reuse.js +7 -2
- package/dist/rules/content/common-phrase-reuse.js.map +1 -1
- package/dist/rules/content/eeat-signals.d.ts +13 -0
- package/dist/rules/content/eeat-signals.d.ts.map +1 -1
- package/dist/rules/content/eeat-signals.js +36 -4
- package/dist/rules/content/eeat-signals.js.map +1 -1
- package/dist/rules/content/regurgitated-content.d.ts.map +1 -1
- package/dist/rules/content/regurgitated-content.js +11 -2
- package/dist/rules/content/regurgitated-content.js.map +1 -1
- package/dist/rules/content/translation-no-op.d.ts.map +1 -1
- package/dist/rules/content/translation-no-op.js +5 -1
- package/dist/rules/content/translation-no-op.js.map +1 -1
- package/dist/rules/content/unique-value.d.ts +15 -1
- package/dist/rules/content/unique-value.d.ts.map +1 -1
- package/dist/rules/content/unique-value.js +46 -39
- package/dist/rules/content/unique-value.js.map +1 -1
- package/dist/rules/content/value-add.d.ts +8 -2
- package/dist/rules/content/value-add.d.ts.map +1 -1
- package/dist/rules/content/value-add.js +39 -48
- package/dist/rules/content/value-add.js.map +1 -1
- package/dist/rules/content/wikipedia-paraphrase.d.ts +12 -7
- package/dist/rules/content/wikipedia-paraphrase.d.ts.map +1 -1
- package/dist/rules/content/wikipedia-paraphrase.js +52 -13
- package/dist/rules/content/wikipedia-paraphrase.js.map +1 -1
- package/dist/rules/links/cluster-connectivity.d.ts +7 -1
- package/dist/rules/links/cluster-connectivity.d.ts.map +1 -1
- package/dist/rules/links/cluster-connectivity.js +8 -2
- package/dist/rules/links/cluster-connectivity.js.map +1 -1
- package/dist/rules/links/orphan-pages.d.ts +8 -1
- package/dist/rules/links/orphan-pages.d.ts.map +1 -1
- package/dist/rules/links/orphan-pages.js +10 -1
- package/dist/rules/links/orphan-pages.js.map +1 -1
- package/dist/rules/schema/consistency.d.ts.map +1 -1
- package/dist/rules/schema/consistency.js +37 -21
- package/dist/rules/schema/consistency.js.map +1 -1
- package/dist/rules/schema/json-ld-valid.d.ts.map +1 -1
- package/dist/rules/schema/json-ld-valid.js +8 -1
- package/dist/rules/schema/json-ld-valid.js.map +1 -1
- package/dist/rules/schema/required-fields.d.ts.map +1 -1
- package/dist/rules/schema/required-fields.js +47 -1
- package/dist/rules/schema/required-fields.js.map +1 -1
- package/dist/rules/spam/boilerplate-ratio.d.ts.map +1 -1
- package/dist/rules/spam/boilerplate-ratio.js +36 -22
- package/dist/rules/spam/boilerplate-ratio.js.map +1 -1
- package/dist/rules/spam/entity-swap.d.ts.map +1 -1
- package/dist/rules/spam/entity-swap.js +51 -9
- package/dist/rules/spam/entity-swap.js.map +1 -1
- package/dist/rules/spam/template-diversity.d.ts.map +1 -1
- package/dist/rules/spam/template-diversity.js +37 -2
- package/dist/rules/spam/template-diversity.js.map +1 -1
- package/dist/rules/spam/thin-content.d.ts.map +1 -1
- package/dist/rules/spam/thin-content.js +5 -1
- package/dist/rules/spam/thin-content.js.map +1 -1
- package/dist/rules/tech/canonical-consistency.d.ts.map +1 -1
- package/dist/rules/tech/canonical-consistency.js +144 -28
- package/dist/rules/tech/canonical-consistency.js.map +1 -1
- package/dist/rules/tech/og-completeness.d.ts +8 -3
- package/dist/rules/tech/og-completeness.d.ts.map +1 -1
- package/dist/rules/tech/og-completeness.js +15 -7
- package/dist/rules/tech/og-completeness.js.map +1 -1
- package/dist/rules/tech/sitemap-completeness.d.ts +14 -2
- package/dist/rules/tech/sitemap-completeness.d.ts.map +1 -1
- package/dist/rules/tech/sitemap-completeness.js +21 -5
- package/dist/rules/tech/sitemap-completeness.js.map +1 -1
- package/dist/rules/tech/soft-404.d.ts +11 -0
- package/dist/rules/tech/soft-404.d.ts.map +1 -1
- package/dist/rules/tech/soft-404.js +47 -5
- package/dist/rules/tech/soft-404.js.map +1 -1
- package/dist/template-detection.d.ts +1 -0
- package/dist/template-detection.d.ts.map +1 -1
- package/dist/template-detection.js +1 -1
- package/dist/template-detection.js.map +1 -1
- package/dist/types.d.ts +16 -1
- package/dist/types.d.ts.map +1 -1
- package/package.json +109 -93
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"regurgitated-content.js","sourceRoot":"","sources":["../../../src/rules/content/regurgitated-content.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;
|
|
1
|
+
{"version":3,"file":"regurgitated-content.js","sourceRoot":"","sources":["../../../src/rules/content/regurgitated-content.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAEnC,OAAO,EAAE,uBAAuB,EAAE,MAAM,qCAAqC,CAAC;AAE9E,MAAM,OAAO,GAAG,8BAA8B,CAAC;AAE/C;;;;;;;GAOG;AAEH,+DAA+D;AAC/D,MAAM,qBAAqB,GAAG,oBAAoB,CAAC;AAEnD,MAAM,gBAAgB,GAAG;IACvB,uBAAuB;IACvB,2BAA2B;IAC3B,0CAA0C;IAC1C,oCAAoC;CACrC,CAAC;AAEF,MAAM,iBAAiB,GAAG,8CAA8C,CAAC;AACzE,MAAM,iBAAiB,GAAG,iDAAiD,CAAC;AAE5E,MAAM,gBAAgB,GACpB,6FAA6F,CAAC;AAEhG,+EAA+E;AAC/E,kEAAkE;AAClE,MAAM,cAAc,GAAG,sDAAsD,CAAC;AAE9E,MAAM,0BAA0B,GAAG,CAAC,CAAC;AACrC,MAAM,0BAA0B,GAAG,GAAG,CAAC;AACvC,MAAM,iBAAiB,GAAG,CAAC,CAAC;AAO5B,SAAS,sBAAsB,CAAC,CAAqB;IACnD,MAAM,KAAK,GAAG,2BAA2B,CAAC;IAC1C,IAAI,qBAAqB,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAAE,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC;IACxE,MAAM,SAAS,GAAG,CAAC,CAAC,6CAA6C,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;IAC9E,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC;AACrC,CAAC;AAED,SAAS,eAAe,CAAC,GAAW;IAClC,OAAO,gBAAgB,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC;AAC7D,CAAC;AAED,SAAS,yBAAyB,CAAC,CAAqB;IACtD,MAAM,KAAK,GAAG,uCAAuC,CAAC;IACtD,MAAM,IAAI,GAAG,CAAC,CAAC,UAAU,CAAC;SACvB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;SACvC,GAAG,EAAE;SACL,MAAM,CAAC,OAAO,CAAC,CAAC;IACnB,IAAI,IAAI,CAAC,MAAM,GAAG,0BAA0B;QAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC;IAC7E,MAAM,WAAW,GAAG,IAAI,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,MAAM,CAAC;IACxD,OAAO,EAAE,KAAK,EAAE,WAAW,GAAG,IAAI,CAAC,MAAM,IAAI,0BAA0B,EAAE,KAAK,EAAE,CAAC;AACnF,CAAC;AAED,SAAS,oBAAoB,CAAC,CAAqB,EAAE,IAAY;IAC/D,MAAM,KAAK,GAAG,kCAAkC,CAAC;IACjD,IAAI,iBAAiB,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC;IAChE,MAAM,cAAc,GAClB,CAAC,CAAC,aAAa,CAAC;SACb,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;SACvC,GAAG,EAAE;SACL,IAAI,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,iBAAiB,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;IAChD,OAAO,EAAE,KAAK,EAAE,cAAc,EAAE,KAAK,EAAE,CAAC;AAC1C,CAAC;AAED,SAAS,gBAAgB,CAAC,IAAY;IACpC,OAAO,EAAE,KAAK,EAAE,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,uBAAuB,EAAE,CAAC;AAChF,CAAC;AAED,SAAS,eAAe,CAAC,IAAgB;IACvC,MAAM,EAAE,UAAU,EAAE,YAAY,EAAE,aAAa,EAAE,aAAa,EAAE,GAAG,IAAI,CAAC,aAAa,CAAC;IACtF,IAAI,CAAC,GAAG,CAAC,CAAC;IACV,IAAI,UAAU,KAAK,EAAE,IAAI,YAAY,IAAI,aAAa,IAAI,aAAa;QAAE,CAAC,IAAI,CAAC,CAAC;IAChF,IAAI,IAAI,CAAC,aAAa;QAAE,CAAC,IAAI,CAAC,CAAC;IAC/B,IAAI,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAAE,CAAC,IAAI,CAAC,CAAC;IACjE,OAAO,CAAC,CAAC;AACX,CAAC;AAED,SAAS,wBAAwB,CAAC,CAAqB,EAAE,IAAY;IACnE,MAAM,KAAK,GAAG,2DAA2D,CAAC;IAC1E,IAAI,IAAI,IAAI,CAAC;QAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC;IAC9C,MAAM,KAAK,GAAG,CAAC,CAAC,2BAA2B,CAAC;SACzC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;SACpD,MAAM,CAAC;IACV,OAAO,EAAE,KAAK,EAAE,KAAK,IAAI,iBAAiB,EAAE,KAAK,EAAE,CAAC;AACtD,CAAC;AAED,MAAM,UAAU,uBAAuB,CAAC,KAAmB;IACzD,MAAM,QAAQ,GAAiB,EAAE,CAAC;IAClC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,IAAI,EAAE,CAAC;QAC7B,IAAI,CAAC,IAAI;YAAE,SAAS;QAEpB,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC7B,6EAA6E;QAC7E,4EAA4E;QAC5E,4EAA4E;QAC5E,2EAA2E;QAC3E,6EAA6E;QAC7E,qDAAqD;QACrD,CAAC,CAAC,uBAAuB,CAAC,CAAC,MAAM,EAAE,CAAC;QACpC,MAAM,WAAW,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QAC7B,MAAM,IAAI,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC;QACnC,MAAM,OAAO,GAAmB;YAC9B,sBAAsB,CAAC,CAAC,CAAC;YACzB,yBAAyB,CAAC,CAAC,CAAC;YAC5B,oBAAoB,CAAC,CAAC,EAAE,WAAW,CAAC;YACpC,gBAAgB,CAAC,WAAW,CAAC;YAC7B,wBAAwB,CAAC,CAAC,EAAE,IAAI,CAAC;SAClC,CAAC;QAEF,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;QAC7C,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC;YAAE,SAAS;QAE/B,MAAM,UAAU,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACxD,QAAQ,CAAC,IAAI,CAAC;YACZ,MAAM,EAAE,OAAO;YACf,QAAQ,EAAE,SAAS;YACnB,OAAO,EAAE,GAAG,IAAI,CAAC,GAAG,KAAK,KAAK,CAAC,MAAM,2CAA2C,UAAU,GAAG;YAC7F,OAAO,EAAE,IAAI,CAAC,GAAG;YACjB,GAAG,EAAE,8TAA8T;YACnU,UAAU,EAAE,aAAa;SAC1B,CAAC,CAAC;IACL,CAAC;IACD,OAAO,QAAQ,CAAC;AAClB,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"translation-no-op.d.ts","sourceRoot":"","sources":["../../../src/rules/content/translation-no-op.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAoC7D;;;;;;;GAOG;AACH,wBAAgB,mBAAmB,CAAC,KAAK,EAAE,UAAU,EAAE,GAAG,UAAU,EAAE,
|
|
1
|
+
{"version":3,"file":"translation-no-op.d.ts","sourceRoot":"","sources":["../../../src/rules/content/translation-no-op.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAoC7D;;;;;;;GAOG;AACH,wBAAgB,mBAAmB,CAAC,KAAK,EAAE,UAAU,EAAE,GAAG,UAAU,EAAE,CAoErE"}
|
|
@@ -88,7 +88,11 @@ export function translationNoOpRule(pages) {
|
|
|
88
88
|
: `${(minSim * 100).toFixed(0)}%--${(maxSim * 100).toFixed(0)}%`;
|
|
89
89
|
findings.push({
|
|
90
90
|
ruleId: "content/translation-no-op",
|
|
91
|
-
|
|
91
|
+
// Warning, not error: an untranslated locale variant is a real duplicate-
|
|
92
|
+
// content gap but a should-fix, not a ship-blocker — and multilingual sites
|
|
93
|
+
// can legitimately share some body text (disclaimers, spec tables).
|
|
94
|
+
severity: "warning",
|
|
95
|
+
confidence: "medium",
|
|
92
96
|
message: `${members.length} locale variants of "${basePath}" share identical content ` +
|
|
93
97
|
`(similarity ${simLabel}). Translate the body or consolidate to the canonical version.`,
|
|
94
98
|
pageUrl: urls[0],
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"translation-no-op.js","sourceRoot":"","sources":["../../../src/rules/content/translation-no-op.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAE,eAAe,EAAE,sBAAsB,EAAE,MAAM,6BAA6B,CAAC;AAGvG,MAAM,gBAAgB,GAAG,kCAAkC,CAAC;AAC5D,MAAM,oBAAoB,GAAG,IAAI,CAAC;AAClC;;;;;;;GAOG;AACH,MAAM,+BAA+B,GAAG,EAAE,CAAC;AAE3C;;;GAGG;AACH,SAAS,iBAAiB,CAAC,QAAgB;IACzC,MAAM,CAAC,GAAG,gBAAgB,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IAC1C,IAAI,CAAC,CAAC;QAAE,OAAO,QAAQ,CAAC;IACxB,oCAAoC;IACpC,OAAO,CAAC,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;AAC9D,CAAC;AAED;;GAEG;AACH,SAAS,WAAW,CAAC,GAAW;IAC9B,IAAI,CAAC;QACH,OAAO,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;IAC/B,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,GAAG,CAAC;IACb,CAAC;AACH,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,mBAAmB,CAAC,KAAmB;IACrD,MAAM,QAAQ,GAAiB,EAAE,CAAC;IAElC,4CAA4C;IAC5C,MAAM,MAAM,GAAG,IAAI,GAAG,EAAuD,CAAC;IAE9E,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,QAAQ,GAAG,WAAW,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACvC,MAAM,CAAC,GAAG,gBAAgB,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC1C,IAAI,CAAC,CAAC;YAAE,SAAS,CAAC,2BAA2B;QAC7C,MAAM,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC;QAClC,MAAM,QAAQ,GAAG,iBAAiB,CAAC,QAAQ,CAAC,CAAC;QAC7C,MAAM,MAAM,GAAG,MAAM,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;QAC1C,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC;QAC9B,MAAM,CAAC,GAAG,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;IAC/B,CAAC;IAED,KAAK,MAAM,CAAC,QAAQ,EAAE,OAAO,CAAC,IAAI,MAAM,EAAE,CAAC;QACzC,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;YAAE,SAAS;QAEjC,uEAAuE;QACvE,uEAAuE;QACvE,wEAAwE;QACxE,kEAAkE;QAClE,MAAM,WAAW,GAAG,OAAO,CAAC,KAAK,CAC/B,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,GAAG,+BAA+B,CAChG,CAAC;QACF,IAAI,WAAW;YAAE,SAAS;QAE1B,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,eAAe,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC;QACvE,IAAI,MAAM,GAAG,CAAC,CAAC;QACf,IAAI,MAAM,GAAG,CAAC,CAAC;QACf,IAAI,QAAQ,GAAG,KAAK,CAAC;QAErB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC;YAC3C,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC/C,MAAM,GAAG,GAAG,sBAAsB,CAAC,eAAe,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;gBAC1E,IAAI,GAAG,GAAG,MAAM;oBAAE,MAAM,GAAG,GAAG,CAAC;gBAC/B,IAAI,GAAG,GAAG,MAAM;oBAAE,MAAM,GAAG,GAAG,CAAC;gBAC/B,IAAI,GAAG,IAAI,oBAAoB;oBAAE,QAAQ,GAAG,IAAI,CAAC;YACnD,CAAC;QACH,CAAC;QAED,IAAI,CAAC,QAAQ;YAAE,SAAS;QAExB,MAAM,IAAI,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC5C,MAAM,QAAQ,GACZ,MAAM,KAAK,MAAM;YACf,CAAC,CAAC,GAAG,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG;YACjC,CAAC,CAAC,GAAG,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;QAErE,QAAQ,CAAC,IAAI,CAAC;YACZ,MAAM,EAAE,2BAA2B;YACnC,QAAQ,EAAE,
|
|
1
|
+
{"version":3,"file":"translation-no-op.js","sourceRoot":"","sources":["../../../src/rules/content/translation-no-op.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAE,eAAe,EAAE,sBAAsB,EAAE,MAAM,6BAA6B,CAAC;AAGvG,MAAM,gBAAgB,GAAG,kCAAkC,CAAC;AAC5D,MAAM,oBAAoB,GAAG,IAAI,CAAC;AAClC;;;;;;;GAOG;AACH,MAAM,+BAA+B,GAAG,EAAE,CAAC;AAE3C;;;GAGG;AACH,SAAS,iBAAiB,CAAC,QAAgB;IACzC,MAAM,CAAC,GAAG,gBAAgB,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IAC1C,IAAI,CAAC,CAAC;QAAE,OAAO,QAAQ,CAAC;IACxB,oCAAoC;IACpC,OAAO,CAAC,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;AAC9D,CAAC;AAED;;GAEG;AACH,SAAS,WAAW,CAAC,GAAW;IAC9B,IAAI,CAAC;QACH,OAAO,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;IAC/B,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,GAAG,CAAC;IACb,CAAC;AACH,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,mBAAmB,CAAC,KAAmB;IACrD,MAAM,QAAQ,GAAiB,EAAE,CAAC;IAElC,4CAA4C;IAC5C,MAAM,MAAM,GAAG,IAAI,GAAG,EAAuD,CAAC;IAE9E,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,QAAQ,GAAG,WAAW,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACvC,MAAM,CAAC,GAAG,gBAAgB,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC1C,IAAI,CAAC,CAAC;YAAE,SAAS,CAAC,2BAA2B;QAC7C,MAAM,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC;QAClC,MAAM,QAAQ,GAAG,iBAAiB,CAAC,QAAQ,CAAC,CAAC;QAC7C,MAAM,MAAM,GAAG,MAAM,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;QAC1C,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC;QAC9B,MAAM,CAAC,GAAG,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;IAC/B,CAAC;IAED,KAAK,MAAM,CAAC,QAAQ,EAAE,OAAO,CAAC,IAAI,MAAM,EAAE,CAAC;QACzC,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;YAAE,SAAS;QAEjC,uEAAuE;QACvE,uEAAuE;QACvE,wEAAwE;QACxE,kEAAkE;QAClE,MAAM,WAAW,GAAG,OAAO,CAAC,KAAK,CAC/B,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,GAAG,+BAA+B,CAChG,CAAC;QACF,IAAI,WAAW;YAAE,SAAS;QAE1B,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,eAAe,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC;QACvE,IAAI,MAAM,GAAG,CAAC,CAAC;QACf,IAAI,MAAM,GAAG,CAAC,CAAC;QACf,IAAI,QAAQ,GAAG,KAAK,CAAC;QAErB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC;YAC3C,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC/C,MAAM,GAAG,GAAG,sBAAsB,CAAC,eAAe,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;gBAC1E,IAAI,GAAG,GAAG,MAAM;oBAAE,MAAM,GAAG,GAAG,CAAC;gBAC/B,IAAI,GAAG,GAAG,MAAM;oBAAE,MAAM,GAAG,GAAG,CAAC;gBAC/B,IAAI,GAAG,IAAI,oBAAoB;oBAAE,QAAQ,GAAG,IAAI,CAAC;YACnD,CAAC;QACH,CAAC;QAED,IAAI,CAAC,QAAQ;YAAE,SAAS;QAExB,MAAM,IAAI,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC5C,MAAM,QAAQ,GACZ,MAAM,KAAK,MAAM;YACf,CAAC,CAAC,GAAG,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG;YACjC,CAAC,CAAC,GAAG,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;QAErE,QAAQ,CAAC,IAAI,CAAC;YACZ,MAAM,EAAE,2BAA2B;YACnC,0EAA0E;YAC1E,4EAA4E;YAC5E,oEAAoE;YACpE,QAAQ,EAAE,SAAS;YACnB,UAAU,EAAE,QAAQ;YACpB,OAAO,EACL,GAAG,OAAO,CAAC,MAAM,wBAAwB,QAAQ,4BAA4B;gBAC7E,eAAe,QAAQ,gEAAgE;YACzF,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC;YAChB,WAAW,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC;YAC7B,GAAG,EAAE,qKAAqK;SAC3K,CAAC,CAAC;IACL,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC"}
|
|
@@ -1,3 +1,17 @@
|
|
|
1
1
|
import type { ParsedPage, RuleResult } from "../../types.js";
|
|
2
|
-
export
|
|
2
|
+
export interface UniqueValueThresholds {
|
|
3
|
+
/** Unique-content density below this fires (info). Default 0.20. */
|
|
4
|
+
passBelow: number;
|
|
5
|
+
/** Density below this escalates to error. Default 0.12. */
|
|
6
|
+
errorBelow: number;
|
|
7
|
+
}
|
|
8
|
+
/**
|
|
9
|
+
* Originality as a corpus-relative DENSITY, not an absolute count. Each distinct
|
|
10
|
+
* token is weighted by normalized IDF (ln(N/df)/ln(N)) — 1 if page-exclusive, ~0
|
|
11
|
+
* if on every page — and averaged over the page's distinct tokens. A near-
|
|
12
|
+
* duplicate / boilerplate page scores low regardless of corpus size or length; a
|
|
13
|
+
* large original page stays high. Continuous, so it doesn't shuffle at the margin.
|
|
14
|
+
* Volume is spam/thin-content's job; exact twins are spam/near-duplicate's.
|
|
15
|
+
*/
|
|
16
|
+
export declare function uniqueValueRule(pages: ParsedPage[], thresholds: UniqueValueThresholds): RuleResult[];
|
|
3
17
|
//# sourceMappingURL=unique-value.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"unique-value.d.ts","sourceRoot":"","sources":["../../../src/rules/content/unique-value.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;
|
|
1
|
+
{"version":3,"file":"unique-value.d.ts","sourceRoot":"","sources":["../../../src/rules/content/unique-value.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAE7D,MAAM,WAAW,qBAAqB;IACpC,oEAAoE;IACpE,SAAS,EAAE,MAAM,CAAC;IAClB,2DAA2D;IAC3D,UAAU,EAAE,MAAM,CAAC;CACpB;AAYD;;;;;;;GAOG;AACH,wBAAgB,eAAe,CAC7B,KAAK,EAAE,UAAU,EAAE,EACnB,UAAU,EAAE,qBAAqB,GAChC,UAAU,EAAE,CAwCd"}
|
|
@@ -1,51 +1,58 @@
|
|
|
1
1
|
function tokenize(text) {
|
|
2
|
-
//
|
|
3
|
-
//
|
|
4
|
-
// the "unique" count (a word that's shared but happens to carry a trailing
|
|
5
|
-
// comma on one page looked unique) — false precision in the shared/unique
|
|
6
|
-
// split this rule now surfaces.
|
|
2
|
+
// Lowercase, split on whitespace, strip edge punctuation so "word", "word."
|
|
3
|
+
// and "(word)" are one token.
|
|
7
4
|
return text
|
|
8
5
|
.toLowerCase()
|
|
9
6
|
.split(/\s+/)
|
|
10
7
|
.map((t) => t.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu, ""))
|
|
11
8
|
.filter(Boolean);
|
|
12
9
|
}
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
10
|
+
/**
|
|
11
|
+
* Originality as a corpus-relative DENSITY, not an absolute count. Each distinct
|
|
12
|
+
* token is weighted by normalized IDF (ln(N/df)/ln(N)) — 1 if page-exclusive, ~0
|
|
13
|
+
* if on every page — and averaged over the page's distinct tokens. A near-
|
|
14
|
+
* duplicate / boilerplate page scores low regardless of corpus size or length; a
|
|
15
|
+
* large original page stays high. Continuous, so it doesn't shuffle at the margin.
|
|
16
|
+
* Volume is spam/thin-content's job; exact twins are spam/near-duplicate's.
|
|
17
|
+
*/
|
|
18
|
+
export function uniqueValueRule(pages, thresholds) {
|
|
19
|
+
const { passBelow, errorBelow } = thresholds;
|
|
20
|
+
const N = pages.length;
|
|
21
|
+
const lnN = Math.log(N);
|
|
22
|
+
if (N <= 1 || lnN === 0)
|
|
23
|
+
return []; // can't measure rarity against a single page
|
|
24
|
+
const df = new Map();
|
|
25
|
+
const pageDistinct = pages.map((p) => new Set(tokenize(p.contentText)));
|
|
26
|
+
for (const distinct of pageDistinct) {
|
|
27
|
+
for (const t of distinct)
|
|
28
|
+
df.set(t, (df.get(t) ?? 0) + 1);
|
|
20
29
|
}
|
|
21
30
|
const findings = [];
|
|
22
|
-
pages.forEach((page,
|
|
23
|
-
const distinct =
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
});
|
|
48
|
-
}
|
|
31
|
+
pages.forEach((page, i) => {
|
|
32
|
+
const distinct = pageDistinct[i];
|
|
33
|
+
if (distinct.size === 0)
|
|
34
|
+
return; // empty page → thin-content handles it
|
|
35
|
+
let mass = 0;
|
|
36
|
+
for (const t of distinct)
|
|
37
|
+
mass += Math.log(N / (df.get(t) ?? 1)) / lnN;
|
|
38
|
+
const density = mass / distinct.size;
|
|
39
|
+
if (density >= passBelow)
|
|
40
|
+
return;
|
|
41
|
+
const severity = density < errorBelow ? "error" : "info";
|
|
42
|
+
const pct = (density * 100).toFixed(1);
|
|
43
|
+
findings.push({
|
|
44
|
+
ruleId: "content/unique-value",
|
|
45
|
+
severity,
|
|
46
|
+
message: `${page.url} has low unique-content density ${density.toFixed(3)} ` +
|
|
47
|
+
`(${pct}% of its ${distinct.size} distinct words are page-distinctive; floor ${passBelow.toFixed(2)}). ` +
|
|
48
|
+
`Most of its vocabulary also appears on other pages.`,
|
|
49
|
+
pageUrl: page.url,
|
|
50
|
+
fix: `Raise originality density: add page-specific text — a distinct lead, this ` +
|
|
51
|
+
`record's own facts, page-specific examples. Content repeated across pages on ` +
|
|
52
|
+
`the same axis (boilerplate, shared legal/spec blocks, per-axis data like a ` +
|
|
53
|
+
`role's regulations across that role's documents) is common vocabulary and ` +
|
|
54
|
+
`does NOT raise density, even when it is useful.`,
|
|
55
|
+
});
|
|
49
56
|
});
|
|
50
57
|
return findings;
|
|
51
58
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"unique-value.js","sourceRoot":"","sources":["../../../src/rules/content/unique-value.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"unique-value.js","sourceRoot":"","sources":["../../../src/rules/content/unique-value.ts"],"names":[],"mappings":"AASA,SAAS,QAAQ,CAAC,IAAY;IAC5B,4EAA4E;IAC5E,8BAA8B;IAC9B,OAAO,IAAI;SACR,WAAW,EAAE;SACb,KAAK,CAAC,KAAK,CAAC;SACZ,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,mCAAmC,EAAE,EAAE,CAAC,CAAC;SAC9D,MAAM,CAAC,OAAO,CAAC,CAAC;AACrB,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,eAAe,CAC7B,KAAmB,EACnB,UAAiC;IAEjC,MAAM,EAAE,SAAS,EAAE,UAAU,EAAE,GAAG,UAAU,CAAC;IAC7C,MAAM,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC;IACvB,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;IACxB,IAAI,CAAC,IAAI,CAAC,IAAI,GAAG,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC,CAAC,6CAA6C;IAEjF,MAAM,EAAE,GAAG,IAAI,GAAG,EAAkB,CAAC;IACrC,MAAM,YAAY,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC;IACxE,KAAK,MAAM,QAAQ,IAAI,YAAY,EAAE,CAAC;QACpC,KAAK,MAAM,CAAC,IAAI,QAAQ;YAAE,EAAE,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC5D,CAAC;IAED,MAAM,QAAQ,GAAiB,EAAE,CAAC;IAClC,KAAK,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE;QACxB,MAAM,QAAQ,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC;QACjC,IAAI,QAAQ,CAAC,IAAI,KAAK,CAAC;YAAE,OAAO,CAAC,uCAAuC;QACxE,IAAI,IAAI,GAAG,CAAC,CAAC;QACb,KAAK,MAAM,CAAC,IAAI,QAAQ;YAAE,IAAI,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC;QACvE,MAAM,OAAO,GAAG,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC;QACrC,IAAI,OAAO,IAAI,SAAS;YAAE,OAAO;QAEjC,MAAM,QAAQ,GAAG,OAAO,GAAG,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC;QACzD,MAAM,GAAG,GAAG,CAAC,OAAO,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;QACvC,QAAQ,CAAC,IAAI,CAAC;YACZ,MAAM,EAAE,sBAAsB;YAC9B,QAAQ;YACR,OAAO,EACL,GAAG,IAAI,CAAC,GAAG,mCAAmC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG;gBACnE,IAAI,GAAG,YAAY,QAAQ,CAAC,IAAI,+CAA+C,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK;gBACxG,qDAAqD;YACvD,OAAO,EAAE,IAAI,CAAC,GAAG;YACjB,GAAG,EACD,4EAA4E;gBAC5E,+EAA+E;gBAC/E,6EAA6E;gBAC7E,4EAA4E;gBAC5E,iDAAiD;SACpD,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IACH,OAAO,QAAQ,CAAC;AAClB,CAAC"}
|
|
@@ -6,8 +6,14 @@ import type { ParsedPage, RuleResult } from "../../types.js";
|
|
|
6
6
|
* Aggregates 7 per-page signal scores (originality, freshness, facts,
|
|
7
7
|
* E-E-A-T, translation, cliché-reuse, wikipedia-paraphrase) into a
|
|
8
8
|
* single 0-1 quality score. Each signal weighted equally at 1/7 ≈ 14.3%.
|
|
9
|
-
*
|
|
10
|
-
*
|
|
9
|
+
*
|
|
10
|
+
* E-E-A-T sub-score is a continuous fraction (categoriesPresent/4), not
|
|
11
|
+
* a 3-step value. Reuses countSignalCategories from eeat-signals to avoid
|
|
12
|
+
* logic drift between the two rules.
|
|
13
|
+
*
|
|
14
|
+
* Fires ONE finding per page when score < 0.5:
|
|
15
|
+
* - warning (score ∈ [0.35, 0.5)) — borderline, low confidence
|
|
16
|
+
* - error (score < 0.35) — clearly low value-add
|
|
11
17
|
*/
|
|
12
18
|
export declare function valueAddRule(pages: ParsedPage[], findings: RuleResult[]): RuleResult[];
|
|
13
19
|
//# sourceMappingURL=value-add.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"value-add.d.ts","sourceRoot":"","sources":["../../../src/rules/content/value-add.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,UAAU,EAAY,MAAM,gBAAgB,CAAC;
|
|
1
|
+
{"version":3,"file":"value-add.d.ts","sourceRoot":"","sources":["../../../src/rules/content/value-add.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,UAAU,EAAY,MAAM,gBAAgB,CAAC;AA0HvE;;;;;;;;;;;;;;;GAeG;AACH,wBAAgB,YAAY,CAAC,KAAK,EAAE,UAAU,EAAE,EAAE,QAAQ,EAAE,UAAU,EAAE,GAAG,UAAU,EAAE,CAoBtF"}
|
|
@@ -1,26 +1,6 @@
|
|
|
1
1
|
import { hasAuthoritativeCitation } from "../../algorithms/fact-extraction.js";
|
|
2
|
+
import { countSignalCategories } from "./eeat-signals.js";
|
|
2
3
|
const RULE_ID = "content/value-add";
|
|
3
|
-
const EEAT_HTML_PATTERNS = [
|
|
4
|
-
/last\s+updated/i,
|
|
5
|
-
/last\s+modified/i,
|
|
6
|
-
/reviewed\s+by/i,
|
|
7
|
-
/\bsources:/i,
|
|
8
|
-
/\breferences:/i,
|
|
9
|
-
];
|
|
10
|
-
function countEeatCategories(page) {
|
|
11
|
-
let count = 0;
|
|
12
|
-
if (page.resolvedHrefs.some((h) => /\/about\b/i.test(h)))
|
|
13
|
-
count += 1;
|
|
14
|
-
const { metaAuthor, schemaAuthor, bylineElement, relAuthorLink } = page.authorSignals;
|
|
15
|
-
if (metaAuthor !== "" || schemaAuthor || bylineElement || relAuthorLink)
|
|
16
|
-
count += 1;
|
|
17
|
-
if (page.publishedDate)
|
|
18
|
-
count += 1;
|
|
19
|
-
if (EEAT_HTML_PATTERNS.some((p) => p.test(page.html)) ||
|
|
20
|
-
hasAuthoritativeCitation(page.resolvedHrefs, page.url))
|
|
21
|
-
count += 1;
|
|
22
|
-
return count;
|
|
23
|
-
}
|
|
24
4
|
function computeSignals(page, allFindings) {
|
|
25
5
|
const pageFindings = allFindings.filter((f) => f.pageUrl === page.url);
|
|
26
6
|
// Originality: 1.0 if regurgitated-content doesn't fire, 0.0 if it does
|
|
@@ -50,18 +30,14 @@ function computeSignals(page, allFindings) {
|
|
|
50
30
|
else {
|
|
51
31
|
facts = 0.0;
|
|
52
32
|
}
|
|
53
|
-
// E-E-A-T:
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
}
|
|
62
|
-
else {
|
|
63
|
-
eeat = 0.0;
|
|
64
|
-
}
|
|
33
|
+
// E-E-A-T: continuous fraction of 4 categories present.
|
|
34
|
+
// Reuses countSignalCategories from eeat-signals (no duplicate logic).
|
|
35
|
+
// Also grants the "sources" credit for authoritative outbound citations.
|
|
36
|
+
const eeatCount = countSignalCategories(page);
|
|
37
|
+
const hasCitation = hasAuthoritativeCitation(page.resolvedHrefs, page.url);
|
|
38
|
+
// Clamp to max 4 after adding citation credit (if sources category already counted it won't double-count)
|
|
39
|
+
const effectiveEeatCount = Math.min(4, eeatCount + (hasCitation && eeatCount < 4 ? 1 : 0));
|
|
40
|
+
const eeat = effectiveEeatCount / 4;
|
|
65
41
|
// Translation: 1.0 unless translation-no-op lists this page
|
|
66
42
|
const hasTranslationNoOp = allFindings.some((f) => f.ruleId === "content/translation-no-op" &&
|
|
67
43
|
(f.pageUrl === page.url || (f.relatedUrls ?? []).includes(page.url)));
|
|
@@ -69,13 +45,7 @@ function computeSignals(page, allFindings) {
|
|
|
69
45
|
// Cliché reuse (signal 6): 1.0 if common-phrase-reuse doesn't fire, 0.0 if it does
|
|
70
46
|
const hasClicheReuse = pageFindings.some((f) => f.ruleId === "content/common-phrase-reuse");
|
|
71
47
|
const clicheReuse = hasClicheReuse ? 0.0 : 1.0;
|
|
72
|
-
// Wikipedia paraphrase (signal 7
|
|
73
|
-
// fire on this page, 0.0 if it does. The rule fires at warning/low when
|
|
74
|
-
// page text overlaps ≥40% with the bundled trigram corpus — a real signal
|
|
75
|
-
// for "content lifted from Wikipedia," orthogonal to the other 6 originality
|
|
76
|
-
// proxies. Adding it shifts each signal's weight from 1/6 (16.7%) to 1/7
|
|
77
|
-
// (14.3%) — boundary cases at score=0.30 and score=0.50 may shift by
|
|
78
|
-
// ±0.024 per signal, which is below the granularity of severity bands.
|
|
48
|
+
// Wikipedia paraphrase (signal 7): 1.0 if wikipedia-paraphrase doesn't fire, 0.0 if it does
|
|
79
49
|
const hasWikipediaParaphrase = pageFindings.some((f) => f.ruleId === "content/wikipedia-paraphrase");
|
|
80
50
|
const wikipediaParaphrase = hasWikipediaParaphrase ? 0.0 : 1.0;
|
|
81
51
|
return { originality, freshness, facts, eeat, translation, clicheReuse, wikipediaParaphrase };
|
|
@@ -92,10 +62,24 @@ function meanScore(signals) {
|
|
|
92
62
|
];
|
|
93
63
|
return values.reduce((a, b) => a + b, 0) / values.length;
|
|
94
64
|
}
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
65
|
+
/**
|
|
66
|
+
* Two-band severity for the composite score:
|
|
67
|
+
* - score in [0.35, 0.5) → "warning" (borderline: page is weak but not egregiously thin)
|
|
68
|
+
* - score < 0.35 → "error" (clearly low value-add)
|
|
69
|
+
*
|
|
70
|
+
* Confidence scales with distance from the fire threshold:
|
|
71
|
+
* - score < 0.2 → "high"
|
|
72
|
+
* - score in [0.2, 0.35) → "medium"
|
|
73
|
+
* - score in [0.35, 0.5) → "low" (borderline warning)
|
|
74
|
+
*/
|
|
75
|
+
function severityAndConfidence(score) {
|
|
76
|
+
if (score >= 0.35) {
|
|
77
|
+
return { severity: "warning", confidence: "low" };
|
|
78
|
+
}
|
|
79
|
+
if (score < 0.2) {
|
|
80
|
+
return { severity: "error", confidence: "high" };
|
|
81
|
+
}
|
|
82
|
+
return { severity: "error", confidence: "medium" };
|
|
99
83
|
}
|
|
100
84
|
function buildMessage(page, score, signals) {
|
|
101
85
|
const pct = (v) => `${(v * 100).toFixed(0)}%`;
|
|
@@ -119,8 +103,14 @@ function buildMessage(page, score, signals) {
|
|
|
119
103
|
* Aggregates 7 per-page signal scores (originality, freshness, facts,
|
|
120
104
|
* E-E-A-T, translation, cliché-reuse, wikipedia-paraphrase) into a
|
|
121
105
|
* single 0-1 quality score. Each signal weighted equally at 1/7 ≈ 14.3%.
|
|
122
|
-
*
|
|
123
|
-
*
|
|
106
|
+
*
|
|
107
|
+
* E-E-A-T sub-score is a continuous fraction (categoriesPresent/4), not
|
|
108
|
+
* a 3-step value. Reuses countSignalCategories from eeat-signals to avoid
|
|
109
|
+
* logic drift between the two rules.
|
|
110
|
+
*
|
|
111
|
+
* Fires ONE finding per page when score < 0.5:
|
|
112
|
+
* - warning (score ∈ [0.35, 0.5)) — borderline, low confidence
|
|
113
|
+
* - error (score < 0.35) — clearly low value-add
|
|
124
114
|
*/
|
|
125
115
|
export function valueAddRule(pages, findings) {
|
|
126
116
|
const results = [];
|
|
@@ -129,10 +119,11 @@ export function valueAddRule(pages, findings) {
|
|
|
129
119
|
const score = meanScore(signals);
|
|
130
120
|
if (score >= 0.5)
|
|
131
121
|
continue;
|
|
122
|
+
const { severity, confidence } = severityAndConfidence(score);
|
|
132
123
|
results.push({
|
|
133
124
|
ruleId: RULE_ID,
|
|
134
|
-
severity
|
|
135
|
-
confidence
|
|
125
|
+
severity,
|
|
126
|
+
confidence,
|
|
136
127
|
message: buildMessage(page, score, signals),
|
|
137
128
|
fix: "Add proprietary content (original analysis, primary-source data, expert commentary, original imagery) to lift the value-add score above 0.5. Score is a composite — improve any underweight signal.",
|
|
138
129
|
pageUrl: page.url,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"value-add.js","sourceRoot":"","sources":["../../../src/rules/content/value-add.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,wBAAwB,EAAE,MAAM,qCAAqC,CAAC;
|
|
1
|
+
{"version":3,"file":"value-add.js","sourceRoot":"","sources":["../../../src/rules/content/value-add.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,wBAAwB,EAAE,MAAM,qCAAqC,CAAC;AAC/E,OAAO,EAAE,qBAAqB,EAAE,MAAM,mBAAmB,CAAC;AAE1D,MAAM,OAAO,GAAG,mBAAmB,CAAC;AAYpC,SAAS,cAAc,CAAC,IAAgB,EAAE,WAAyB;IACjE,MAAM,YAAY,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,IAAI,CAAC,GAAG,CAAC,CAAC;IAEvE,wEAAwE;IACxE,MAAM,eAAe,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,8BAA8B,CAAC,CAAC;IAC9F,MAAM,WAAW,GAAG,eAAe,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;IAEhD,qDAAqD;IACrD,MAAM,gBAAgB,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,uBAAuB,CAAC,CAAC;IACxF,IAAI,SAAiB,CAAC;IACtB,IAAI,CAAC,gBAAgB,EAAE,CAAC;QACtB,SAAS,GAAG,GAAG,CAAC;IAClB,CAAC;SAAM,IAAI,gBAAgB,CAAC,QAAQ,KAAK,SAAS,EAAE,CAAC;QACnD,SAAS,GAAG,GAAG,CAAC;IAClB,CAAC;SAAM,CAAC;QACN,SAAS,GAAG,GAAG,CAAC;IAClB,CAAC;IAED,qDAAqD;IACrD,MAAM,YAAY,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,mBAAmB,CAAC,CAAC;IAChF,IAAI,KAAa,CAAC;IAClB,IAAI,CAAC,YAAY,EAAE,CAAC;QAClB,KAAK,GAAG,GAAG,CAAC;IACd,CAAC;SAAM,IAAI,YAAY,CAAC,QAAQ,KAAK,MAAM,IAAI,YAAY,CAAC,QAAQ,KAAK,SAAS,EAAE,CAAC;QACnF,KAAK,GAAG,GAAG,CAAC;IACd,CAAC;SAAM,CAAC;QACN,KAAK,GAAG,GAAG,CAAC;IACd,CAAC;IAED,wDAAwD;IACxD,uEAAuE;IACvE,yEAAyE;IACzE,MAAM,SAAS,GAAG,qBAAqB,CAAC,IAAI,CAAC,CAAC;IAC9C,MAAM,WAAW,GAAG,wBAAwB,CAAC,IAAI,CAAC,aAAa,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC;IAC3E,0GAA0G;IAC1G,MAAM,kBAAkB,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,SAAS,GAAG,CAAC,WAAW,IAAI,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC3F,MAAM,IAAI,GAAG,kBAAkB,GAAG,CAAC,CAAC;IAEpC,4DAA4D;IAC5D,MAAM,kBAAkB,GAAG,WAAW,CAAC,IAAI,CACzC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,2BAA2B;QAC7C,CAAC,CAAC,CAAC,OAAO,KAAK,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CACvE,CAAC;IACF,MAAM,WAAW,GAAG,kBAAkB,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;IAEnD,mFAAmF;IACnF,MAAM,cAAc,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,6BAA6B,CAAC,CAAC;IAC5F,MAAM,WAAW,GAAG,cAAc,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;IAE/C,4FAA4F;IAC5F,MAAM,sBAAsB,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,8BAA8B,CAAC,CAAC;IACrG,MAAM,mBAAmB,GAAG,sBAAsB,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;IAE/D,OAAO,EAAE,WAAW,EAAE,SAAS,EAAE,KAAK,EAAE,IAAI,EAAE,WAAW,EAAE,WAAW,EAAE,mBAAmB,EAAE,CAAC;AAChG,CAAC;AAED,SAAS,SAAS,CAAC,OAAgB;IACjC,MAAM,MAAM,GAAG;QACb,OAAO,CAAC,WAAW;QACnB,OAAO,CAAC,SAAS;QACjB,OAAO,CAAC,KAAK;QACb,OAAO,CAAC,IAAI;QACZ,OAAO,CAAC,WAAW;QACnB,OAAO,CAAC,WAAW;QACnB,OAAO,CAAC,mBAAmB;KAC5B,CAAC;IACF,OAAO,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;AAC3D,CAAC;AAED;;;;;;;;;GASG;AACH,SAAS,qBAAqB,CAAC,KAAa;IAC1C,IAAI,KAAK,IAAI,IAAI,EAAE,CAAC;QAClB,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,UAAU,EAAE,KAAK,EAAE,CAAC;IACpD,CAAC;IACD,IAAI,KAAK,GAAG,GAAG,EAAE,CAAC;QAChB,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,UAAU,EAAE,MAAM,EAAE,CAAC;IACnD,CAAC;IACD,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,UAAU,EAAE,QAAQ,EAAE,CAAC;AACrD,CAAC;AAED,SAAS,YAAY,CAAC,IAAgB,EAAE,KAAa,EAAE,OAAgB;IACrE,MAAM,GAAG,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;IACtD,MAAM,YAAY,GAAa,EAAE,CAAC;IAClC,MAAM,OAAO,GAAG,MAAM,CAAC,OAAO,CAAC,OAAO,CAA8B,CAAC;IACrE,KAAK,MAAM,CAAC,GAAG,EAAE,GAAG,CAAC,IAAI,OAAO,EAAE,CAAC;QACjC,IAAI,GAAG,GAAG,GAAG;YAAE,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACxC,CAAC;IACD,MAAM,UAAU,GAAG,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,kBAAkB,CAAC;IAC1F,OAAO,CACL,GAAG,IAAI,CAAC,GAAG,qBAAqB,GAAG,CAAC,KAAK,CAAC,kBAAkB;QAC5D,iBAAiB,GAAG,CAAC,OAAO,CAAC,WAAW,CAAC,gBAAgB,GAAG,CAAC,OAAO,CAAC,SAAS,CAAC,IAAI;QACnF,UAAU,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,cAAc,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,kBAAkB,GAAG,CAAC,OAAO,CAAC,WAAW,CAAC,IAAI;QACzG,iBAAiB,GAAG,CAAC,OAAO,CAAC,WAAW,CAAC,2BAA2B,GAAG,CAAC,OAAO,CAAC,mBAAmB,CAAC,KAAK;QACzG,kBAAkB,UAAU,iEAAiE,CAC9F,CAAC;AACJ,CAAC;AAED;;;;;;;;;;;;;;;GAeG;AACH,MAAM,UAAU,YAAY,CAAC,KAAmB,EAAE,QAAsB;IACtE,MAAM,OAAO,GAAiB,EAAE,CAAC;IAEjC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,OAAO,GAAG,cAAc,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;QAC/C,MAAM,KAAK,GAAG,SAAS,CAAC,OAAO,CAAC,CAAC;QACjC,IAAI,KAAK,IAAI,GAAG;YAAE,SAAS;QAE3B,MAAM,EAAE,QAAQ,EAAE,UAAU,EAAE,GAAG,qBAAqB,CAAC,KAAK,CAAC,CAAC;QAC9D,OAAO,CAAC,IAAI,CAAC;YACX,MAAM,EAAE,OAAO;YACf,QAAQ;YACR,UAAU;YACV,OAAO,EAAE,YAAY,CAAC,IAAI,EAAE,KAAK,EAAE,OAAO,CAAC;YAC3C,GAAG,EAAE,qMAAqM;YAC1M,OAAO,EAAE,IAAI,CAAC,GAAG;SAClB,CAAC,CAAC;IACL,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC"}
|
|
@@ -1,15 +1,20 @@
|
|
|
1
1
|
import type { ParsedPage, RuleResult } from "../../types.js";
|
|
2
2
|
/**
|
|
3
|
-
* content/wikipedia-paraphrase —
|
|
3
|
+
* content/wikipedia-paraphrase — advisory originality signal (v0.5.14+).
|
|
4
4
|
*
|
|
5
|
-
* Detects pages whose contentText has high trigram overlap with the
|
|
6
|
-
* Wikipedia reference corpus.
|
|
7
|
-
*
|
|
5
|
+
* Detects pages whose contentText has unusually high trigram overlap with the
|
|
6
|
+
* bundled Wikipedia reference corpus. This is a weak, advisory signal only:
|
|
7
|
+
* trigram overlap cannot distinguish actual paraphrase from legitimate topical
|
|
8
|
+
* proximity (e.g. a legal-template page naturally shares many encyclopedic
|
|
9
|
+
* trigrams with Wikipedia articles on the same topic).
|
|
8
10
|
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
+
* Two guards reduce false positives:
|
|
12
|
+
* 1. Minimum-length guard: pages below MIN_TRIGRAM_COUNT trigrams (~200
|
|
13
|
+
* words) are skipped entirely — bloom noise alone dominates on short pages.
|
|
14
|
+
* 2. Raised threshold: THRESHOLD = 0.55, well above the bloom noise floor
|
|
15
|
+
* (~5%) and typical topical-proximity baseline.
|
|
11
16
|
*
|
|
12
|
-
* Fires: one warning/low-confidence finding per qualifying page (rate >= 0.
|
|
17
|
+
* Fires: one warning/low-confidence finding per qualifying page (rate >= 0.55).
|
|
13
18
|
*/
|
|
14
19
|
export declare function wikipediaParaphraseRule(pages: ParsedPage[]): RuleResult[];
|
|
15
20
|
//# sourceMappingURL=wikipedia-paraphrase.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"wikipedia-paraphrase.d.ts","sourceRoot":"","sources":["../../../src/rules/content/wikipedia-paraphrase.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;
|
|
1
|
+
{"version":3,"file":"wikipedia-paraphrase.d.ts","sourceRoot":"","sources":["../../../src/rules/content/wikipedia-paraphrase.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAkC7D;;;;;;;;;;;;;;;;GAgBG;AACH,wBAAgB,uBAAuB,CAAC,KAAK,EAAE,UAAU,EAAE,GAAG,UAAU,EAAE,CAgCzE"}
|
|
@@ -1,23 +1,61 @@
|
|
|
1
1
|
import { wikipediaParaphraseRate } from "../../algorithms/wikipedia-paraphrase.js";
|
|
2
2
|
const RULE_ID = "content/wikipedia-paraphrase";
|
|
3
|
-
const THRESHOLD = 0.4;
|
|
4
3
|
/**
|
|
5
|
-
*
|
|
4
|
+
* ponytail: MIN_TRIGRAM_COUNT = 200
|
|
6
5
|
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
6
|
+
* The bloom filter has a ~5% per-query false-positive rate. On a page with
|
|
7
|
+
* N trigrams the expected bloom-noise hit count is 0.05 * N. For a short page
|
|
8
|
+
* (~48 trigrams) that alone produces ~2.4 expected FP hits; with a threshold
|
|
9
|
+
* of 40% (19/48) the noise alone can exceed the threshold on short pages.
|
|
10
10
|
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
11
|
+
* Setting a floor of 200 trigrams (~202 words) means bloom noise contributes
|
|
12
|
+
* at most 10 / 200 = 5% of trigrams, far below the raised THRESHOLD, so noise
|
|
13
|
+
* cannot trigger the rule on its own.
|
|
14
|
+
*/
|
|
15
|
+
const MIN_TRIGRAM_COUNT = 200;
|
|
16
|
+
/**
|
|
17
|
+
* ponytail: THRESHOLD = 0.55
|
|
18
|
+
*
|
|
19
|
+
* Raised from 0.40 to 0.55 to account for the bloom filter's ~5% per-query
|
|
20
|
+
* FP rate and the "topic overlap" effect: legal/medical/geography pSEO pages
|
|
21
|
+
* share many encyclopedic trigrams ("the united states", "in the state of")
|
|
22
|
+
* purely through topical proximity, not paraphrase. A 55% overlap is
|
|
23
|
+
* substantially above both the noise floor (~5%) and the expected topic-
|
|
24
|
+
* overlap baseline, making the signal meaningfully indicative of genuine
|
|
25
|
+
* encyclopedic reuse. At this level the rule remains advisory (confidence:
|
|
26
|
+
* "low") because trigram overlap cannot distinguish paraphrase from topic
|
|
27
|
+
* proximity — it is a weak signal, not a verdict.
|
|
28
|
+
*/
|
|
29
|
+
const THRESHOLD = 0.55;
|
|
30
|
+
/**
|
|
31
|
+
* content/wikipedia-paraphrase — advisory originality signal (v0.5.14+).
|
|
32
|
+
*
|
|
33
|
+
* Detects pages whose contentText has unusually high trigram overlap with the
|
|
34
|
+
* bundled Wikipedia reference corpus. This is a weak, advisory signal only:
|
|
35
|
+
* trigram overlap cannot distinguish actual paraphrase from legitimate topical
|
|
36
|
+
* proximity (e.g. a legal-template page naturally shares many encyclopedic
|
|
37
|
+
* trigrams with Wikipedia articles on the same topic).
|
|
13
38
|
*
|
|
14
|
-
*
|
|
39
|
+
* Two guards reduce false positives:
|
|
40
|
+
* 1. Minimum-length guard: pages below MIN_TRIGRAM_COUNT trigrams (~200
|
|
41
|
+
* words) are skipped entirely — bloom noise alone dominates on short pages.
|
|
42
|
+
* 2. Raised threshold: THRESHOLD = 0.55, well above the bloom noise floor
|
|
43
|
+
* (~5%) and typical topical-proximity baseline.
|
|
44
|
+
*
|
|
45
|
+
* Fires: one warning/low-confidence finding per qualifying page (rate >= 0.55).
|
|
15
46
|
*/
|
|
16
47
|
export function wikipediaParaphraseRule(pages) {
|
|
17
48
|
const findings = [];
|
|
18
49
|
for (const page of pages) {
|
|
19
50
|
if (!page.contentText || page.contentText.trim().length === 0)
|
|
20
51
|
continue;
|
|
52
|
+
// Estimate trigram count without re-implementing extractTrigrams: count
|
|
53
|
+
// whitespace-separated tokens then subtract 2 (trigrams = tokens - 2).
|
|
54
|
+
// This is a cheap proxy; the algorithm file does the accurate extraction.
|
|
55
|
+
const tokenCount = page.contentText.trim().split(/\s+/).length;
|
|
56
|
+
const estimatedTrigrams = Math.max(0, tokenCount - 2);
|
|
57
|
+
if (estimatedTrigrams < MIN_TRIGRAM_COUNT)
|
|
58
|
+
continue;
|
|
21
59
|
const rate = wikipediaParaphraseRate(page.contentText);
|
|
22
60
|
if (rate < THRESHOLD)
|
|
23
61
|
continue;
|
|
@@ -27,11 +65,12 @@ export function wikipediaParaphraseRule(pages) {
|
|
|
27
65
|
severity: "warning",
|
|
28
66
|
confidence: "low",
|
|
29
67
|
pageUrl: page.url,
|
|
30
|
-
message: `${page.url}
|
|
31
|
-
`reference corpus.
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
"
|
|
68
|
+
message: `${page.url} has high trigram overlap (${pct}%) with the bundled Wikipedia ` +
|
|
69
|
+
`reference corpus. This is an advisory signal — trigram overlap can reflect ` +
|
|
70
|
+
`topical proximity as well as copied content and cannot distinguish the two.`,
|
|
71
|
+
fix: "Review for borrowed encyclopedic phrasing and replace with original analysis " +
|
|
72
|
+
"specific to this page's subject. Even if attributed, high paraphrase rates " +
|
|
73
|
+
"correlate with low value-add by SpamBrain's helpful-content metric.",
|
|
35
74
|
});
|
|
36
75
|
}
|
|
37
76
|
return findings;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"wikipedia-paraphrase.js","sourceRoot":"","sources":["../../../src/rules/content/wikipedia-paraphrase.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,uBAAuB,EAAE,MAAM,0CAA0C,CAAC;AAEnF,MAAM,OAAO,GAAG,8BAA8B,CAAC;
|
|
1
|
+
{"version":3,"file":"wikipedia-paraphrase.js","sourceRoot":"","sources":["../../../src/rules/content/wikipedia-paraphrase.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,uBAAuB,EAAE,MAAM,0CAA0C,CAAC;AAEnF,MAAM,OAAO,GAAG,8BAA8B,CAAC;AAE/C;;;;;;;;;;;GAWG;AACH,MAAM,iBAAiB,GAAG,GAAG,CAAC;AAE9B;;;;;;;;;;;;GAYG;AACH,MAAM,SAAS,GAAG,IAAI,CAAC;AAEvB;;;;;;;;;;;;;;;;GAgBG;AACH,MAAM,UAAU,uBAAuB,CAAC,KAAmB;IACzD,MAAM,QAAQ,GAAiB,EAAE,CAAC;IAClC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,CAAC,IAAI,CAAC,WAAW,IAAI,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAExE,wEAAwE;QACxE,uEAAuE;QACvE,0EAA0E;QAC1E,MAAM,UAAU,GAAG,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC;QAC/D,MAAM,iBAAiB,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,UAAU,GAAG,CAAC,CAAC,CAAC;QACtD,IAAI,iBAAiB,GAAG,iBAAiB;YAAE,SAAS;QAEpD,MAAM,IAAI,GAAG,uBAAuB,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACvD,IAAI,IAAI,GAAG,SAAS;YAAE,SAAS;QAE/B,MAAM,GAAG,GAAG,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;QACpC,QAAQ,CAAC,IAAI,CAAC;YACZ,MAAM,EAAE,OAAO;YACf,QAAQ,EAAE,SAAS;YACnB,UAAU,EAAE,KAAK;YACjB,OAAO,EAAE,IAAI,CAAC,GAAG;YACjB,OAAO,EACL,GAAG,IAAI,CAAC,GAAG,8BAA8B,GAAG,gCAAgC;gBAC5E,6EAA6E;gBAC7E,6EAA6E;YAC/E,GAAG,EACD,+EAA+E;gBAC/E,6EAA6E;gBAC7E,qEAAqE;SACxE,CAAC,CAAC;IACL,CAAC;IACD,OAAO,QAAQ,CAAC;AAClB,CAAC"}
|
|
@@ -3,5 +3,11 @@ import type { ParsedPage, RuleResult } from "../../types.js";
|
|
|
3
3
|
* Flags clusters (same parent directory) with 2+ pages that are siloed: no outbound
|
|
4
4
|
* internal crawl link to another cluster and no inbound from another cluster.
|
|
5
5
|
*/
|
|
6
|
-
export declare function clusterConnectivityRule(pages: ParsedPage[], knownUrls: Set<string
|
|
6
|
+
export declare function clusterConnectivityRule(pages: ParsedPage[], knownUrls: Set<string>,
|
|
7
|
+
/**
|
|
8
|
+
* 2026-06-16 calibration FP fix: cross-cluster links routinely target pages
|
|
9
|
+
* that were not fetched on a sampled crawl, so a "siloed cluster" verdict is
|
|
10
|
+
* unreliable. Only run on a full crawl.
|
|
11
|
+
*/
|
|
12
|
+
sampled?: boolean): RuleResult[];
|
|
7
13
|
//# sourceMappingURL=cluster-connectivity.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cluster-connectivity.d.ts","sourceRoot":"","sources":["../../../src/rules/links/cluster-connectivity.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAyB7D;;;GAGG;AACH,wBAAgB,uBAAuB,CACrC,KAAK,EAAE,UAAU,EAAE,EACnB,SAAS,EAAE,GAAG,CAAC,MAAM,CAAC,
|
|
1
|
+
{"version":3,"file":"cluster-connectivity.d.ts","sourceRoot":"","sources":["../../../src/rules/links/cluster-connectivity.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAyB7D;;;GAGG;AACH,wBAAgB,uBAAuB,CACrC,KAAK,EAAE,UAAU,EAAE,EACnB,SAAS,EAAE,GAAG,CAAC,MAAM,CAAC;AACtB;;;;GAIG;AACH,OAAO,UAAQ,GACd,UAAU,EAAE,CA0Dd"}
|
|
@@ -19,8 +19,14 @@ function hasCrossClusterInbound(clusterDir, urlsInCluster, pages, knownUrls) {
|
|
|
19
19
|
* Flags clusters (same parent directory) with 2+ pages that are siloed: no outbound
|
|
20
20
|
* internal crawl link to another cluster and no inbound from another cluster.
|
|
21
21
|
*/
|
|
22
|
-
export function clusterConnectivityRule(pages, knownUrls
|
|
23
|
-
|
|
22
|
+
export function clusterConnectivityRule(pages, knownUrls,
|
|
23
|
+
/**
|
|
24
|
+
* 2026-06-16 calibration FP fix: cross-cluster links routinely target pages
|
|
25
|
+
* that were not fetched on a sampled crawl, so a "siloed cluster" verdict is
|
|
26
|
+
* unreliable. Only run on a full crawl.
|
|
27
|
+
*/
|
|
28
|
+
sampled = false) {
|
|
29
|
+
if (sampled || pages.length < 2) {
|
|
24
30
|
return [];
|
|
25
31
|
}
|
|
26
32
|
const clusterPages = new Map();
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cluster-connectivity.js","sourceRoot":"","sources":["../../../src/rules/links/cluster-connectivity.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAC;AAEpD,SAAS,sBAAsB,CAC7B,UAAkB,EAClB,aAA0B,EAC1B,KAAmB,EACnB,SAAsB;IAEtB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,aAAa,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;YAChC,SAAS;QACX,CAAC;QACD,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;YACtC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;gBACzB,SAAS;YACX,CAAC;YACD,IAAI,aAAa,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC5B,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC;IACH,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,uBAAuB,CACrC,KAAmB,EACnB,SAAsB;
|
|
1
|
+
{"version":3,"file":"cluster-connectivity.js","sourceRoot":"","sources":["../../../src/rules/links/cluster-connectivity.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAC;AAEpD,SAAS,sBAAsB,CAC7B,UAAkB,EAClB,aAA0B,EAC1B,KAAmB,EACnB,SAAsB;IAEtB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,aAAa,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;YAChC,SAAS;QACX,CAAC;QACD,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;YACtC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;gBACzB,SAAS;YACX,CAAC;YACD,IAAI,aAAa,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC5B,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC;IACH,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,uBAAuB,CACrC,KAAmB,EACnB,SAAsB;AACtB;;;;GAIG;AACH,OAAO,GAAG,KAAK;IAEf,IAAI,OAAO,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAChC,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,MAAM,YAAY,GAAG,IAAI,GAAG,EAAuB,CAAC;IACpD,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,MAAM,GAAG,GAAG,gBAAgB,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;QACpC,MAAM,GAAG,GAAG,YAAY,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,IAAI,GAAG,EAAU,CAAC;QACvD,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;QACf,YAAY,CAAC,GAAG,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;IAC7B,CAAC;IAED,IAAI,YAAY,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC;QAC1B,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,MAAM,QAAQ,GAAiB,EAAE,CAAC;IAElC,KAAK,MAAM,CAAC,UAAU,EAAE,IAAI,CAAC,IAAI,YAAY,CAAC,OAAO,EAAE,EAAE,CAAC;QACxD,IAAI,IAAI,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC;YAClB,SAAS;QACX,CAAC;QAED,IAAI,uBAAuB,GAAG,KAAK,CAAC;QACpC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;gBACxB,SAAS;YACX,CAAC;YACD,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;gBACtC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;oBACzB,SAAS;gBACX,CAAC;gBACD,MAAM,aAAa,GAAG,gBAAgB,CAAC,IAAI,CAAC,CAAC;gBAC7C,IAAI,aAAa,KAAK,UAAU,EAAE,CAAC;oBACjC,uBAAuB,GAAG,IAAI,CAAC;oBAC/B,MAAM;gBACR,CAAC;YACH,CAAC;YACD,IAAI,uBAAuB,EAAE,CAAC;gBAC5B,MAAM;YACR,CAAC;QACH,CAAC;QAED,MAAM,UAAU,GAAG,sBAAsB,CAAC,UAAU,EAAE,IAAI,EAAE,KAAK,EAAE,SAAS,CAAC,CAAC;QAE9E,IAAI,CAAC,uBAAuB,IAAI,CAAC,UAAU,EAAE,CAAC;YAC5C,QAAQ,CAAC,IAAI,CAAC;gBACZ,MAAM,EAAE,4BAA4B;gBACpC,QAAQ,EAAE,SAAS;gBACnB,OAAO,EAAE,WAAW,UAAU,KAAK,IAAI,CAAC,IAAI,uDAAuD;gBACnG,WAAW,EAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE;gBACpC,GAAG,EAAE,kGAAkG;aACxG,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC"}
|
|
@@ -1,3 +1,10 @@
|
|
|
1
1
|
import type { ParsedPage, RuleResult } from "../../types.js";
|
|
2
|
-
export declare function orphanPagesRule(pages: ParsedPage[], inboundLinks: Map<string, number>, rootUrl?: string
|
|
2
|
+
export declare function orphanPagesRule(pages: ParsedPage[], inboundLinks: Map<string, number>, rootUrl?: string,
|
|
3
|
+
/**
|
|
4
|
+
* 2026-06-16 calibration FP fix: on a sampled crawl the page that links to a
|
|
5
|
+
* given URL is often simply not in the fetched subset, so "0 inbound in this
|
|
6
|
+
* crawl" is not evidence of a real orphan. Orphan detection is only reliable
|
|
7
|
+
* on a full crawl — skip it when sampled rather than flag healthy pages.
|
|
8
|
+
*/
|
|
9
|
+
sampled?: boolean): RuleResult[];
|
|
3
10
|
//# sourceMappingURL=orphan-pages.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"orphan-pages.d.ts","sourceRoot":"","sources":["../../../src/rules/links/orphan-pages.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAE7D,wBAAgB,eAAe,CAC7B,KAAK,EAAE,UAAU,EAAE,EACnB,YAAY,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,EACjC,OAAO,CAAC,EAAE,MAAM,
|
|
1
|
+
{"version":3,"file":"orphan-pages.d.ts","sourceRoot":"","sources":["../../../src/rules/links/orphan-pages.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAE7D,wBAAgB,eAAe,CAC7B,KAAK,EAAE,UAAU,EAAE,EACnB,YAAY,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,EACjC,OAAO,CAAC,EAAE,MAAM;AAChB;;;;;GAKG;AACH,OAAO,UAAQ,GACd,UAAU,EAAE,CAqBd"}
|
|
@@ -1,4 +1,13 @@
|
|
|
1
|
-
export function orphanPagesRule(pages, inboundLinks, rootUrl
|
|
1
|
+
export function orphanPagesRule(pages, inboundLinks, rootUrl,
|
|
2
|
+
/**
|
|
3
|
+
* 2026-06-16 calibration FP fix: on a sampled crawl the page that links to a
|
|
4
|
+
* given URL is often simply not in the fetched subset, so "0 inbound in this
|
|
5
|
+
* crawl" is not evidence of a real orphan. Orphan detection is only reliable
|
|
6
|
+
* on a full crawl — skip it when sampled rather than flag healthy pages.
|
|
7
|
+
*/
|
|
8
|
+
sampled = false) {
|
|
9
|
+
if (sampled)
|
|
10
|
+
return [];
|
|
2
11
|
const findings = [];
|
|
3
12
|
for (const page of pages) {
|
|
4
13
|
if (rootUrl && page.url === rootUrl) {
|