@debriefer/sources 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +59 -0
- package/dist/__tests__/archives/chronicling-america.test.d.ts +8 -0
- package/dist/__tests__/archives/chronicling-america.test.d.ts.map +1 -0
- package/dist/__tests__/archives/chronicling-america.test.js +151 -0
- package/dist/__tests__/archives/chronicling-america.test.js.map +1 -0
- package/dist/__tests__/archives/europeana.test.d.ts +8 -0
- package/dist/__tests__/archives/europeana.test.d.ts.map +1 -0
- package/dist/__tests__/archives/europeana.test.js +200 -0
- package/dist/__tests__/archives/europeana.test.js.map +1 -0
- package/dist/__tests__/archives/internet-archive.test.d.ts +8 -0
- package/dist/__tests__/archives/internet-archive.test.d.ts.map +1 -0
- package/dist/__tests__/archives/internet-archive.test.js +189 -0
- package/dist/__tests__/archives/internet-archive.test.js.map +1 -0
- package/dist/__tests__/archives/trove.test.d.ts +8 -0
- package/dist/__tests__/archives/trove.test.d.ts.map +1 -0
- package/dist/__tests__/archives/trove.test.js +202 -0
- package/dist/__tests__/archives/trove.test.js.map +1 -0
- package/dist/__tests__/books/google-books.test.d.ts +8 -0
- package/dist/__tests__/books/google-books.test.d.ts.map +1 -0
- package/dist/__tests__/books/google-books.test.js +221 -0
- package/dist/__tests__/books/google-books.test.js.map +1 -0
- package/dist/__tests__/books/open-library.test.d.ts +8 -0
- package/dist/__tests__/books/open-library.test.d.ts.map +1 -0
- package/dist/__tests__/books/open-library.test.js +159 -0
- package/dist/__tests__/books/open-library.test.js.map +1 -0
- package/dist/__tests__/news/guardian.test.d.ts +9 -0
- package/dist/__tests__/news/guardian.test.d.ts.map +1 -0
- package/dist/__tests__/news/guardian.test.js +224 -0
- package/dist/__tests__/news/guardian.test.js.map +1 -0
- package/dist/__tests__/news/nytimes.test.d.ts +9 -0
- package/dist/__tests__/news/nytimes.test.d.ts.map +1 -0
- package/dist/__tests__/news/nytimes.test.js +271 -0
- package/dist/__tests__/news/nytimes.test.js.map +1 -0
- package/dist/__tests__/news/site-search-source.test.d.ts +9 -0
- package/dist/__tests__/news/site-search-source.test.d.ts.map +1 -0
- package/dist/__tests__/news/site-search-source.test.js +342 -0
- package/dist/__tests__/news/site-search-source.test.js.map +1 -0
- package/dist/__tests__/obituary/find-a-grave.test.d.ts +8 -0
- package/dist/__tests__/obituary/find-a-grave.test.d.ts.map +1 -0
- package/dist/__tests__/obituary/find-a-grave.test.js +238 -0
- package/dist/__tests__/obituary/find-a-grave.test.js.map +1 -0
- package/dist/__tests__/shared/duckduckgo-search.test.d.ts +9 -0
- package/dist/__tests__/shared/duckduckgo-search.test.d.ts.map +1 -0
- package/dist/__tests__/shared/duckduckgo-search.test.js +218 -0
- package/dist/__tests__/shared/duckduckgo-search.test.js.map +1 -0
- package/dist/__tests__/shared/fetch-page.test.d.ts +9 -0
- package/dist/__tests__/shared/fetch-page.test.d.ts.map +1 -0
- package/dist/__tests__/shared/fetch-page.test.js +281 -0
- package/dist/__tests__/shared/fetch-page.test.js.map +1 -0
- package/dist/__tests__/shared/html-utils.test.d.ts +2 -0
- package/dist/__tests__/shared/html-utils.test.d.ts.map +1 -0
- package/dist/__tests__/shared/html-utils.test.js +169 -0
- package/dist/__tests__/shared/html-utils.test.js.map +1 -0
- package/dist/__tests__/shared/readability-extract.test.d.ts +2 -0
- package/dist/__tests__/shared/readability-extract.test.d.ts.map +1 -0
- package/dist/__tests__/shared/readability-extract.test.js +107 -0
- package/dist/__tests__/shared/readability-extract.test.js.map +1 -0
- package/dist/__tests__/shared/sanitize-text.test.d.ts +2 -0
- package/dist/__tests__/shared/sanitize-text.test.d.ts.map +1 -0
- package/dist/__tests__/shared/sanitize-text.test.js +77 -0
- package/dist/__tests__/shared/sanitize-text.test.js.map +1 -0
- package/dist/__tests__/shared/search-utils.test.d.ts +2 -0
- package/dist/__tests__/shared/search-utils.test.d.ts.map +1 -0
- package/dist/__tests__/shared/search-utils.test.js +26 -0
- package/dist/__tests__/shared/search-utils.test.js.map +1 -0
- package/dist/__tests__/structured/wikidata.test.d.ts +9 -0
- package/dist/__tests__/structured/wikidata.test.d.ts.map +1 -0
- package/dist/__tests__/structured/wikidata.test.js +509 -0
- package/dist/__tests__/structured/wikidata.test.js.map +1 -0
- package/dist/__tests__/structured/wikipedia.test.d.ts +9 -0
- package/dist/__tests__/structured/wikipedia.test.d.ts.map +1 -0
- package/dist/__tests__/structured/wikipedia.test.js +643 -0
- package/dist/__tests__/structured/wikipedia.test.js.map +1 -0
- package/dist/__tests__/web-search/base.test.d.ts +9 -0
- package/dist/__tests__/web-search/base.test.d.ts.map +1 -0
- package/dist/__tests__/web-search/base.test.js +622 -0
- package/dist/__tests__/web-search/base.test.js.map +1 -0
- package/dist/__tests__/web-search/bing.test.d.ts +10 -0
- package/dist/__tests__/web-search/bing.test.d.ts.map +1 -0
- package/dist/__tests__/web-search/bing.test.js +277 -0
- package/dist/__tests__/web-search/bing.test.js.map +1 -0
- package/dist/__tests__/web-search/brave.test.d.ts +10 -0
- package/dist/__tests__/web-search/brave.test.d.ts.map +1 -0
- package/dist/__tests__/web-search/brave.test.js +264 -0
- package/dist/__tests__/web-search/brave.test.js.map +1 -0
- package/dist/__tests__/web-search/duckduckgo.test.d.ts +10 -0
- package/dist/__tests__/web-search/duckduckgo.test.d.ts.map +1 -0
- package/dist/__tests__/web-search/duckduckgo.test.js +107 -0
- package/dist/__tests__/web-search/duckduckgo.test.js.map +1 -0
- package/dist/__tests__/web-search/google.test.d.ts +9 -0
- package/dist/__tests__/web-search/google.test.d.ts.map +1 -0
- package/dist/__tests__/web-search/google.test.js +189 -0
- package/dist/__tests__/web-search/google.test.js.map +1 -0
- package/dist/archives/chronicling-america.d.ts +33 -0
- package/dist/archives/chronicling-america.d.ts.map +1 -0
- package/dist/archives/chronicling-america.js +85 -0
- package/dist/archives/chronicling-america.js.map +1 -0
- package/dist/archives/europeana.d.ts +37 -0
- package/dist/archives/europeana.d.ts.map +1 -0
- package/dist/archives/europeana.js +92 -0
- package/dist/archives/europeana.js.map +1 -0
- package/dist/archives/internet-archive.d.ts +32 -0
- package/dist/archives/internet-archive.d.ts.map +1 -0
- package/dist/archives/internet-archive.js +90 -0
- package/dist/archives/internet-archive.js.map +1 -0
- package/dist/archives/trove.d.ts +37 -0
- package/dist/archives/trove.d.ts.map +1 -0
- package/dist/archives/trove.js +97 -0
- package/dist/archives/trove.js.map +1 -0
- package/dist/books/google-books.d.ts +48 -0
- package/dist/books/google-books.d.ts.map +1 -0
- package/dist/books/google-books.js +111 -0
- package/dist/books/google-books.js.map +1 -0
- package/dist/books/open-library.d.ts +44 -0
- package/dist/books/open-library.d.ts.map +1 -0
- package/dist/books/open-library.js +103 -0
- package/dist/books/open-library.js.map +1 -0
- package/dist/index.d.ts +45 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +35 -0
- package/dist/index.js.map +1 -0
- package/dist/news/guardian.d.ts +51 -0
- package/dist/news/guardian.d.ts.map +1 -0
- package/dist/news/guardian.js +131 -0
- package/dist/news/guardian.js.map +1 -0
- package/dist/news/nytimes.d.ts +27 -0
- package/dist/news/nytimes.d.ts.map +1 -0
- package/dist/news/nytimes.js +104 -0
- package/dist/news/nytimes.js.map +1 -0
- package/dist/news/site-search-source.d.ts +89 -0
- package/dist/news/site-search-source.d.ts.map +1 -0
- package/dist/news/site-search-source.js +182 -0
- package/dist/news/site-search-source.js.map +1 -0
- package/dist/news/sources.d.ts +52 -0
- package/dist/news/sources.d.ts.map +1 -0
- package/dist/news/sources.js +276 -0
- package/dist/news/sources.js.map +1 -0
- package/dist/obituary/find-a-grave.d.ts +43 -0
- package/dist/obituary/find-a-grave.d.ts.map +1 -0
- package/dist/obituary/find-a-grave.js +173 -0
- package/dist/obituary/find-a-grave.js.map +1 -0
- package/dist/shared/duckduckgo-search.d.ts +86 -0
- package/dist/shared/duckduckgo-search.d.ts.map +1 -0
- package/dist/shared/duckduckgo-search.js +218 -0
- package/dist/shared/duckduckgo-search.js.map +1 -0
- package/dist/shared/fetch-page.d.ts +50 -0
- package/dist/shared/fetch-page.d.ts.map +1 -0
- package/dist/shared/fetch-page.js +212 -0
- package/dist/shared/fetch-page.js.map +1 -0
- package/dist/shared/html-utils.d.ts +99 -0
- package/dist/shared/html-utils.d.ts.map +1 -0
- package/dist/shared/html-utils.js +246 -0
- package/dist/shared/html-utils.js.map +1 -0
- package/dist/shared/readability-extract.d.ts +33 -0
- package/dist/shared/readability-extract.d.ts.map +1 -0
- package/dist/shared/readability-extract.js +45 -0
- package/dist/shared/readability-extract.js.map +1 -0
- package/dist/shared/sanitize-text.d.ts +24 -0
- package/dist/shared/sanitize-text.d.ts.map +1 -0
- package/dist/shared/sanitize-text.js +49 -0
- package/dist/shared/sanitize-text.js.map +1 -0
- package/dist/shared/search-utils.d.ts +18 -0
- package/dist/shared/search-utils.d.ts.map +1 -0
- package/dist/shared/search-utils.js +20 -0
- package/dist/shared/search-utils.js.map +1 -0
- package/dist/structured/wikidata.d.ts +128 -0
- package/dist/structured/wikidata.d.ts.map +1 -0
- package/dist/structured/wikidata.js +361 -0
- package/dist/structured/wikidata.js.map +1 -0
- package/dist/structured/wikipedia.d.ts +184 -0
- package/dist/structured/wikipedia.d.ts.map +1 -0
- package/dist/structured/wikipedia.js +275 -0
- package/dist/structured/wikipedia.js.map +1 -0
- package/dist/web-search/base.d.ts +128 -0
- package/dist/web-search/base.d.ts.map +1 -0
- package/dist/web-search/base.js +251 -0
- package/dist/web-search/base.js.map +1 -0
- package/dist/web-search/bing.d.ts +21 -0
- package/dist/web-search/bing.d.ts.map +1 -0
- package/dist/web-search/bing.js +53 -0
- package/dist/web-search/bing.js.map +1 -0
- package/dist/web-search/brave.d.ts +21 -0
- package/dist/web-search/brave.d.ts.map +1 -0
- package/dist/web-search/brave.js +56 -0
- package/dist/web-search/brave.js.map +1 -0
- package/dist/web-search/duckduckgo.d.ts +15 -0
- package/dist/web-search/duckduckgo.d.ts.map +1 -0
- package/dist/web-search/duckduckgo.js +21 -0
- package/dist/web-search/duckduckgo.js.map +1 -0
- package/dist/web-search/google.d.ts +24 -0
- package/dist/web-search/google.d.ts.map +1 -0
- package/dist/web-search/google.js +48 -0
- package/dist/web-search/google.js.map +1 -0
- package/package.json +58 -0
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { htmlToText, decodeHtmlEntities, removeScriptTags, removeStyleTags, stripHtmlTags, cleanHtmlEntities, looksLikeCode, stripCodeFromText, htmlToTextClean, } from "../../shared/html-utils.js";
|
|
3
|
+
describe("decodeHtmlEntities", () => {
|
|
4
|
+
it("decodes named entities", () => {
|
|
5
|
+
expect(decodeHtmlEntities("&amp; &lt; &gt; &quot;")).toBe('& < > "');
|
|
6
|
+
});
|
|
7
|
+
it("decodes decimal numeric entities", () => {
|
|
8
|
+
expect(decodeHtmlEntities("&#38; &#60;")).toBe("& <");
|
|
9
|
+
});
|
|
10
|
+
it("decodes hexadecimal numeric entities", () => {
|
|
11
|
+
expect(decodeHtmlEntities("&#x26; &#x3C;")).toBe("& <");
|
|
12
|
+
});
|
|
13
|
+
it("passes through plain text unchanged", () => {
|
|
14
|
+
expect(decodeHtmlEntities("Hello world")).toBe("Hello world");
|
|
15
|
+
});
|
|
16
|
+
it("handles mixed entities and text", () => {
|
|
17
|
+
expect(decodeHtmlEntities("Tom &amp; Jerry &mdash; a classic")).toBe("Tom & Jerry \u2014 a classic");
|
|
18
|
+
});
|
|
19
|
+
});
|
|
20
|
+
describe("removeScriptTags", () => {
|
|
21
|
+
it("removes script tags and their content", () => {
|
|
22
|
+
const html = '<p>Hello</p><script>alert("evil")</script><p>World</p>';
|
|
23
|
+
expect(removeScriptTags(html)).toBe("<p>Hello</p><p>World</p>");
|
|
24
|
+
});
|
|
25
|
+
it("handles multiple script tags", () => {
|
|
26
|
+
const html = "<p>A</p><script>x()</script><p>B</p><script>y()</script><p>C</p>";
|
|
27
|
+
expect(removeScriptTags(html)).toBe("<p>A</p><p>B</p><p>C</p>");
|
|
28
|
+
});
|
|
29
|
+
it("handles case-insensitive script tags", () => {
|
|
30
|
+
const html = "<p>Text</p><SCRIPT>code()</SCRIPT><p>More</p>";
|
|
31
|
+
expect(removeScriptTags(html)).toBe("<p>Text</p><p>More</p>");
|
|
32
|
+
});
|
|
33
|
+
it("handles script tags with attributes", () => {
|
|
34
|
+
const html = '<p>Text</p><script type="text/javascript">code()</script><p>More</p>';
|
|
35
|
+
expect(removeScriptTags(html)).toBe("<p>Text</p><p>More</p>");
|
|
36
|
+
});
|
|
37
|
+
it("returns text unchanged when no script tags present", () => {
|
|
38
|
+
const html = "<p>No scripts here</p>";
|
|
39
|
+
expect(removeScriptTags(html)).toBe("<p>No scripts here</p>");
|
|
40
|
+
});
|
|
41
|
+
it("handles malformed script tag without closing bracket", () => {
|
|
42
|
+
const html = "<p>Before</p><script malformed";
|
|
43
|
+
expect(removeScriptTags(html)).toBe("<p>Before</p>");
|
|
44
|
+
});
|
|
45
|
+
});
|
|
46
|
+
describe("removeStyleTags", () => {
|
|
47
|
+
it("removes style tags and their content", () => {
|
|
48
|
+
const html = "<p>Hello</p><style>body { color: red; }</style><p>World</p>";
|
|
49
|
+
expect(removeStyleTags(html)).toBe("<p>Hello</p><p>World</p>");
|
|
50
|
+
});
|
|
51
|
+
it("handles multiple style tags", () => {
|
|
52
|
+
const html = "<style>.a{}</style><p>Text</p><style>.b{}</style><p>More</p>";
|
|
53
|
+
expect(removeStyleTags(html)).toBe("<p>Text</p><p>More</p>");
|
|
54
|
+
});
|
|
55
|
+
it("returns text unchanged when no style tags present", () => {
|
|
56
|
+
const html = "<p>No styles here</p>";
|
|
57
|
+
expect(removeStyleTags(html)).toBe("<p>No styles here</p>");
|
|
58
|
+
});
|
|
59
|
+
});
|
|
60
|
+
describe("stripHtmlTags", () => {
|
|
61
|
+
it("strips all HTML tags, replacing with spaces", () => {
|
|
62
|
+
expect(stripHtmlTags("<p>Hello</p> <b>World</b>")).toBe(" Hello World ");
|
|
63
|
+
});
|
|
64
|
+
it("handles self-closing tags", () => {
|
|
65
|
+
expect(stripHtmlTags("Line 1<br/>Line 2")).toBe("Line 1 Line 2");
|
|
66
|
+
});
|
|
67
|
+
it("handles tags with attributes", () => {
|
|
68
|
+
expect(stripHtmlTags('<a href="http://example.com">Link</a>')).toBe(" Link ");
|
|
69
|
+
});
|
|
70
|
+
it("returns plain text unchanged", () => {
|
|
71
|
+
expect(stripHtmlTags("No tags here")).toBe("No tags here");
|
|
72
|
+
});
|
|
73
|
+
});
|
|
74
|
+
describe("htmlToText", () => {
|
|
75
|
+
it("converts HTML to clean plain text", () => {
|
|
76
|
+
const html = "<p>Hello</p><p>World</p>";
|
|
77
|
+
const result = htmlToText(html);
|
|
78
|
+
expect(result).toBe("Hello World");
|
|
79
|
+
});
|
|
80
|
+
it("strips script tags before processing", () => {
|
|
81
|
+
const html = '<p>Text</p><script>alert("xss")</script><p>More text</p>';
|
|
82
|
+
const result = htmlToText(html);
|
|
83
|
+
expect(result).toBe("Text More text");
|
|
84
|
+
});
|
|
85
|
+
it("strips style tags before processing", () => {
|
|
86
|
+
const html = "<style>.red { color: red; }</style><p>Visible text</p>";
|
|
87
|
+
const result = htmlToText(html);
|
|
88
|
+
expect(result).toBe("Visible text");
|
|
89
|
+
});
|
|
90
|
+
it("decodes HTML entities", () => {
|
|
91
|
+
const html = "<p>Tom &amp; Jerry &mdash; a classic</p>";
|
|
92
|
+
const result = htmlToText(html);
|
|
93
|
+
expect(result).toContain("Tom & Jerry");
|
|
94
|
+
expect(result).toContain("\u2014");
|
|
95
|
+
});
|
|
96
|
+
it("normalizes whitespace", () => {
|
|
97
|
+
const html = "<p> Too many spaces </p>";
|
|
98
|
+
const result = htmlToText(html);
|
|
99
|
+
expect(result).toBe("Too many spaces");
|
|
100
|
+
});
|
|
101
|
+
it("handles nested tags", () => {
|
|
102
|
+
const html = "<div><p><strong>Bold</strong> and <em>italic</em></p></div>";
|
|
103
|
+
const result = htmlToText(html);
|
|
104
|
+
expect(result).toBe("Bold and italic");
|
|
105
|
+
});
|
|
106
|
+
it("handles malformed HTML gracefully", () => {
|
|
107
|
+
const html = "<p>Unclosed paragraph<div>Another<p>Third";
|
|
108
|
+
const result = htmlToText(html);
|
|
109
|
+
expect(result).toContain("Unclosed paragraph");
|
|
110
|
+
expect(result).toContain("Another");
|
|
111
|
+
expect(result).toContain("Third");
|
|
112
|
+
});
|
|
113
|
+
it("handles empty input", () => {
|
|
114
|
+
expect(htmlToText("")).toBe("");
|
|
115
|
+
});
|
|
116
|
+
it("handles input with only whitespace", () => {
|
|
117
|
+
expect(htmlToText(" \n\t ")).toBe("");
|
|
118
|
+
});
|
|
119
|
+
});
|
|
120
|
+
describe("cleanHtmlEntities", () => {
|
|
121
|
+
it("decodes entities without removing tags", () => {
|
|
122
|
+
const html = "<p>Tom &amp; Jerry</p>";
|
|
123
|
+
const result = cleanHtmlEntities(html);
|
|
124
|
+
expect(result).toBe("<p>Tom & Jerry</p>");
|
|
125
|
+
});
|
|
126
|
+
it("normalizes whitespace", () => {
|
|
127
|
+
const html = "<p> spaced out </p>";
|
|
128
|
+
expect(cleanHtmlEntities(html)).toBe("<p> spaced out </p>");
|
|
129
|
+
});
|
|
130
|
+
});
|
|
131
|
+
describe("looksLikeCode", () => {
|
|
132
|
+
it("returns true for JavaScript-like code", () => {
|
|
133
|
+
const code = 'const x = 5; if (x > 3) { console.log("hello"); }';
|
|
134
|
+
expect(looksLikeCode(code)).toBe(true);
|
|
135
|
+
});
|
|
136
|
+
it("returns false for normal text", () => {
|
|
137
|
+
const text = "John was born in Kansas and grew up on a farm with his family.";
|
|
138
|
+
expect(looksLikeCode(text)).toBe(false);
|
|
139
|
+
});
|
|
140
|
+
it("returns false for empty text", () => {
|
|
141
|
+
expect(looksLikeCode("")).toBe(false);
|
|
142
|
+
});
|
|
143
|
+
it("returns false for very short text", () => {
|
|
144
|
+
expect(looksLikeCode("short")).toBe(false);
|
|
145
|
+
});
|
|
146
|
+
});
|
|
147
|
+
describe("stripCodeFromText", () => {
|
|
148
|
+
it("returns empty string for full code block", () => {
|
|
149
|
+
const code = "function hello() { const x = 5; return x; } console.log(hello());";
|
|
150
|
+
expect(stripCodeFromText(code)).toBe("");
|
|
151
|
+
});
|
|
152
|
+
it("returns empty string for empty input", () => {
|
|
153
|
+
expect(stripCodeFromText("")).toBe("");
|
|
154
|
+
});
|
|
155
|
+
it("preserves natural language text", () => {
|
|
156
|
+
const text = "John was born in a small town. He went to university in 1985. He married Jane in 1990.";
|
|
157
|
+
const result = stripCodeFromText(text);
|
|
158
|
+
expect(result).toContain("John was born");
|
|
159
|
+
expect(result).toContain("university in 1985");
|
|
160
|
+
});
|
|
161
|
+
});
|
|
162
|
+
describe("htmlToTextClean", () => {
|
|
163
|
+
it("combines HTML cleaning with code stripping", () => {
|
|
164
|
+
const html = "<p>John was born in Kansas and grew up on a farm with his family and friends.</p>";
|
|
165
|
+
const result = htmlToTextClean(html);
|
|
166
|
+
expect(result).toContain("John was born in Kansas");
|
|
167
|
+
});
|
|
168
|
+
});
|
|
169
|
+
//# sourceMappingURL=html-utils.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"html-utils.test.js","sourceRoot":"","sources":["../../../src/__tests__/shared/html-utils.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAA;AAC7C,OAAO,EACL,UAAU,EACV,kBAAkB,EAClB,gBAAgB,EAChB,eAAe,EACf,aAAa,EACb,iBAAiB,EACjB,aAAa,EACb,iBAAiB,EACjB,eAAe,GAChB,MAAM,4BAA4B,CAAA;AAEnC,QAAQ,CAAC,oBAAoB,EAAE,GAAG,EAAE;IAClC,EAAE,CAAC,wBAAwB,EAAE,GAAG,EAAE;QAChC,MAAM,CAAC,kBAAkB,CAAC,wBAAwB,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;IACtE,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,kCAAkC,EAAE,GAAG,EAAE;QAC1C,MAAM,CAAC,kBAAkB,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;IACvD,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,sCAAsC,EAAE,GAAG,EAAE;QAC9C,MAAM,CAAC,kBAAkB,CAAC,eAAe,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;IACzD,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,qCAAqC,EAAE,GAAG,EAAE;QAC7C,MAAM,CAAC,kBAAkB,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAA;IAC/D,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,iCAAiC,EAAE,GAAG,EAAE;QACzC,MAAM,CAAC,kBAAkB,CAAC,mCAAmC,CAAC,CAAC,CAAC,IAAI,CAClE,8BAA8B,CAC/B,CAAA;IACH,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA;AAEF,QAAQ,CAAC,kBAAkB,EAAE,GAAG,EAAE;IAChC,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;QAC/C,MAAM,IAAI,GAAG,wDAAwD,CAAA;QACrE,MAAM,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAA;IACjE,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,8BAA8B,EAAE,GAAG,EAAE;QACtC,MAAM,IAAI,GAAG,kEAAkE,CAAA;QAC/E,MAAM,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAA;IACjE,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,sCAAsC,EAAE,GAAG,EAAE;QAC9C,MAAM,IAAI,GAAG,+CAA+C,CAAA;QAC5D,MAAM,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAA;IAC/D,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,qCAAqC,EAAE,GAAG,EAAE;QAC7C,MAAM,IAAI,GAAG,sEAAsE,CAAA;QACnF,MAAM,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAA;IAC/D,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,oDAAoD,EAAE,GAAG,EAAE;QAC5D,MAAM,IAAI,GAAG,wBAAwB,CAAA;QACrC,MAAM,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAA;IAC/D,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,sDAAsD,EAAE,GAAG,EAAE;QAC9D,MAAM,IAAI,GAAG,gCAAgC,CAAA;QA
C7C,MAAM,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,eAAe,CAAC,CAAA;IACtD,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA;AAEF,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;IAC/B,EAAE,CAAC,sCAAsC,EAAE,GAAG,EAAE;QAC9C,MAAM,IAAI,GAAG,6DAA6D,CAAA;QAC1E,MAAM,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAA;IAChE,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,6BAA6B,EAAE,GAAG,EAAE;QACrC,MAAM,IAAI,GAAG,8DAA8D,CAAA;QAC3E,MAAM,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAA;IAC9D,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,mDAAmD,EAAE,GAAG,EAAE;QAC3D,MAAM,IAAI,GAAG,uBAAuB,CAAA;QACpC,MAAM,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,uBAAuB,CAAC,CAAA;IAC7D,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA;AAEF,QAAQ,CAAC,eAAe,EAAE,GAAG,EAAE;IAC7B,EAAE,CAAC,6CAA6C,EAAE,GAAG,EAAE;QACrD,MAAM,CAAC,aAAa,CAAC,2BAA2B,CAAC,CAAC,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAA;IAC5E,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,2BAA2B,EAAE,GAAG,EAAE;QACnC,MAAM,CAAC,aAAa,CAAC,mBAAmB,CAAC,CAAC,CAAC,IAAI,CAAC,eAAe,CAAC,CAAA;IAClE,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,8BAA8B,EAAE,GAAG,EAAE;QACtC,MAAM,CAAC,aAAa,CAAC,uCAAuC,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;IAC/E,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,8BAA8B,EAAE,GAAG,EAAE;QACtC,MAAM,CAAC,aAAa,CAAC,cAAc,CAAC,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,CAAA;IAC5D,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA;AAEF,QAAQ,CAAC,YAAY,EAAE,GAAG,EAAE;IAC1B,EAAE,CAAC,mCAAmC,EAAE,GAAG,EAAE;QAC3C,MAAM,IAAI,GAAG,0BAA0B,CAAA;QACvC,MAAM,MAAM,GAAG,UAAU,CAAC,IAAI,CAAC,CAAA;QAC/B,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAA;IACpC,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,sCAAsC,EAAE,GAAG,EAAE;QAC9C,MAAM,IAAI,GAAG,0DAA0D,CAAA;QACvE,MAAM,MAAM,GAAG,UAAU,CAAC,IAAI,CAAC,CAAA;QAC/B,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAA;IACvC,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,qCAAqC,EAAE,GAAG,EAAE;QAC7C,MAAM,IAAI,GAAG,wDAAwD,CAAA;QACrE,MAAM,MAAM,GAAG,UAAU,CAAC,IAAI,CAAC,CAAA;QAC/B,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,CAAA;IACrC,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,uBAAuB,EAAE,GAAG,EAAE;QAC/B,MAAM,IAAI,GAAG,0CAA0C,CAAA;QACvD,MAAM,MAAM,GAAG,UAAU,CAAC,IAAI,CAAC,CAAA;QAC/B,MAAM,CAAC,MAAM,CAAC
,CAAC,SAAS,CAAC,aAAa,CAAC,CAAA;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAA;IACpC,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,uBAAuB,EAAE,GAAG,EAAE;QAC/B,MAAM,IAAI,GAAG,iCAAiC,CAAA;QAC9C,MAAM,MAAM,GAAG,UAAU,CAAC,IAAI,CAAC,CAAA;QAC/B,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAA;IACxC,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,qBAAqB,EAAE,GAAG,EAAE;QAC7B,MAAM,IAAI,GAAG,6DAA6D,CAAA;QAC1E,MAAM,MAAM,GAAG,UAAU,CAAC,IAAI,CAAC,CAAA;QAC/B,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAA;IACxC,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,mCAAmC,EAAE,GAAG,EAAE;QAC3C,MAAM,IAAI,GAAG,2CAA2C,CAAA;QACxD,MAAM,MAAM,GAAG,UAAU,CAAC,IAAI,CAAC,CAAA;QAC/B,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,oBAAoB,CAAC,CAAA;QAC9C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,SAAS,CAAC,CAAA;QACnC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,OAAO,CAAC,CAAA;IACnC,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,qBAAqB,EAAE,GAAG,EAAE;QAC7B,MAAM,CAAC,UAAU,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;IACjC,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,oCAAoC,EAAE,GAAG,EAAE;QAC5C,MAAM,CAAC,UAAU,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;IAC1C,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA;AAEF,QAAQ,CAAC,mBAAmB,EAAE,GAAG,EAAE;IACjC,EAAE,CAAC,wCAAwC,EAAE,GAAG,EAAE;QAChD,MAAM,IAAI,GAAG,wBAAwB,CAAA;QACrC,MAAM,MAAM,GAAG,iBAAiB,CAAC,IAAI,CAAC,CAAA;QACtC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAA;IAC3C,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,uBAAuB,EAAE,GAAG,EAAE;QAC/B,MAAM,IAAI,GAAG,yBAAyB,CAAA;QACtC,MAAM,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAA;IAC7D,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA;AAEF,QAAQ,CAAC,eAAe,EAAE,GAAG,EAAE;IAC7B,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;QAC/C,MAAM,IAAI,GAAG,mDAAmD,CAAA;QAChE,MAAM,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IACxC,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,+BAA+B,EAAE,GAAG,EAAE;QACvC,MAAM,IAAI,GAAG,gEAAgE,CAAA;QAC7E,MAAM,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;IACzC,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,8BAA8B,EAAE,GAAG,EAAE;QACtC,MAAM,CAAC,aAAa,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;IACvC,CAAC,CAAC,CAAA;IAEF,EA
AE,CAAC,mCAAmC,EAAE,GAAG,EAAE;QAC3C,MAAM,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;IAC5C,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA;AAEF,QAAQ,CAAC,mBAAmB,EAAE,GAAG,EAAE;IACjC,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;QAClD,MAAM,IAAI,GAAG,mEAAmE,CAAA;QAChF,MAAM,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;IAC1C,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,sCAAsC,EAAE,GAAG,EAAE;QAC9C,MAAM,CAAC,iBAAiB,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;IACxC,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,iCAAiC,EAAE,GAAG,EAAE;QACzC,MAAM,IAAI,GACR,wFAAwF,CAAA;QAC1F,MAAM,MAAM,GAAG,iBAAiB,CAAC,IAAI,CAAC,CAAA;QACtC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,eAAe,CAAC,CAAA;QACzC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,oBAAoB,CAAC,CAAA;IAChD,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA;AAEF,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;IAC/B,EAAE,CAAC,4CAA4C,EAAE,GAAG,EAAE;QACpD,MAAM,IAAI,GAAG,mFAAmF,CAAA;QAChG,MAAM,MAAM,GAAG,eAAe,CAAC,IAAI,CAAC,CAAA;QACpC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,yBAAyB,CAAC,CAAA;IACrD,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"readability-extract.test.d.ts","sourceRoot":"","sources":["../../../src/__tests__/shared/readability-extract.test.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { extractArticleContent } from "../../shared/readability-extract.js";
|
|
3
|
+
describe("extractArticleContent", () => {
|
|
4
|
+
it("extracts article content from a full HTML page", () => {
|
|
5
|
+
const html = `
|
|
6
|
+
<!DOCTYPE html>
|
|
7
|
+
<html>
|
|
8
|
+
<head><title>Test Article</title></head>
|
|
9
|
+
<body>
|
|
10
|
+
<nav><a href="/">Home</a> | <a href="/about">About</a></nav>
|
|
11
|
+
<article>
|
|
12
|
+
<h1>The Life and Times of John Doe</h1>
|
|
13
|
+
<p>John Doe was born in a small town in Kansas. He grew up surrounded
|
|
14
|
+
by wheat fields and open skies. His father was a farmer and his mother
|
|
15
|
+
was a schoolteacher. From an early age, John showed a remarkable talent
|
|
16
|
+
for storytelling, which would later define his career in journalism.</p>
|
|
17
|
+
<p>After graduating from the University of Kansas in 1985, John moved
|
|
18
|
+
to New York City to pursue his dream of becoming a writer. He worked at
|
|
19
|
+
several newspapers before landing a position at the New York Times.</p>
|
|
20
|
+
<p>John married his college sweetheart, Jane Smith, in 1990. They had
|
|
21
|
+
three children together and lived in Brooklyn for over two decades.</p>
|
|
22
|
+
</article>
|
|
23
|
+
<footer>Copyright 2024</footer>
|
|
24
|
+
</body>
|
|
25
|
+
</html>
|
|
26
|
+
`;
|
|
27
|
+
const result = extractArticleContent(html);
|
|
28
|
+
expect(result).not.toBeNull();
|
|
29
|
+
expect(result.text).toContain("John Doe was born");
|
|
30
|
+
expect(result.text).toContain("University of Kansas");
|
|
31
|
+
expect(result.title).toBe("The Life and Times of John Doe");
|
|
32
|
+
});
|
|
33
|
+
it("returns null for non-article HTML (too short)", () => {
|
|
34
|
+
const html = `<html><body><p>Short.</p></body></html>`;
|
|
35
|
+
const result = extractArticleContent(html);
|
|
36
|
+
expect(result).toBeNull();
|
|
37
|
+
});
|
|
38
|
+
it("returns null for empty input", () => {
|
|
39
|
+
const result = extractArticleContent("");
|
|
40
|
+
expect(result).toBeNull();
|
|
41
|
+
});
|
|
42
|
+
it("returns null for HTML with only navigation elements", () => {
|
|
43
|
+
const html = `
|
|
44
|
+
<html><body>
|
|
45
|
+
<nav><ul><li><a href="/">Home</a></li><li><a href="/about">About</a></li></ul></nav>
|
|
46
|
+
<footer>Copyright 2024</footer>
|
|
47
|
+
</body></html>
|
|
48
|
+
`;
|
|
49
|
+
const result = extractArticleContent(html);
|
|
50
|
+
expect(result).toBeNull();
|
|
51
|
+
});
|
|
52
|
+
it("extracts author from byline when present", () => {
|
|
53
|
+
const html = `
|
|
54
|
+
<!DOCTYPE html>
|
|
55
|
+
<html>
|
|
56
|
+
<head><title>News Article</title></head>
|
|
57
|
+
<body>
|
|
58
|
+
<article>
|
|
59
|
+
<h1>Breaking News Story About Important Events</h1>
|
|
60
|
+
<p class="byline">By Jane Reporter</p>
|
|
61
|
+
<p>This is a detailed news article about very important events that
|
|
62
|
+
happened recently. The events took place in multiple cities across
|
|
63
|
+
the country and affected thousands of people. Officials responded
|
|
64
|
+
quickly to the situation and provided updates throughout the day.
|
|
65
|
+
Additional details emerged as the investigation continued.</p>
|
|
66
|
+
<p>Witnesses described the scene as chaotic but said emergency
|
|
67
|
+
responders arrived within minutes. The local government issued a
|
|
68
|
+
statement praising the response effort and promising a thorough review.</p>
|
|
69
|
+
</article>
|
|
70
|
+
</body>
|
|
71
|
+
</html>
|
|
72
|
+
`;
|
|
73
|
+
const result = extractArticleContent(html);
|
|
74
|
+
expect(result).not.toBeNull();
|
|
75
|
+
expect(result.text).toContain("important events");
|
|
76
|
+
// Readability may or may not extract the byline depending on heuristics,
|
|
77
|
+
// but the article content should always be extracted
|
|
78
|
+
if (result.author) {
|
|
79
|
+
expect(result.author).toContain("Jane Reporter");
|
|
80
|
+
}
|
|
81
|
+
});
|
|
82
|
+
it("uses url parameter for resolving relative links", () => {
|
|
83
|
+
const html = `
|
|
84
|
+
<!DOCTYPE html>
|
|
85
|
+
<html>
|
|
86
|
+
<head><title>Site Article</title></head>
|
|
87
|
+
<body>
|
|
88
|
+
<article>
|
|
89
|
+
<h1>An Article on Example.com About Various Topics</h1>
|
|
90
|
+
<p>This article discusses many interesting topics at great length.
|
|
91
|
+
We explore the nuances and implications of recent developments in
|
|
92
|
+
the field. Our team of researchers spent months investigating these
|
|
93
|
+
claims before publishing their findings in this comprehensive report.</p>
|
|
94
|
+
<p>The implications of these findings are far-reaching and could
|
|
95
|
+
affect policy decisions for years to come. Experts from multiple
|
|
96
|
+
institutions have weighed in on the significance of this research.</p>
|
|
97
|
+
</article>
|
|
98
|
+
</body>
|
|
99
|
+
</html>
|
|
100
|
+
`;
|
|
101
|
+
// Should not throw when url is provided
|
|
102
|
+
const result = extractArticleContent(html, "https://example.com/article");
|
|
103
|
+
expect(result).not.toBeNull();
|
|
104
|
+
expect(result.text).toContain("interesting topics");
|
|
105
|
+
});
|
|
106
|
+
});
|
|
107
|
+
//# sourceMappingURL=readability-extract.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"readability-extract.test.js","sourceRoot":"","sources":["../../../src/__tests__/shared/readability-extract.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAA;AAC7C,OAAO,EAAE,qBAAqB,EAAE,MAAM,qCAAqC,CAAA;AAE3E,QAAQ,CAAC,uBAAuB,EAAE,GAAG,EAAE;IACrC,EAAE,CAAC,gDAAgD,EAAE,GAAG,EAAE;QACxD,MAAM,IAAI,GAAG;;;;;;;;;;;;;;;;;;;;;KAqBZ,CAAA;QACD,MAAM,MAAM,GAAG,qBAAqB,CAAC,IAAI,CAAC,CAAA;QAC1C,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAA;QAC7B,MAAM,CAAC,MAAO,CAAC,IAAI,CAAC,CAAC,SAAS,CAAC,mBAAmB,CAAC,CAAA;QACnD,MAAM,CAAC,MAAO,CAAC,IAAI,CAAC,CAAC,SAAS,CAAC,sBAAsB,CAAC,CAAA;QACtD,MAAM,CAAC,MAAO,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,gCAAgC,CAAC,CAAA;IAC9D,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,+CAA+C,EAAE,GAAG,EAAE;QACvD,MAAM,IAAI,GAAG,yCAAyC,CAAA;QACtD,MAAM,MAAM,GAAG,qBAAqB,CAAC,IAAI,CAAC,CAAA;QAC1C,MAAM,CAAC,MAAM,CAAC,CAAC,QAAQ,EAAE,CAAA;IAC3B,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,8BAA8B,EAAE,GAAG,EAAE;QACtC,MAAM,MAAM,GAAG,qBAAqB,CAAC,EAAE,CAAC,CAAA;QACxC,MAAM,CAAC,MAAM,CAAC,CAAC,QAAQ,EAAE,CAAA;IAC3B,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,qDAAqD,EAAE,GAAG,EAAE;QAC7D,MAAM,IAAI,GAAG;;;;;KAKZ,CAAA;QACD,MAAM,MAAM,GAAG,qBAAqB,CAAC,IAAI,CAAC,CAAA;QAC1C,MAAM,CAAC,MAAM,CAAC,CAAC,QAAQ,EAAE,CAAA;IAC3B,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;QAClD,MAAM,IAAI,GAAG;;;;;;;;;;;;;;;;;;;KAmBZ,CAAA;QACD,MAAM,MAAM,GAAG,qBAAqB,CAAC,IAAI,CAAC,CAAA;QAC1C,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAA;QAC7B,MAAM,CAAC,MAAO,CAAC,IAAI,CAAC,CAAC,SAAS,CAAC,kBAAkB,CAAC,CAAA;QAClD,yEAAyE;QACzE,qDAAqD;QACrD,IAAI,MAAO,CAAC,MAAM,EAAE,CAAC;YACnB,MAAM,CAAC,MAAO,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,eAAe,CAAC,CAAA;QACnD,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,iDAAiD,EAAE,GAAG,EAAE;QACzD,MAAM,IAAI,GAAG;;;;;;;;;;;;;;;;;KAiBZ,CAAA;QACD,wCAAwC;QACxC,MAAM,MAAM,GAAG,qBAAqB,CAAC,IAAI,EAAE,6BAA6B,CAAC,CAAA;QACzE,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAA;QAC7B,MAAM,CAAC,MAAO,CAAC,IAAI,CAAC,CAAC,SAAS,CAAC,oBAAoB,CAAC,CAAA;IACtD,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sanitize-text.test.d.ts","sourceRoot":"","sources":["../../../src/__tests__/shared/sanitize-text.test.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { sanitizeSourceText } from "../../shared/sanitize-text.js";
|
|
3
|
+
describe("sanitizeSourceText", () => {
|
|
4
|
+
it("removes Wikipedia citation markers", () => {
|
|
5
|
+
const text = "He was born in 1920[1] and died in 1990[2].";
|
|
6
|
+
const result = sanitizeSourceText(text);
|
|
7
|
+
expect(result).toBe("He was born in 1920 and died in 1990.");
|
|
8
|
+
});
|
|
9
|
+
it("removes citation markers with spaces inside brackets", () => {
|
|
10
|
+
const text = "He served in the war[ 3 ] and returned home[ 4 ].";
|
|
11
|
+
const result = sanitizeSourceText(text);
|
|
12
|
+
expect(result).toBe("He served in the war and returned home.");
|
|
13
|
+
});
|
|
14
|
+
it("removes multiple adjacent citation markers", () => {
|
|
15
|
+
const text = "According to sources[2][3][4], this is true.";
|
|
16
|
+
const result = sanitizeSourceText(text);
|
|
17
|
+
expect(result).toBe("According to sources, this is true.");
|
|
18
|
+
});
|
|
19
|
+
it("removes [edit] tags", () => {
|
|
20
|
+
const text = "Early life [edit]\nHe was born in Kansas.";
|
|
21
|
+
const result = sanitizeSourceText(text);
|
|
22
|
+
expect(result).toContain("Early life");
|
|
23
|
+
expect(result).not.toContain("[edit]");
|
|
24
|
+
expect(result).toContain("He was born in Kansas.");
|
|
25
|
+
});
|
|
26
|
+
it("removes [citation needed] tags", () => {
|
|
27
|
+
const text = "He was reportedly the tallest actor[citation needed] of his era.";
|
|
28
|
+
const result = sanitizeSourceText(text);
|
|
29
|
+
expect(result).toBe("He was reportedly the tallest actor of his era.");
|
|
30
|
+
});
|
|
31
|
+
it("removes footnote reference lines starting with ^", () => {
|
|
32
|
+
const text = "Main text here.\n^ Footnote reference one.\n^ Footnote reference two.\nMore content.";
|
|
33
|
+
const result = sanitizeSourceText(text);
|
|
34
|
+
expect(result).toContain("Main text here.");
|
|
35
|
+
expect(result).toContain("More content.");
|
|
36
|
+
expect(result).not.toContain("Footnote reference");
|
|
37
|
+
});
|
|
38
|
+
it("removes navigation-like pipe-separated patterns", () => {
|
|
39
|
+
const text = "Real content here.\nNews | Sports | Weather | Entertainment\nMore content.";
|
|
40
|
+
const result = sanitizeSourceText(text);
|
|
41
|
+
expect(result).toContain("Real content here.");
|
|
42
|
+
expect(result).toContain("More content.");
|
|
43
|
+
expect(result).not.toContain("News | Sports");
|
|
44
|
+
});
|
|
45
|
+
it("removes common boilerplate phrases", () => {
|
|
46
|
+
const text = "Important content.\nSign in to continue reading.\nPrivacy Policy and Terms of Service apply.\nMore important content.";
|
|
47
|
+
const result = sanitizeSourceText(text);
|
|
48
|
+
expect(result).toContain("Important content.");
|
|
49
|
+
expect(result).toContain("More important content.");
|
|
50
|
+
expect(result).not.toContain("Sign in");
|
|
51
|
+
expect(result).not.toContain("Privacy Policy");
|
|
52
|
+
});
|
|
53
|
+
it("collapses excess blank lines", () => {
|
|
54
|
+
const text = "Line one.\n\n\n\n\nLine two.";
|
|
55
|
+
const result = sanitizeSourceText(text);
|
|
56
|
+
// After collapsing 3+ newlines to 2, the empty line between them
|
|
57
|
+
// is removed by the final filter step (removes zero-length lines)
|
|
58
|
+
expect(result).toBe("Line one.\nLine two.");
|
|
59
|
+
});
|
|
60
|
+
it("trims whitespace from lines", () => {
|
|
61
|
+
const text = " Line with spaces \n Another line ";
|
|
62
|
+
const result = sanitizeSourceText(text);
|
|
63
|
+
expect(result).toBe("Line with spaces\nAnother line");
|
|
64
|
+
});
|
|
65
|
+
it("preserves normal text", () => {
|
|
66
|
+
const text = "John Doe was born in 1920 in Kansas.\nHe married Jane in 1945.\nThey had three children.";
|
|
67
|
+
const result = sanitizeSourceText(text);
|
|
68
|
+
expect(result).toBe(text);
|
|
69
|
+
});
|
|
70
|
+
it("handles empty input", () => {
|
|
71
|
+
expect(sanitizeSourceText("")).toBe("");
|
|
72
|
+
});
|
|
73
|
+
it("handles whitespace-only input", () => {
|
|
74
|
+
expect(sanitizeSourceText(" \n\n ")).toBe("");
|
|
75
|
+
});
|
|
76
|
+
});
|
|
77
|
+
//# sourceMappingURL=sanitize-text.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sanitize-text.test.js","sourceRoot":"","sources":["../../../src/__tests__/shared/sanitize-text.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAA;AAC7C,OAAO,EAAE,kBAAkB,EAAE,MAAM,+BAA+B,CAAA;AAElE,QAAQ,CAAC,oBAAoB,EAAE,GAAG,EAAE;IAClC,EAAE,CAAC,oCAAoC,EAAE,GAAG,EAAE;QAC5C,MAAM,IAAI,GAAG,6CAA6C,CAAA;QAC1D,MAAM,MAAM,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,uCAAuC,CAAC,CAAA;IAC9D,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,sDAAsD,EAAE,GAAG,EAAE;QAC9D,MAAM,IAAI,GAAG,mDAAmD,CAAA;QAChE,MAAM,MAAM,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,yCAAyC,CAAC,CAAA;IAChE,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,4CAA4C,EAAE,GAAG,EAAE;QACpD,MAAM,IAAI,GAAG,8CAA8C,CAAA;QAC3D,MAAM,MAAM,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,qCAAqC,CAAC,CAAA;IAC5D,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,qBAAqB,EAAE,GAAG,EAAE;QAC7B,MAAM,IAAI,GAAG,2CAA2C,CAAA;QACxD,MAAM,MAAM,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAA;QACtC,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAA;QACtC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,wBAAwB,CAAC,CAAA;IACpD,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,gCAAgC,EAAE,GAAG,EAAE;QACxC,MAAM,IAAI,GAAG,kEAAkE,CAAA;QAC/E,MAAM,MAAM,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,iDAAiD,CAAC,CAAA;IACxE,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,kDAAkD,EAAE,GAAG,EAAE;QAC1D,MAAM,IAAI,GACR,sFAAsF,CAAA;QACxF,MAAM,MAAM,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAA;QAC3C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,eAAe,CAAC,CAAA;QACzC,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,oBAAoB,CAAC,CAAA;IACpD,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,iDAAiD,EAAE,GAAG,EAAE;QACzD,MAAM,IAAI,GAAG,4EAA4E,CAAA;QACzF,MAAM,MAAM,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,oBAAoB,CAAC,CAAA;QAC9C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,eAAe,CAAC,CAAA;QACzC,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,SAAS,C
AAC,eAAe,CAAC,CAAA;IAC/C,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,oCAAoC,EAAE,GAAG,EAAE;QAC5C,MAAM,IAAI,GACR,uHAAuH,CAAA;QACzH,MAAM,MAAM,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,oBAAoB,CAAC,CAAA;QAC9C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,yBAAyB,CAAC,CAAA;QACnD,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,SAAS,CAAC,CAAA;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,gBAAgB,CAAC,CAAA;IAChD,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,8BAA8B,EAAE,GAAG,EAAE;QACtC,MAAM,IAAI,GAAG,8BAA8B,CAAA;QAC3C,MAAM,MAAM,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;QACvC,iEAAiE;QACjE,kEAAkE;QAClE,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,sBAAsB,CAAC,CAAA;IAC7C,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,6BAA6B,EAAE,GAAG,EAAE;QACrC,MAAM,IAAI,GAAG,wCAAwC,CAAA;QACrD,MAAM,MAAM,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,gCAAgC,CAAC,CAAA;IACvD,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,uBAAuB,EAAE,GAAG,EAAE;QAC/B,MAAM,IAAI,GACR,0FAA0F,CAAA;QAC5F,MAAM,MAAM,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IAC3B,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,qBAAqB,EAAE,GAAG,EAAE;QAC7B,MAAM,CAAC,kBAAkB,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;IACzC,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,+BAA+B,EAAE,GAAG,EAAE;QACvC,MAAM,CAAC,kBAAkB,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;IACnD,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"search-utils.test.d.ts","sourceRoot":"","sources":["../../../src/__tests__/shared/search-utils.test.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { splitSearchWords } from "../../shared/search-utils.js";
|
|
3
|
+
describe("splitSearchWords", () => {
|
|
4
|
+
it("splits a query on spaces", () => {
|
|
5
|
+
expect(splitSearchWords("john doe")).toEqual(["john", "doe"]);
|
|
6
|
+
});
|
|
7
|
+
it("handles multiple consecutive spaces", () => {
|
|
8
|
+
expect(splitSearchWords("john doe smith")).toEqual(["john", "doe", "smith"]);
|
|
9
|
+
});
|
|
10
|
+
it("returns empty array for empty string", () => {
|
|
11
|
+
expect(splitSearchWords("")).toEqual([]);
|
|
12
|
+
});
|
|
13
|
+
it("returns empty array for whitespace-only string", () => {
|
|
14
|
+
expect(splitSearchWords(" ")).toEqual([]);
|
|
15
|
+
});
|
|
16
|
+
it("trims leading and trailing whitespace", () => {
|
|
17
|
+
expect(splitSearchWords(" hello world ")).toEqual(["hello", "world"]);
|
|
18
|
+
});
|
|
19
|
+
it("returns single-element array for single word", () => {
|
|
20
|
+
expect(splitSearchWords("hello")).toEqual(["hello"]);
|
|
21
|
+
});
|
|
22
|
+
it("handles tabs and mixed whitespace", () => {
|
|
23
|
+
expect(splitSearchWords("hello\tworld\n test")).toEqual(["hello", "world", "test"]);
|
|
24
|
+
});
|
|
25
|
+
});
|
|
26
|
+
//# sourceMappingURL=search-utils.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"search-utils.test.js","sourceRoot":"","sources":["../../../src/__tests__/shared/search-utils.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAA;AAC7C,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAA;AAE/D,QAAQ,CAAC,kBAAkB,EAAE,GAAG,EAAE;IAChC,EAAE,CAAC,0BAA0B,EAAE,GAAG,EAAE;QAClC,MAAM,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC,CAAA;IAC/D,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,qCAAqC,EAAE,GAAG,EAAE;QAC7C,MAAM,CAAC,gBAAgB,CAAC,oBAAoB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,KAAK,EAAE,OAAO,CAAC,CAAC,CAAA;IAClF,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,sCAAsC,EAAE,GAAG,EAAE;QAC9C,MAAM,CAAC,gBAAgB,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAA;IAC1C,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,gDAAgD,EAAE,GAAG,EAAE;QACxD,MAAM,CAAC,gBAAgB,CAAC,KAAK,CAAC,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAA;IAC7C,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;QAC/C,MAAM,CAAC,gBAAgB,CAAC,iBAAiB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC,CAAA;IACzE,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,8CAA8C,EAAE,GAAG,EAAE;QACtD,MAAM,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,OAAO,CAAC,CAAC,CAAA;IACtD,CAAC,CAAC,CAAA;IAEF,EAAE,CAAC,mCAAmC,EAAE,GAAG,EAAE;QAC3C,MAAM,CAAC,gBAAgB,CAAC,qBAAqB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,OAAO,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC,CAAA;IACrF,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for the Wikidata SPARQL source.
|
|
3
|
+
*
|
|
4
|
+
* Mocks the global `fetch` function to avoid real API calls.
|
|
5
|
+
* Tests SPARQL query construction, response parsing, SPARQL escaping,
|
|
6
|
+
* retry logic, error handling, and the factory function.
|
|
7
|
+
*/
|
|
8
|
+
export {};
|
|
9
|
+
//# sourceMappingURL=wikidata.test.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"wikidata.test.d.ts","sourceRoot":"","sources":["../../../src/__tests__/structured/wikidata.test.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG"}
|