@tyroneross/blog-scraper 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +254 -279
- package/dist/lib/circuit-breaker.d.ts +29 -0
- package/dist/lib/circuit-breaker.d.ts.map +1 -0
- package/dist/lib/circuit-breaker.js +89 -0
- package/dist/lib/circuit-breaker.js.map +1 -0
- package/dist/lib/content-extractor.d.ts +13 -0
- package/dist/lib/content-extractor.d.ts.map +1 -0
- package/dist/lib/content-extractor.js +75 -0
- package/dist/lib/content-extractor.js.map +1 -0
- package/dist/lib/formatters/html-to-markdown.d.ts +21 -0
- package/dist/lib/formatters/html-to-markdown.d.ts.map +1 -0
- package/dist/lib/formatters/html-to-markdown.js +146 -0
- package/dist/lib/formatters/html-to-markdown.js.map +1 -0
- package/dist/lib/formatters/text-cleaner.d.ts +44 -0
- package/dist/lib/formatters/text-cleaner.d.ts.map +1 -0
- package/dist/lib/formatters/text-cleaner.js +143 -0
- package/dist/lib/formatters/text-cleaner.js.map +1 -0
- package/dist/lib/index.d.ts +96 -0
- package/dist/lib/index.d.ts.map +1 -0
- package/dist/lib/index.js +184 -0
- package/dist/lib/index.js.map +1 -0
- package/dist/lib/quality-scorer.d.ts +83 -0
- package/dist/lib/quality-scorer.d.ts.map +1 -0
- package/dist/lib/quality-scorer.js +376 -0
- package/dist/lib/quality-scorer.js.map +1 -0
- package/dist/lib/rss-utils.d.ts +31 -0
- package/dist/lib/rss-utils.d.ts.map +1 -0
- package/dist/lib/rss-utils.js +175 -0
- package/dist/lib/rss-utils.js.map +1 -0
- package/dist/lib/scraping-rate-limiter.d.ts +52 -0
- package/dist/lib/scraping-rate-limiter.d.ts.map +1 -0
- package/dist/lib/scraping-rate-limiter.js +238 -0
- package/dist/lib/scraping-rate-limiter.js.map +1 -0
- package/dist/lib/source-orchestrator.d.ts +306 -0
- package/dist/lib/source-orchestrator.d.ts.map +1 -0
- package/dist/lib/source-orchestrator.js +840 -0
- package/dist/lib/source-orchestrator.js.map +1 -0
- package/dist/lib/types.d.ts +143 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +7 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts +62 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.js +531 -0
- package/dist/lib/web-scrapers/content-extractor.js.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts +74 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.js +598 -0
- package/dist/lib/web-scrapers/html-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts +57 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.js +355 -0
- package/dist/lib/web-scrapers/playwright-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts +42 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.js +285 -0
- package/dist/lib/web-scrapers/robots-checker.js.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts +62 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.js +384 -0
- package/dist/lib/web-scrapers/rss-discovery.js.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts +65 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.js +430 -0
- package/dist/lib/web-scrapers/sitemap-parser.js.map +1 -0
- package/package.json +54 -33
- package/dist/index.d.mts +0 -949
- package/dist/index.d.ts +0 -949
- package/dist/index.js +0 -3236
- package/dist/index.mjs +0 -3165
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.circuitBreakers = exports.CircuitBreaker = void 0;
|
|
4
|
+
/**
 * Circuit breaker around an asynchronous operation.
 *
 * States:
 * - CLOSED: operations run normally; failures are counted.
 * - OPEN: entered once `failures >= options.failureThreshold`; calls are
 *   rejected immediately until `options.resetTimeout` ms have passed since
 *   the last failure.
 * - HALF_OPEN: entered on the first call after the reset window; a success
 *   closes the circuit and resets the counter, a failure re-opens it.
 */
class CircuitBreaker {
    /**
     * @param {{name: string, failureThreshold: number, timeout: number, resetTimeout: number}} options
     *   `name` is used in log/error messages, `timeout` is the per-call limit in ms,
     *   `resetTimeout` is how long (ms) the circuit stays OPEN after the last failure.
     */
    constructor(options) {
        this.failures = 0;
        this.lastFailureTime = 0;
        this.state = 'CLOSED';
        this.options = options;
    }
    /**
     * Run `operation` through the breaker.
     *
     * @param {() => (Promise<*>|*)} operation - work to execute; may return a promise or a plain value.
     * @returns {Promise<*>} the operation's result.
     * @throws {Error} immediately (without calling `operation`) while the circuit is OPEN,
     *   on timeout, or with whatever error `operation` produced.
     */
    async execute(operation) {
        if (this.state === 'OPEN') {
            const stillCoolingDown = Date.now() - this.lastFailureTime < this.options.resetTimeout;
            if (stillCoolingDown) {
                // Rejected calls are not counted as failures.
                throw new Error(`[CircuitBreaker:${this.options.name}] Circuit is OPEN - preventing request`);
            }
            this.state = 'HALF_OPEN';
            console.log(`🔄 [CircuitBreaker:${this.options.name}] Circuit moving to HALF_OPEN state`);
        }
        try {
            const result = await this.executeWithTimeout(operation);
            this.onSuccess();
            return result;
        }
        catch (error) {
            this.onFailure();
            throw error;
        }
    }
    /**
     * Race `operation` against a timer of `options.timeout` ms.
     *
     * The timer is always cleared once the race settles, including when
     * `operation` throws synchronously, so no stray timer keeps the process alive.
     * The timed-out operation itself is not cancelled; only its result is ignored.
     */
    async executeWithTimeout(operation) {
        let timer;
        const deadline = new Promise((_resolve, reject) => {
            timer = setTimeout(() => {
                reject(new Error(`[CircuitBreaker:${this.options.name}] Operation timeout after ${this.options.timeout}ms`));
            }, this.options.timeout);
        });
        // The executor invokes `operation` synchronously, turns a synchronous throw into a
        // rejection, and adopts either a promise or a plain return value.
        const work = new Promise((resolve) => resolve(operation()));
        try {
            return await Promise.race([work, deadline]);
        }
        finally {
            clearTimeout(timer);
        }
    }
    /** Reset the failure counter and close the circuit. */
    onSuccess() {
        this.failures = 0;
        this.state = 'CLOSED';
    }
    /** Record a failure and open the circuit once the threshold is reached. */
    onFailure() {
        this.failures++;
        this.lastFailureTime = Date.now();
        if (this.failures >= this.options.failureThreshold) {
            this.state = 'OPEN';
            console.error(`❌ [CircuitBreaker:${this.options.name}] Circuit opened after ${this.failures} failures`);
        }
    }
    /**
     * @returns {{state: string, failures: number, lastFailureTime: number}} snapshot of the breaker's counters.
     */
    getState() {
        return {
            state: this.state,
            failures: this.failures,
            lastFailureTime: this.lastFailureTime
        };
    }
}
|
|
67
|
+
exports.CircuitBreaker = CircuitBreaker;
|
|
68
|
+
// Pre-configured circuit breakers for common services
|
|
69
|
+
exports.circuitBreakers = {
|
|
70
|
+
rss: new CircuitBreaker({
|
|
71
|
+
name: 'RSS',
|
|
72
|
+
failureThreshold: 3,
|
|
73
|
+
timeout: 15000, // 15 seconds
|
|
74
|
+
resetTimeout: 30000 // 30 seconds
|
|
75
|
+
}),
|
|
76
|
+
scraping: new CircuitBreaker({
|
|
77
|
+
name: 'Scraping',
|
|
78
|
+
failureThreshold: 5,
|
|
79
|
+
timeout: 10000, // 10 seconds
|
|
80
|
+
resetTimeout: 30000 // 30 seconds
|
|
81
|
+
}),
|
|
82
|
+
scrapingTest: new CircuitBreaker({
|
|
83
|
+
name: 'ScrapingTest',
|
|
84
|
+
failureThreshold: 3,
|
|
85
|
+
timeout: 120000, // 120 seconds to accommodate large sites with many sitemaps
|
|
86
|
+
resetTimeout: 60000 // 1 minute
|
|
87
|
+
})
|
|
88
|
+
};
|
|
89
|
+
//# sourceMappingURL=circuit-breaker.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"circuit-breaker.js","sourceRoot":"","sources":["../../lib/circuit-breaker.ts"],"names":[],"mappings":";;;AAOA,MAAa,cAAc;IAMzB,YAAY,OAA8B;QALlC,aAAQ,GAAG,CAAC,CAAC;QACb,oBAAe,GAAG,CAAC,CAAC;QACpB,UAAK,GAAoC,QAAQ,CAAC;QAIxD,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;IACzB,CAAC;IAED,KAAK,CAAC,OAAO,CAAI,SAA2B;QAC1C,IAAI,IAAI,CAAC,KAAK,KAAK,MAAM,EAAE,CAAC;YAC1B,IAAI,IAAI,CAAC,GAAG,EAAE,GAAG,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC,OAAO,CAAC,YAAY,EAAE,CAAC;gBAClE,MAAM,IAAI,KAAK,CAAC,mBAAmB,IAAI,CAAC,OAAO,CAAC,IAAI,wCAAwC,CAAC,CAAC;YAChG,CAAC;iBAAM,CAAC;gBACN,IAAI,CAAC,KAAK,GAAG,WAAW,CAAC;gBACzB,OAAO,CAAC,GAAG,CAAC,sBAAsB,IAAI,CAAC,OAAO,CAAC,IAAI,qCAAqC,CAAC,CAAC;YAC5F,CAAC;QACH,CAAC;QAED,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,kBAAkB,CAAC,SAAS,CAAC,CAAC;YACxD,IAAI,CAAC,SAAS,EAAE,CAAC;YACjB,OAAO,MAAM,CAAC;QAChB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,IAAI,CAAC,SAAS,EAAE,CAAC;YACjB,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,kBAAkB,CAAI,SAA2B;QAC7D,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;YACrC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE;gBAC5B,MAAM,CAAC,IAAI,KAAK,CAAC,mBAAmB,IAAI,CAAC,OAAO,CAAC,IAAI,6BAA6B,IAAI,CAAC,OAAO,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC;YAC/G,CAAC,EAAE,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;YAEzB,SAAS,EAAE;iBACR,IAAI,CAAC,MAAM,CAAC,EAAE;gBACb,YAAY,CAAC,KAAK,CAAC,CAAC;gBACpB,OAAO,CAAC,MAAM,CAAC,CAAC;YAClB,CAAC,CAAC;iBACD,KAAK,CAAC,KAAK,CAAC,EAAE;gBACb,YAAY,CAAC,KAAK,CAAC,CAAC;gBACpB,MAAM,CAAC,KAAK,CAAC,CAAC;YAChB,CAAC,CAAC,CAAC;QACP,CAAC,CAAC,CAAC;IACL,CAAC;IAEO,SAAS;QACf,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC;QAClB,IAAI,CAAC,KAAK,GAAG,QAAQ,CAAC;IACxB,CAAC;IAEO,SAAS;QACf,IAAI,CAAC,QAAQ,EAAE,CAAC;QAChB,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAElC,IAAI,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,OAAO,CAAC,gBAAgB,EAAE,CAAC;YACnD,IAAI,CAAC,KAAK,GAAG,MAAM,CAAC;YACpB,OAAO,CAAC,KAAK,CAAC,qBAAqB,IAAI,CAAC,OAAO,CAAC,IAAI,0BAA0B,IAAI,CAAC,QAAQ,WAAW,CAAC,CAAC;QAC1G,CAAC;IACH,CAAC;IAED,QAAQ;QACN,OAAO;YACL,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,QAAQ,EAAE,IAAI,CAAC,QAAQ;YACvB,eAAe,EAAE,IAAI,CAAC,eAA
e;SACtC,CAAC;IACJ,CAAC;CACF;AAtED,wCAsEC;AAED,sDAAsD;AACzC,QAAA,eAAe,GAAG;IAC7B,GAAG,EAAE,IAAI,cAAc,CAAC;QACtB,IAAI,EAAE,KAAK;QACX,gBAAgB,EAAE,CAAC;QACnB,OAAO,EAAE,KAAK,EAAE,aAAa;QAC7B,YAAY,EAAE,KAAK,CAAC,aAAa;KAClC,CAAC;IAEF,QAAQ,EAAE,IAAI,cAAc,CAAC;QAC3B,IAAI,EAAE,UAAU;QAChB,gBAAgB,EAAE,CAAC;QACnB,OAAO,EAAE,KAAK,EAAE,aAAa;QAC7B,YAAY,EAAE,KAAK,CAAC,aAAa;KAClC,CAAC;IAEF,YAAY,EAAE,IAAI,cAAc,CAAC;QAC/B,IAAI,EAAE,cAAc;QACpB,gBAAgB,EAAE,CAAC;QACnB,OAAO,EAAE,MAAM,EAAE,4DAA4D;QAC7E,YAAY,EAAE,KAAK,CAAC,WAAW;KAChC,CAAC;CACH,CAAC"}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Simplified content extractor for standalone app
|
|
3
|
+
* Uses Mozilla Readability for article extraction
|
|
4
|
+
*/
|
|
5
|
+
import { ExtractedContent } from './types';
|
|
6
|
+
export declare class ContentExtractor {
|
|
7
|
+
extractContent(url: string): Promise<ExtractedContent | null>;
|
|
8
|
+
/**
|
|
9
|
+
* Extract published time from document metadata
|
|
10
|
+
*/
|
|
11
|
+
private extractPublishedTime;
|
|
12
|
+
}
|
|
13
|
+
//# sourceMappingURL=content-extractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"content-extractor.d.ts","sourceRoot":"","sources":["../../lib/content-extractor.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAIH,OAAO,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAC;AAE3C,qBAAa,gBAAgB;IACrB,cAAc,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,GAAG,IAAI,CAAC;IA6CnE;;OAEG;IACH,OAAO,CAAC,oBAAoB;CAqB7B"}
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Simplified content extractor for standalone app
|
|
4
|
+
* Uses Mozilla Readability for article extraction
|
|
5
|
+
*/
|
|
6
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
|
+
exports.ContentExtractor = void 0;
|
|
8
|
+
const readability_1 = require("@mozilla/readability");
|
|
9
|
+
const jsdom_1 = require("jsdom");
|
|
10
|
+
/**
 * Fetches a page and extracts its main article using Readability on a JSDOM document.
 */
class ContentExtractor {
    /**
     * Download `url` and extract the article it contains.
     *
     * @param {string} url - page to fetch; also passed to JSDOM as the document URL.
     * @returns {Promise<object|null>} extracted fields (title, byline, content, textContent,
     *   length, excerpt, siteName, publishedTime, readingTime in minutes), or `null` when the
     *   request fails, the response is not OK, or Readability finds no article.
     *   Errors are logged and never thrown.
     */
    async extractContent(url) {
        try {
            const response = await fetch(url, {
                headers: {
                    'User-Agent': 'Mozilla/5.0 (compatible; ScraperApp/1.0)',
                },
            });
            if (!response.ok) {
                throw new Error(`HTTP ${response.status}: ${response.statusText}`);
            }
            const html = await response.text();
            const dom = new jsdom_1.JSDOM(html, { url });
            // Read the publish time first: Readability.parse() modifies the document it is given,
            // so elements such as <time datetime> may no longer be present afterwards.
            const publishedTime = this.extractPublishedTime(dom.window.document);
            const reader = new readability_1.Readability(dom.window.document);
            const article = reader.parse();
            if (!article) {
                return null;
            }
            // Reading time at 200 words per minute. Trim and drop empty tokens so that
            // leading/trailing whitespace (or empty text) does not count as words.
            const words = article.textContent?.trim().split(/\s+/).filter(Boolean) ?? [];
            const readingTime = Math.ceil(words.length / 200);
            return {
                title: article.title ?? undefined,
                byline: article.byline ?? undefined,
                content: article.content ?? undefined,
                textContent: article.textContent ?? undefined,
                length: article.length ?? undefined,
                excerpt: article.excerpt ?? undefined,
                siteName: article.siteName ?? undefined,
                publishedTime,
                readingTime,
            };
        }
        catch (error) {
            console.error(`[ContentExtractor] Failed to extract from ${url}:`, error);
            return null;
        }
    }
    /**
     * Extract published time from document metadata.
     *
     * Tries each selector in order and returns the first non-empty `content`
     * (or, failing that, `datetime`) attribute found.
     *
     * @param {Document} doc - document to query.
     * @returns {string|undefined} raw attribute value, or `undefined` when nothing matches.
     */
    extractPublishedTime(doc) {
        const selectors = [
            'meta[property="article:published_time"]',
            'meta[name="article:published_time"]',
            'meta[property="og:published_time"]',
            'meta[name="published_time"]',
            'meta[name="date"]',
            'time[datetime]',
        ];
        for (const selector of selectors) {
            const element = doc.querySelector(selector);
            if (element) {
                const content = element.getAttribute('content') || element.getAttribute('datetime');
                if (content)
                    return content;
            }
        }
        return undefined;
    }
}
|
|
74
|
+
exports.ContentExtractor = ContentExtractor;
|
|
75
|
+
//# sourceMappingURL=content-extractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"content-extractor.js","sourceRoot":"","sources":["../../lib/content-extractor.ts"],"names":[],"mappings":";AAAA;;;GAGG;;;AAEH,sDAAmD;AACnD,iCAA8B;AAG9B,MAAa,gBAAgB;IAC3B,KAAK,CAAC,cAAc,CAAC,GAAW;QAC9B,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;gBAChC,OAAO,EAAE;oBACP,YAAY,EAAE,0CAA0C;iBACzD;aACF,CAAC,CAAC;YAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,MAAM,IAAI,KAAK,CAAC,QAAQ,QAAQ,CAAC,MAAM,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;YACrE,CAAC;YAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACnC,MAAM,GAAG,GAAG,IAAI,aAAK,CAAC,IAAI,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;YACrC,MAAM,MAAM,GAAG,IAAI,yBAAW,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;YACpD,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;YAE/B,IAAI,CAAC,OAAO,EAAE,CAAC;gBACb,OAAO,IAAI,CAAC;YACd,CAAC;YAED,gDAAgD;YAChD,MAAM,SAAS,GAAG,OAAO,CAAC,WAAW,EAAE,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC;YAChE,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,GAAG,GAAG,CAAC,CAAC;YAE/C,qCAAqC;YACrC,MAAM,aAAa,GAAG,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;YAErE,OAAO;gBACL,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,SAAS;gBACjC,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,SAAS;gBACnC,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,SAAS;gBACrC,WAAW,EAAE,OAAO,CAAC,WAAW,IAAI,SAAS;gBAC7C,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,SAAS;gBACnC,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,SAAS;gBACrC,QAAQ,EAAE,OAAO,CAAC,QAAQ,IAAI,SAAS;gBACvC,aAAa;gBACb,WAAW;aACZ,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,6CAA6C,GAAG,GAAG,EAAE,KAAK,CAAC,CAAC;YAC1E,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED;;OAEG;IACK,oBAAoB,CAAC,GAAa;QACxC,+BAA+B;QAC/B,MAAM,SAAS,GAAG;YAChB,yCAAyC;YACzC,qCAAqC;YACrC,oCAAoC;YACpC,6BAA6B;YAC7B,mBAAmB;YACnB,gBAAgB;SACjB,CAAC;QAEF,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;YACjC,MAAM,OAAO,GAAG,GAAG,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;YAC5C,IAAI,OAAO,EAAE,CAAC;gBACZ,MAAM,OAAO,GAAG,OAAO,CAAC,YAAY,CAAC,SAAS,CAAC,IAAI,OAAO,CAAC,YAAY,CAAC,UAAU,CAAC,CAAC;gBACpF,IAAI,OAAO;oBAAE,OAAO,OAAO,CAAC;YAC9B,CAAC;QACH,CAAC;QAED,OAAO,SAAS,CAAC;IACnB,CAAC;CACF;AAtED,4CAsEC"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Convert HTML to clean Markdown
|
|
3
|
+
* - Preserves headings, bold, lists, links, code blocks
|
|
4
|
+
* - Strips navigation, forms, UI elements
|
|
5
|
+
* - Smart paragraph detection
|
|
6
|
+
*/
|
|
7
|
+
export declare function htmlToMarkdown(html: string): string;
|
|
8
|
+
/**
|
|
9
|
+
* Strip non-article content from HTML before conversion
|
|
10
|
+
* Removes navigation, forms, UI elements
|
|
11
|
+
*/
|
|
12
|
+
export declare function stripNonArticleContent(html: string): string;
|
|
13
|
+
/**
|
|
14
|
+
* Convert HTML to Markdown with full cleaning
|
|
15
|
+
* This is the main function developers should use
|
|
16
|
+
*/
|
|
17
|
+
export declare function convertToMarkdown(html: string, options?: {
|
|
18
|
+
cleanNonArticle?: boolean;
|
|
19
|
+
smartParagraphs?: boolean;
|
|
20
|
+
}): string;
|
|
21
|
+
//# sourceMappingURL=html-to-markdown.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"html-to-markdown.d.ts","sourceRoot":"","sources":["../../../lib/formatters/html-to-markdown.ts"],"names":[],"mappings":"AAEA;;;;;GAKG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAgDnD;AA0DD;;;GAGG;AACH,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAyB3D;AAED;;;GAGG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,GAAE;IACvD,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,eAAe,CAAC,EAAE,OAAO,CAAC;CACtB,GAAG,MAAM,CAiBd"}
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.htmlToMarkdown = htmlToMarkdown;
|
|
7
|
+
exports.stripNonArticleContent = stripNonArticleContent;
|
|
8
|
+
exports.convertToMarkdown = convertToMarkdown;
|
|
9
|
+
const turndown_1 = __importDefault(require("turndown"));
|
|
10
|
+
/**
 * Convert HTML to clean Markdown.
 * - Preserves headings, bold, lists, links, code blocks
 * - Drops scripts, styles, navigation, forms and other UI elements
 * - Post-processes the result with smartParagraphDetection and normalizeWhitespace
 *
 * @param {string} html - HTML fragment or document; falsy input yields ''.
 * @returns {string} Markdown text.
 */
function htmlToMarkdown(html) {
    if (!html)
        return '';
    const turndownService = new turndown_1.default({
        headingStyle: 'atx', // Use # for headings
        codeBlockStyle: 'fenced', // Use ``` for code blocks
        bulletListMarker: '-', // Use - for lists
        emDelimiter: '*', // Use * for emphasis
        strongDelimiter: '**', // Use ** for strong
    });
    // Remove unwanted elements before conversion
    turndownService.remove([
        'script',
        'style',
        'nav',
        'header',
        'footer',
        'aside',
        'form',
        'button',
        'input',
        'select',
        'textarea',
        'iframe',
        'noscript',
    ]);
    // No custom rule for div/span/p/section/article: Turndown already unwraps these
    // elements and discards their attributes. An added rule that returns the bare content
    // takes precedence over the built-in paragraph/block handling and removes the blank
    // lines between adjacent paragraphs, gluing them together.
    const rawMarkdown = turndownService.turndown(html);
    // Post-processing: spacing around headings/lists, then whitespace cleanup
    return normalizeWhitespace(smartParagraphDetection(rawMarkdown));
}
|
|
59
|
+
/**
 * Smart paragraph detection.
 *
 * Inserts a single empty line between two adjacent non-empty lines when
 * - exactly one of them is an ATX heading (`#` .. `######`), or
 * - the first is a list item (`-`, `*`, `+`) and the second is not.
 * Boundaries that already have an empty line are left untouched.
 *
 * @param {string} markdown - Markdown text.
 * @returns {string} Markdown with the extra spacing applied.
 */
function smartParagraphDetection(markdown) {
    const headingPattern = /^#{1,6}\s/;
    const listItemPattern = /^[-*+]\s/;
    const lines = markdown.split('\n');
    const result = [];
    lines.forEach((line, index) => {
        result.push(line);
        const nextLine = lines[index + 1] ?? '';
        // Nothing to separate at the end of input or next to an existing blank line.
        if (!line || !nextLine)
            return;
        const lineIsHeading = headingPattern.test(line);
        const nextIsHeading = headingPattern.test(nextLine);
        const leavingList = listItemPattern.test(line) && !listItemPattern.test(nextLine);
        // A heading needs space after it and before it. The "before" case applies whatever
        // the previous line was, so it also works on the very first line of the input.
        const headingBoundary = lineIsHeading !== nextIsHeading;
        if (headingBoundary || leavingList) {
            // One separator is enough even when both conditions hold.
            result.push('');
        }
    });
    return result.join('\n');
}
|
|
87
|
+
/**
 * Normalize whitespace.
 * - Trim every line
 * - Collapse runs of 3+ line breaks to exactly 2
 * - Trim the whole result
 *
 * Lines are trimmed before blank runs are collapsed, so lines containing only
 * spaces or tabs cannot leave more than one empty line behind.
 *
 * NOTE(review): trimming each line also removes leading indentation, which flattens
 * nested lists and indented code in the Markdown — confirm whether that is acceptable.
 *
 * @param {string} markdown - Markdown text.
 * @returns {string} normalized Markdown.
 */
function normalizeWhitespace(markdown) {
    return markdown
        .split('\n')
        .map((line) => line.trim())
        .join('\n')
        .replace(/\n{3,}/g, '\n\n')
        .trim();
}
|
|
104
|
+
/**
 * Strip non-article content from HTML before conversion.
 *
 * Removes <nav>, <header>, <footer>, <aside> and <form> elements, plus <div>s whose
 * class or id contains a navigation/ads/social keyword, then drops all double-quoted
 * data-*, class and id attributes from the remaining markup.
 *
 * This is regex based, not a real HTML parser: each element is removed up to its first
 * closing tag, so nested elements of the same name are not balanced.
 *
 * @param {string} html - HTML to clean; falsy input yields ''.
 * @returns {string} cleaned HTML.
 */
function stripNonArticleContent(html) {
    if (!html)
        return '';
    // [\s\S]*? instead of .*? so that elements spanning several lines are matched too.
    const nonArticlePatterns = [
        /<nav\b[^>]*>[\s\S]*?<\/nav>/gi,
        /<header\b[^>]*>[\s\S]*?<\/header>/gi,
        /<footer\b[^>]*>[\s\S]*?<\/footer>/gi,
        /<aside\b[^>]*>[\s\S]*?<\/aside>/gi,
        /<form\b[^>]*>[\s\S]*?<\/form>/gi,
        /<div[^>]*class="[^"]*(?:nav|menu|sidebar|advertisement|ads|social|share|comment|popup|modal)[^"]*"[^>]*>[\s\S]*?<\/div>/gi,
        /<div[^>]*id="[^"]*(?:nav|menu|sidebar|advertisement|ads|social|share|comment|popup|modal)[^"]*"[^>]*>[\s\S]*?<\/div>/gi,
    ];
    let cleaned = html;
    for (const pattern of nonArticlePatterns) {
        cleaned = cleaned.replace(pattern, '');
    }
    // Attribute cleanup. data-* goes first and every pattern requires leading whitespace,
    // so `data-id="5"` is removed as a whole instead of being cut down to `data-`.
    // The attribute name may not contain whitespace or '>', which keeps a value-less
    // data attribute from matching across the rest of the tag.
    cleaned = cleaned.replace(/\s+data-[^=\s>]*="[^"]*"/gi, '');
    cleaned = cleaned.replace(/\s+class="[^"]*"/gi, '');
    cleaned = cleaned.replace(/\s+id="[^"]*"/gi, '');
    return cleaned;
}
|
|
131
|
+
/**
 * Convert HTML to Markdown with full cleaning.
 * This is the main function developers should use.
 *
 * @param {string} html - HTML to convert.
 * @param {{cleanNonArticle?: boolean, smartParagraphs?: boolean}} [options]
 *   `cleanNonArticle` (default true) runs stripNonArticleContent before conversion.
 *   `smartParagraphs` is accepted for compatibility but has no effect here:
 *   htmlToMarkdown is called without it.
 * @returns {string} Markdown text.
 */
function convertToMarkdown(html, options = {}) {
    const { cleanNonArticle = true } = options;
    // Step 1: Strip non-article content if requested
    const processedHtml = cleanNonArticle ? stripNonArticleContent(html) : html;
    // Step 2: Convert to Markdown
    return htmlToMarkdown(processedHtml);
}
|
|
146
|
+
//# sourceMappingURL=html-to-markdown.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"html-to-markdown.js","sourceRoot":"","sources":["../../../lib/formatters/html-to-markdown.ts"],"names":[],"mappings":";;;;;AAQA,wCAgDC;AA8DD,wDAyBC;AAMD,8CAoBC;AAzKD,wDAAuC;AAEvC;;;;;GAKG;AACH,SAAgB,cAAc,CAAC,IAAY;IACzC,IAAI,CAAC,IAAI;QAAE,OAAO,EAAE,CAAC;IAErB,4CAA4C;IAC5C,MAAM,eAAe,GAAG,IAAI,kBAAe,CAAC;QAC1C,YAAY,EAAE,KAAK,EAAE,qBAAqB;QAC1C,cAAc,EAAE,QAAQ,EAAE,0BAA0B;QACpD,gBAAgB,EAAE,GAAG,EAAE,kBAAkB;QACzC,WAAW,EAAE,GAAG,EAAE,qBAAqB;QACvC,eAAe,EAAE,IAAI,EAAE,oBAAoB;KAC5C,CAAC,CAAC;IAEH,6CAA6C;IAC7C,eAAe,CAAC,MAAM,CAAC;QACrB,QAAQ;QACR,OAAO;QACP,KAAK;QACL,QAAQ;QACR,QAAQ;QACR,OAAO;QACP,MAAM;QACN,QAAQ;QACR,OAAO;QACP,QAAQ;QACR,UAAU;QACV,QAAQ;QACR,UAAU;KACX,CAAC,CAAC;IAEH,iDAAiD;IACjD,eAAe,CAAC,OAAO,CAAC,iBAAiB,EAAE;QACzC,MAAM,EAAE,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,EAAE,SAAS,EAAE,SAAS,CAAC;QAClD,WAAW,EAAE,CAAC,OAAO,EAAE,EAAE;YACvB,6CAA6C;YAC7C,OAAO,OAAO,CAAC;QACjB,CAAC;KACF,CAAC,CAAC;IAEH,2BAA2B;IAC3B,IAAI,QAAQ,GAAG,eAAe,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IAE9C,6CAA6C;IAC7C,QAAQ,GAAG,uBAAuB,CAAC,QAAQ,CAAC,CAAC;IAE7C,gCAAgC;IAChC,QAAQ,GAAG,mBAAmB,CAAC,QAAQ,CAAC,CAAC;IAEzC,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;;GAGG;AACH,SAAS,uBAAuB,CAAC,QAAgB;IAC/C,mBAAmB;IACnB,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACnC,MAAM,MAAM,GAAa,EAAE,CAAC;IAE5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACtB,MAAM,QAAQ,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC3C,MAAM,QAAQ,GAAG,CAAC,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAE1D,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAElB,sCAAsC;QACtC,IAAI,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,IAAI,QAAQ,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,WAAW,CAAC,EAAE,CAAC;YACxE,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAClB,CAAC;QAED,uCAAuC;QACvC,IAAI,QAAQ,CAAC,KAAK,CAAC,WAAW,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;YAC7F,MAAM,CAA
C,IAAI,CAAC,EAAE,CAAC,CAAC;QAClB,CAAC;QAED,6BAA6B;QAC7B,IAAI,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,IAAI,QAAQ,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,UAAU,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;YAC/F,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAClB,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC3B,CAAC;AAED;;;;GAIG;AACH,SAAS,mBAAmB,CAAC,QAAgB;IAC3C,iDAAiD;IACjD,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAE/C,iBAAiB;IACjB,QAAQ,GAAG,QAAQ;SAChB,KAAK,CAAC,IAAI,CAAC;SACX,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;SACxB,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,qCAAqC;IACrC,QAAQ,GAAG,QAAQ,CAAC,IAAI,EAAE,CAAC;IAE3B,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;;GAGG;AACH,SAAgB,sBAAsB,CAAC,IAAY;IACjD,IAAI,CAAC,IAAI;QAAE,OAAO,EAAE,CAAC;IAErB,8EAA8E;IAC9E,MAAM,kBAAkB,GAAG;QACzB,0BAA0B;QAC1B,gCAAgC;QAChC,gCAAgC;QAChC,8BAA8B;QAC9B,4BAA4B;QAC5B,sHAAsH;QACtH,mHAAmH;KACpH,CAAC;IAEF,IAAI,OAAO,GAAG,IAAI,CAAC;IACnB,KAAK,MAAM,OAAO,IAAI,kBAAkB,EAAE,CAAC;QACzC,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;IACzC,CAAC;IAED,4CAA4C;IAC5C,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,oBAAoB,EAAE,EAAE,CAAC,CAAC;IACpD,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC;IACjD,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,yBAAyB,EAAE,EAAE,CAAC,CAAC;IAEzD,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;;GAGG;AACH,SAAgB,iBAAiB,CAAC,IAAY,EAAE,UAG5C,EAAE;IACJ,MAAM,EACJ,eAAe,GAAG,IAAI,EACtB,eAAe,GAAG,IAAI,GACvB,GAAG,OAAO,CAAC;IAEZ,IAAI,aAAa,GAAG,IAAI,CAAC;IAEzB,iDAAiD;IACjD,IAAI,eAAe,EAAE,CAAC;QACpB,aAAa,GAAG,sBAAsB,CAAC,aAAa,CAAC,CAAC;IACxD,CAAC;IAED,8BAA8B;IAC9B,MAAM,QAAQ,GAAG,cAAc,CAAC,aAAa,CAAC,CAAC;IAE/C,OAAO,QAAQ,CAAC;AAClB,CAAC"}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Text cleanup utilities
|
|
3
|
+
* Normalize whitespace, remove excessive line breaks, clean HTML entities
|
|
4
|
+
*/
|
|
5
|
+
/**
|
|
6
|
+
* Clean text content
|
|
7
|
+
* - Normalize whitespace between paragraphs
|
|
8
|
+
* - Remove excessive line breaks
|
|
9
|
+
* - Decode HTML entities
|
|
10
|
+
* - Trim redundant spaces
|
|
11
|
+
*/
|
|
12
|
+
export declare function cleanText(text: string): string;
|
|
13
|
+
/**
|
|
14
|
+
* Decode HTML entities (&nbsp;, &amp;, etc.)
|
|
15
|
+
*/
|
|
16
|
+
export declare function decodeHTMLEntities(text: string): string;
|
|
17
|
+
/**
|
|
18
|
+
* Normalize whitespace
|
|
19
|
+
* - Replace multiple spaces with single space
|
|
20
|
+
* - Replace tabs with spaces
|
|
21
|
+
* - Remove trailing/leading whitespace from lines
|
|
22
|
+
*/
|
|
23
|
+
export declare function normalizeWhitespace(text: string): string;
|
|
24
|
+
/**
|
|
25
|
+
* Detect paragraph boundaries and add proper spacing
|
|
26
|
+
* Looks for sentence endings followed by capital letters
|
|
27
|
+
*/
|
|
28
|
+
export declare function detectParagraphs(text: string): string;
|
|
29
|
+
/**
|
|
30
|
+
* Remove URLs from text
|
|
31
|
+
* Useful for cleaning up citations or references
|
|
32
|
+
*/
|
|
33
|
+
export declare function removeUrls(text: string): string;
|
|
34
|
+
/**
|
|
35
|
+
* Truncate text to a maximum length
|
|
36
|
+
* Breaks at word boundaries and adds ellipsis
|
|
37
|
+
*/
|
|
38
|
+
export declare function truncateText(text: string, maxLength: number): string;
|
|
39
|
+
/**
|
|
40
|
+
* Extract plain text from HTML
|
|
41
|
+
* Quick and dirty HTML stripping
|
|
42
|
+
*/
|
|
43
|
+
export declare function stripHTML(html: string): string;
|
|
44
|
+
//# sourceMappingURL=text-cleaner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"text-cleaner.d.ts","sourceRoot":"","sources":["../../../lib/formatters/text-cleaner.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH;;;;;;GAMG;AACH,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAkB9C;AAED;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAgCvD;AAED;;;;;GAKG;AACH,wBAAgB,mBAAmB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAiBxD;AAED;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CA2BrD;AAED;;;GAGG;AACH,wBAAgB,UAAU,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAE/C;AAED;;;GAGG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,GAAG,MAAM,CAYpE;AAED;;;GAGG;AACH,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAO9C"}
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Text cleanup utilities
|
|
4
|
+
* Normalize whitespace, remove excessive line breaks, clean HTML entities
|
|
5
|
+
*/
|
|
6
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
|
+
exports.cleanText = cleanText;
|
|
8
|
+
exports.decodeHTMLEntities = decodeHTMLEntities;
|
|
9
|
+
exports.normalizeWhitespace = normalizeWhitespace;
|
|
10
|
+
exports.detectParagraphs = detectParagraphs;
|
|
11
|
+
exports.removeUrls = removeUrls;
|
|
12
|
+
exports.truncateText = truncateText;
|
|
13
|
+
exports.stripHTML = stripHTML;
|
|
14
|
+
/**
 * Clean text content by running it through the module's cleanup stages:
 * entity decoding, whitespace normalization, paragraph detection, final trim.
 *
 * @param {string} text - raw text; falsy input yields ''.
 * @returns {string} cleaned text.
 */
function cleanText(text) {
    if (!text) {
        return '';
    }
    // Stages run in this exact order; each receives the previous stage's output.
    const stages = [decodeHTMLEntities, normalizeWhitespace, detectParagraphs];
    const processed = stages.reduce((current, stage) => stage(current), text);
    return processed.trim();
}
|
|
35
|
+
/**
 * Decode HTML entities (&nbsp;, &amp;, etc.)
 *
 * Handles a fixed set of named entities plus decimal (&#123;) and hexadecimal
 * (&#x1A2B;) numeric references. Unknown named entities and out-of-range numeric
 * references are left unchanged.
 *
 * Decoding is done in a single pass, so the output of one replacement is never
 * decoded again: `&amp;lt;` becomes `&lt;`, not `<`.
 *
 * @param {string} text - text containing entities.
 * @returns {string} decoded text.
 */
function decodeHTMLEntities(text) {
    // Named entities, keyed without the surrounding '&' and ';' (case-sensitive).
    const namedEntities = new Map([
        ['nbsp', ' '],
        ['amp', '&'],
        ['lt', '<'],
        ['gt', '>'],
        ['quot', '"'],
        ['apos', "'"],
        ['ndash', '\u2013'],
        ['mdash', '\u2014'],
        ['hellip', '\u2026'],
        ['ldquo', '"'],
        ['rdquo', '"'],
        ['lsquo', '\u2018'],
        ['rsquo', '\u2019'],
    ]);
    const maxCodePoint = 0x10ffff;
    return text.replace(/&(?:#x([0-9a-f]+)|#(\d+)|([a-z]+));/gi, (match, hex, dec, name) => {
        if (name !== undefined) {
            return namedEntities.get(name) ?? match;
        }
        const codePoint = hex !== undefined ? parseInt(hex, 16) : parseInt(dec, 10);
        // fromCodePoint (not fromCharCode) so characters above U+FFFF decode correctly;
        // it throws above U+10FFFF, so such references are kept verbatim.
        return codePoint <= maxCodePoint ? String.fromCodePoint(codePoint) : match;
    });
}
|
|
64
|
+
/**
 * Normalize whitespace in plain text.
 *
 * In order: tabs become spaces, runs of spaces shrink to one space, every line
 * is trimmed, and runs of 3+ line breaks shrink to exactly 2.
 *
 * @param {string} text - text to normalize.
 * @returns {string} normalized text (not trimmed as a whole).
 */
function normalizeWhitespace(text) {
    const untabbed = text.replace(/\t/g, ' ');
    // Only literal spaces are collapsed here, so line breaks are preserved.
    const singleSpaced = untabbed.replace(/ {2,}/g, ' ');
    const trimmedRows = [];
    for (const row of singleSpaced.split('\n')) {
        trimmedRows.push(row.trim());
    }
    return trimmedRows.join('\n').replace(/\n{3,}/g, '\n\n');
}
|
|
84
|
+
/**
 * Insert blank lines at likely paragraph boundaries.
 * Blank (whitespace-only) lines in the input are dropped first; a blank line
 * is then added after a line only when that line ends a sentence and the
 * following line looks like the start of a new one.
 */
function detectParagraphs(text) {
    const contentLines = text.split('\n').filter(candidate => candidate.trim().length > 0);
    const output = [];
    contentLines.forEach((current, index) => {
        output.push(current);
        // The last line has no successor; an empty string never qualifies.
        const following = index + 1 < contentLines.length ? contentLines[index + 1] : '';
        // Current line ends with sentence-ending punctuation (. ! ?)
        const endsSentence = /[.!?]$/.test(current);
        // Next line starts with a capital letter or a digit
        const startsSentence = /^[A-Z0-9]/.test(following);
        // Short lines are likely titles, so no break is inserted around them
        const bothLongEnough = current.length > 40 && following.length > 20;
        if (endsSentence && startsSentence && bothLongEnough) {
            output.push(''); // empty entry becomes the paragraph break
        }
    });
    return output.join('\n');
}
|
|
109
|
+
/**
 * Strip http:// and https:// URLs out of text.
 * Each URL is matched up to the next whitespace character and replaced with
 * an empty string; surrounding whitespace is left as it was.
 */
function removeUrls(text) {
    const urlPattern = /https?:\/\/[^\s]+/g;
    const withoutUrls = text.replace(urlPattern, '');
    return withoutUrls;
}
|
|
116
|
+
/**
 * Truncate text to a maximum length.
 * Breaks at a word boundary when one exists and appends an ellipsis. The
 * returned string, ellipsis included, is never longer than `maxLength`
 * UTF-16 code units.
 *
 * @param {string} text - Text to shorten.
 * @param {number} maxLength - Maximum length of the result, ellipsis included.
 * @returns {string} `text` unchanged when it already fits; otherwise a
 *   shortened prefix followed by an ellipsis (or '' when `maxLength` < 1).
 */
function truncateText(text, maxLength) {
    if (text.length <= maxLength)
        return text;
    // A limit below one leaves no room even for the ellipsis itself.
    if (maxLength < 1)
        return '';
    const ellipsis = '\u2026';
    // Find the last space before maxLength
    const truncated = text.substring(0, maxLength);
    const lastSpace = truncated.lastIndexOf(' ');
    if (lastSpace > 0) {
        // lastSpace <= maxLength - 1, so prefix + ellipsis fits the limit.
        return `${truncated.substring(0, lastSpace)}${ellipsis}`;
    }
    // No usable word boundary: hard-cut, reserving one unit for the ellipsis
    // so the result does not exceed maxLength.
    let head = truncated.substring(0, maxLength - 1);
    // Do not leave half of a surrogate pair dangling at the cut point.
    const lastUnit = head.charCodeAt(head.length - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        head = head.slice(0, -1);
    }
    return `${head}${ellipsis}`;
}
|
|
131
|
+
/**
 * Reduce an HTML string to plain text.
 * A quick regex-based strip, not a real parser: <script> and <style>
 * elements are removed together with their contents, every remaining tag is
 * deleted, whitespace runs collapse to single spaces and the result is trimmed.
 */
function stripHTML(html) {
    const scriptElements = /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi;
    const styleElements = /<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi;
    const anyTag = /<[^>]+>/g;
    const whitespaceRun = /\s+/g;
    // Order matters: whole script/style elements go before the generic tag
    // pass, otherwise their contents would be left behind as text.
    let plain = html.replace(scriptElements, '');
    plain = plain.replace(styleElements, '');
    plain = plain.replace(anyTag, '');
    plain = plain.replace(whitespaceRun, ' ');
    return plain.trim();
}
|
|
143
|
+
//# sourceMappingURL=text-cleaner.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"text-cleaner.js","sourceRoot":"","sources":["../../../lib/formatters/text-cleaner.ts"],"names":[],"mappings":";AAAA;;;GAGG;;AASH,8BAkBC;AAKD,gDAgCC;AAQD,kDAiBC;AAMD,4CA2BC;AAMD,gCAEC;AAMD,oCAYC;AAMD,8BAOC;AA/JD;;;;;;GAMG;AACH,SAAgB,SAAS,CAAC,IAAY;IACpC,IAAI,CAAC,IAAI;QAAE,OAAO,EAAE,CAAC;IAErB,IAAI,OAAO,GAAG,IAAI,CAAC;IAEnB,+BAA+B;IAC/B,OAAO,GAAG,kBAAkB,CAAC,OAAO,CAAC,CAAC;IAEtC,+BAA+B;IAC/B,OAAO,GAAG,mBAAmB,CAAC,OAAO,CAAC,CAAC;IAEvC,oCAAoC;IACpC,OAAO,GAAG,gBAAgB,CAAC,OAAO,CAAC,CAAC;IAEpC,eAAe;IACf,OAAO,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;IAEzB,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;GAEG;AACH,SAAgB,kBAAkB,CAAC,IAAY;IAC7C,MAAM,QAAQ,GAA2B;QACvC,QAAQ,EAAE,GAAG;QACb,OAAO,EAAE,GAAG;QACZ,MAAM,EAAE,GAAG;QACX,MAAM,EAAE,GAAG;QACX,QAAQ,EAAE,GAAG;QACb,QAAQ,EAAE,GAAG;QACb,QAAQ,EAAE,GAAG;QACb,SAAS,EAAE,GAAG;QACd,SAAS,EAAE,GAAG;QACd,UAAU,EAAE,GAAG;QACf,SAAS,EAAE,GAAG;QACd,SAAS,EAAE,GAAG;QACd,SAAS,EAAE,QAAQ;QACnB,SAAS,EAAE,QAAQ;KACpB,CAAC;IAEF,IAAI,OAAO,GAAG,IAAI,CAAC;IACnB,KAAK,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC;QACtD,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC;IAC3D,CAAC;IAED,6CAA6C;IAC7C,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE,CACjD,MAAM,CAAC,YAAY,CAAC,QAAQ,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CACxC,CAAC;IACF,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,mBAAmB,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE,CACzD,MAAM,CAAC,YAAY,CAAC,QAAQ,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CACxC,CAAC;IAEF,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;;;;GAKG;AACH,SAAgB,mBAAmB,CAAC,IAAY;IAC9C,2BAA2B;IAC3B,IAAI,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;IAE1C,uEAAuE;IACvE,UAAU,GAAG,UAAU,CAAC,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;IAE/C,iBAAiB;IACjB,UAAU,GAAG,UAAU;SACpB,KAAK,CAAC,IAAI,CAAC;SACX,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;SACxB,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,iDAAiD;IACjD,UAAU,GAAG,UAAU,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAEnD,OAAO,UAAU,CAAC;AACpB,CAAC;AAED;;;GAGG;AACH,SAAgB,gBAAgB,CAAC,IAAY;IAC3C,
gCAAgC;IAChC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAEtE,MAAM,MAAM,GAAa,EAAE,CAAC;IAE5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACtB,MAAM,QAAQ,GAAG,CAAC,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAE1D,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAElB,0BAA0B;QAC1B,gEAAgE;QAChE,oDAAoD;QACpD,kDAAkD;QAClD,IACE,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC;YACpB,QAAQ,CAAC,KAAK,CAAC,WAAW,CAAC;YAC3B,IAAI,CAAC,MAAM,GAAG,EAAE,IAAI,mCAAmC;YACvD,QAAQ,CAAC,MAAM,GAAG,EAAE,EACpB,CAAC;YACD,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,qCAAqC;QACxD,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC3B,CAAC;AAED;;;GAGG;AACH,SAAgB,UAAU,CAAC,IAAY;IACrC,OAAO,IAAI,CAAC,OAAO,CAAC,oBAAoB,EAAE,EAAE,CAAC,CAAC;AAChD,CAAC;AAED;;;GAGG;AACH,SAAgB,YAAY,CAAC,IAAY,EAAE,SAAiB;IAC1D,IAAI,IAAI,CAAC,MAAM,IAAI,SAAS;QAAE,OAAO,IAAI,CAAC;IAE1C,uCAAuC;IACvC,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;IAC/C,MAAM,SAAS,GAAG,SAAS,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;IAE7C,IAAI,SAAS,GAAG,CAAC,EAAE,CAAC;QAClB,OAAO,SAAS,CAAC,SAAS,CAAC,CAAC,EAAE,SAAS,CAAC,GAAG,GAAG,CAAC;IACjD,CAAC;IAED,OAAO,SAAS,GAAG,GAAG,CAAC;AACzB,CAAC;AAED;;;GAGG;AACH,SAAgB,SAAS,CAAC,IAAY;IACpC,OAAO,IAAI;SACR,OAAO,CAAC,qDAAqD,EAAE,EAAE,CAAC;SAClE,OAAO,CAAC,kDAAkD,EAAE,EAAE,CAAC;SAC/D,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC;SACvB,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,IAAI,EAAE,CAAC;AACZ,CAAC"}
|