@tyroneross/blog-scraper 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +254 -279
- package/dist/lib/circuit-breaker.d.ts +29 -0
- package/dist/lib/circuit-breaker.d.ts.map +1 -0
- package/dist/lib/circuit-breaker.js +89 -0
- package/dist/lib/circuit-breaker.js.map +1 -0
- package/dist/lib/content-extractor.d.ts +13 -0
- package/dist/lib/content-extractor.d.ts.map +1 -0
- package/dist/lib/content-extractor.js +75 -0
- package/dist/lib/content-extractor.js.map +1 -0
- package/dist/lib/formatters/html-to-markdown.d.ts +21 -0
- package/dist/lib/formatters/html-to-markdown.d.ts.map +1 -0
- package/dist/lib/formatters/html-to-markdown.js +146 -0
- package/dist/lib/formatters/html-to-markdown.js.map +1 -0
- package/dist/lib/formatters/text-cleaner.d.ts +44 -0
- package/dist/lib/formatters/text-cleaner.d.ts.map +1 -0
- package/dist/lib/formatters/text-cleaner.js +143 -0
- package/dist/lib/formatters/text-cleaner.js.map +1 -0
- package/dist/lib/index.d.ts +96 -0
- package/dist/lib/index.d.ts.map +1 -0
- package/dist/lib/index.js +184 -0
- package/dist/lib/index.js.map +1 -0
- package/dist/lib/quality-scorer.d.ts +83 -0
- package/dist/lib/quality-scorer.d.ts.map +1 -0
- package/dist/lib/quality-scorer.js +376 -0
- package/dist/lib/quality-scorer.js.map +1 -0
- package/dist/lib/rss-utils.d.ts +31 -0
- package/dist/lib/rss-utils.d.ts.map +1 -0
- package/dist/lib/rss-utils.js +175 -0
- package/dist/lib/rss-utils.js.map +1 -0
- package/dist/lib/scraping-rate-limiter.d.ts +52 -0
- package/dist/lib/scraping-rate-limiter.d.ts.map +1 -0
- package/dist/lib/scraping-rate-limiter.js +238 -0
- package/dist/lib/scraping-rate-limiter.js.map +1 -0
- package/dist/lib/source-orchestrator.d.ts +306 -0
- package/dist/lib/source-orchestrator.d.ts.map +1 -0
- package/dist/lib/source-orchestrator.js +840 -0
- package/dist/lib/source-orchestrator.js.map +1 -0
- package/dist/lib/types.d.ts +143 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +7 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts +62 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.js +531 -0
- package/dist/lib/web-scrapers/content-extractor.js.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts +74 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.js +598 -0
- package/dist/lib/web-scrapers/html-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts +57 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.js +355 -0
- package/dist/lib/web-scrapers/playwright-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts +42 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.js +285 -0
- package/dist/lib/web-scrapers/robots-checker.js.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts +62 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.js +384 -0
- package/dist/lib/web-scrapers/rss-discovery.js.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts +65 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.js +430 -0
- package/dist/lib/web-scrapers/sitemap-parser.js.map +1 -0
- package/package.json +54 -33
- package/dist/index.d.mts +0 -949
- package/dist/index.d.ts +0 -949
- package/dist/index.js +0 -3236
- package/dist/index.mjs +0 -3165
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @package blog-content-scraper SDK
|
|
3
|
+
*
|
|
4
|
+
* Intelligent web scraper for extracting blog/news content from any website.
|
|
5
|
+
* Supports RSS feeds, sitemaps, and HTML scraping with automatic detection.
|
|
6
|
+
*
|
|
7
|
+
* @example
|
|
8
|
+
* ```typescript
|
|
9
|
+
* import { scrapeWebsite, createRateLimiter } from '@tyroneross/blog-scraper';
|
|
10
|
+
*
|
|
11
|
+
* // Simple usage
|
|
12
|
+
* const result = await scrapeWebsite('https://techcrunch.com');
|
|
13
|
+
* console.log(result.articles);
|
|
14
|
+
*
|
|
15
|
+
* // With options
|
|
16
|
+
* const result = await scrapeWebsite('https://example.com/blog', {
|
|
17
|
+
* maxArticles: 10,
|
|
18
|
+
* extractFullContent: true,
|
|
19
|
+
* qualityThreshold: 0.5
|
|
20
|
+
* });
|
|
21
|
+
* ```
|
|
22
|
+
*/
|
|
23
|
+
export { globalSourceOrchestrator, SourceOrchestrator, CandidateArticleSchema, SourceConfigSchema } from './source-orchestrator';
|
|
24
|
+
export type { CandidateArticle, SourceConfig, OrchestrationResult } from './source-orchestrator';
|
|
25
|
+
export { ScrapingRateLimiter, createRateLimiter, globalRateLimiter, RATE_LIMITER_PRESETS, type RateLimiterConfig, type RateLimiterPreset } from './scraping-rate-limiter';
|
|
26
|
+
export { calculateArticleQualityScore, DEFAULT_QUALITY_CONFIG, DEFAULT_DENY_PATHS, DEFAULT_ALLOW_PATHS, isNonEnglishLocalePath } from './quality-scorer';
|
|
27
|
+
export { CircuitBreaker, circuitBreakers } from './circuit-breaker';
|
|
28
|
+
export type { ScrapedArticle, ScraperTestResult, ScraperTestRequest, ProgressState, ProgressStage, QualityScoreConfig, ContentValidation, ExtractedContent, ScraperPlugin } from './types';
|
|
29
|
+
export { convertToMarkdown } from './formatters/html-to-markdown';
|
|
30
|
+
export { cleanText, stripHTML } from './formatters/text-cleaner';
|
|
31
|
+
export { globalRSSDiscovery, type DiscoveredFeed } from './web-scrapers/rss-discovery';
|
|
32
|
+
export { globalSitemapParser, type SitemapEntry } from './web-scrapers/sitemap-parser';
|
|
33
|
+
export { HTMLScraper, type ExtractedArticle, type ScrapingConfig } from './web-scrapers/html-scraper';
|
|
34
|
+
export { ContentExtractor } from './web-scrapers/content-extractor';
|
|
35
|
+
/**
|
|
36
|
+
* Simplified scraping function for common use cases
|
|
37
|
+
*
|
|
38
|
+
* @param url - Website URL to scrape
|
|
39
|
+
* @param options - Scraping options
|
|
40
|
+
* @returns Promise with scraped articles and metadata
|
|
41
|
+
*
|
|
42
|
+
* @example
|
|
43
|
+
* ```typescript
|
|
44
|
+
* const result = await scrapeWebsite('https://techcrunch.com', {
|
|
45
|
+
* maxArticles: 5,
|
|
46
|
+
* extractFullContent: true
|
|
47
|
+
* });
|
|
48
|
+
*
|
|
49
|
+
* for (const article of result.articles) {
|
|
50
|
+
* console.log(article.title, article.url);
|
|
51
|
+
* }
|
|
52
|
+
* ```
|
|
53
|
+
*/
|
|
54
|
+
export declare function scrapeWebsite(url: string, options?: {
|
|
55
|
+
/** Maximum articles to return (default: 10) */
|
|
56
|
+
maxArticles?: number;
|
|
57
|
+
/** Extract full article content (default: true) */
|
|
58
|
+
extractFullContent?: boolean;
|
|
59
|
+
/** Minimum quality score 0-1 (default: 0.5); articles scoring below it are dropped from the result */
|
|
60
|
+
qualityThreshold?: number;
|
|
61
|
+
/** Source type: 'auto' | 'rss' | 'sitemap' | 'html' (default: 'auto') */
|
|
62
|
+
sourceType?: 'auto' | 'rss' | 'sitemap' | 'html';
|
|
63
|
+
/** URL patterns to allow (e.g., ['/blog/*', '/news/*']); defaults to an empty list */
|
|
64
|
+
allowPaths?: string[];
|
|
65
|
+
/** URL patterns to deny (e.g., ['/about', '/careers/*']); defaults to DEFAULT_DENY_PATHS */
|
|
66
|
+
denyPaths?: string[];
|
|
67
|
+
/** Progress callback for long-running operations; handed to the full-content extraction step */
|
|
68
|
+
onProgress?: (completed: number, total: number) => void;
|
|
69
|
+
/** Abort signal for cancellation; when it is aborted the promise rejects with Error('Operation cancelled') */
|
|
70
|
+
signal?: AbortSignal;
|
|
71
|
+
}): Promise<{
|
|
72
|
+
/** The URL that was passed in */
url: string;
|
|
73
|
+
/** Source type reported by the orchestrator for this URL */
detectedType: "rss" | "sitemap" | "html";
|
|
74
|
+
/** Articles that met qualityThreshold, chosen from the first maxArticles discovered */
articles: {
|
|
75
|
+
url: string;
|
|
76
|
+
title: string;
|
|
77
|
+
/** ISO-8601 string produced by Date#toISOString from the article's publication date */
publishedDate: string;
|
|
78
|
+
/** The article's excerpt, when one is available */
description: string | undefined;
|
|
79
|
+
/** Article content as returned by the orchestrator; undefined when extractFullContent is false */
fullContent: string | undefined;
|
|
80
|
+
/** fullContent run through convertToMarkdown; undefined when fullContent is empty or missing */
fullContentMarkdown: string | undefined;
|
|
81
|
+
/** fullContent run through stripHTML and then cleanText; undefined when fullContent is empty or missing */
fullContentText: string | undefined;
|
|
82
|
+
confidence: number;
|
|
83
|
+
source: "rss" | "sitemap" | "html" | "discovery";
|
|
84
|
+
/** Value returned by calculateArticleQualityScore for this article */
qualityScore: number;
|
|
85
|
+
metadata: Record<string, any> | undefined;
|
|
86
|
+
}[];
|
|
87
|
+
stats: {
|
|
88
|
+
/** Number of articles the orchestrator returned, before the maxArticles cut */
totalDiscovered: number;
|
|
89
|
+
/** Number of entries in `articles` (after the quality filter) */
afterQualityFilter: number;
|
|
90
|
+
/** Processing time as reported by the orchestrator (unit not visible here; presumably milliseconds, TODO confirm) */
processingTime: number;
|
|
91
|
+
};
|
|
92
|
+
discoveredFeeds: import("./web-scrapers/rss-discovery").DiscoveredFeed[] | undefined;
|
|
93
|
+
discoveredSitemaps: string[] | undefined;
|
|
94
|
+
/** Error messages reported by the orchestrator while processing the source */
errors: string[];
|
|
95
|
+
}>;
|
|
96
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../lib/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;AAGH,OAAO,EACL,wBAAwB,EACxB,kBAAkB,EAClB,sBAAsB,EACtB,kBAAkB,EACnB,MAAM,uBAAuB,CAAC;AAE/B,YAAY,EACV,gBAAgB,EAChB,YAAY,EACZ,mBAAmB,EACpB,MAAM,uBAAuB,CAAC;AAG/B,OAAO,EACL,mBAAmB,EACnB,iBAAiB,EACjB,iBAAiB,EACjB,oBAAoB,EACpB,KAAK,iBAAiB,EACtB,KAAK,iBAAiB,EACvB,MAAM,yBAAyB,CAAC;AAGjC,OAAO,EACL,4BAA4B,EAC5B,sBAAsB,EACtB,kBAAkB,EAClB,mBAAmB,EACnB,sBAAsB,EACvB,MAAM,kBAAkB,CAAC;AAG1B,OAAO,EACL,cAAc,EACd,eAAe,EAChB,MAAM,mBAAmB,CAAC;AAG3B,YAAY,EACV,cAAc,EACd,iBAAiB,EACjB,kBAAkB,EAClB,aAAa,EACb,aAAa,EACb,kBAAkB,EAClB,iBAAiB,EACjB,gBAAgB,EAChB,aAAa,EACd,MAAM,SAAS,CAAC;AAGjB,OAAO,EAAE,iBAAiB,EAAE,MAAM,+BAA+B,CAAC;AAClE,OAAO,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,2BAA2B,CAAC;AAGjE,OAAO,EAAE,kBAAkB,EAAE,KAAK,cAAc,EAAE,MAAM,8BAA8B,CAAC;AACvF,OAAO,EAAE,mBAAmB,EAAE,KAAK,YAAY,EAAE,MAAM,+BAA+B,CAAC;AACvF,OAAO,EAAE,WAAW,EAAE,KAAK,gBAAgB,EAAE,KAAK,cAAc,EAAE,MAAM,6BAA6B,CAAC;AACtG,OAAO,EAAE,gBAAgB,EAAE,MAAM,kCAAkC,CAAC;AAEpE;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAsB,aAAa,CACjC,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IACP,+CAA+C;IAC/C,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,mDAAmD;IACnD,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAC7B,+CAA+C;IAC/C,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,yEAAyE;IACzE,UAAU,CAAC,EAAE,MAAM,GAAG,KAAK,GAAG,SAAS,GAAG,MAAM,CAAC;IACjD,2DAA2D;IAC3D,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC;IACtB,4DAA4D;IAC5D,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;IACrB,oDAAoD;IACpD,UAAU,CAAC,EAAE,CAAC,SAAS,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;IACxD,oCAAoC;IACpC,MAAM,CAAC,EAAE,WAAW,CAAC;CACjB;;;;;;;;;;;;;;;;;;;;;;;;GA2FP"}
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* @package blog-content-scraper SDK
|
|
4
|
+
*
|
|
5
|
+
* Intelligent web scraper for extracting blog/news content from any website.
|
|
6
|
+
* Supports RSS feeds, sitemaps, and HTML scraping with automatic detection.
|
|
7
|
+
*
|
|
8
|
+
* @example
|
|
9
|
+
* ```typescript
|
|
10
|
+
* import { scrapeWebsite, createRateLimiter } from '@tyroneross/blog-scraper';
|
|
11
|
+
*
|
|
12
|
+
* // Simple usage
|
|
13
|
+
* const result = await scrapeWebsite('https://techcrunch.com');
|
|
14
|
+
* console.log(result.articles);
|
|
15
|
+
*
|
|
16
|
+
* // With options
|
|
17
|
+
* const result = await scrapeWebsite('https://example.com/blog', {
|
|
18
|
+
* maxArticles: 10,
|
|
19
|
+
* extractFullContent: true,
|
|
20
|
+
* qualityThreshold: 0.5
|
|
21
|
+
* });
|
|
22
|
+
* ```
|
|
23
|
+
*/
|
|
24
|
+
// Compiler-emitted interop helper: exposes property `k` of module object `m` on `o` under the
// name `k2` (defaults to `k`). A helper already present on `this` is reused instead of redefined.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
25
|
+
if (k2 === undefined) k2 = k;
|
|
26
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
27
|
+
// Fall back to a live getter when there is no descriptor, when it is an accessor on a module
// without __esModule, or when it is a writable/configurable data property; otherwise m's own
// descriptor is reused as-is.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
28
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
29
|
+
}
|
|
30
|
+
Object.defineProperty(o, k2, desc);
|
|
31
|
+
// Variant used when Object.create is unavailable: a plain one-time value copy.
}) : (function(o, m, k, k2) {
|
|
32
|
+
if (k2 === undefined) k2 = k;
|
|
33
|
+
o[k2] = m[k];
|
|
34
|
+
}));
|
|
35
|
+
// Compiler-emitted interop helper: stores `v` as the "default" member of `o`. Uses an enumerable
// defineProperty when Object.create exists, plain assignment otherwise. A helper already present
// on `this` is reused instead of redefined.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
36
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
37
|
+
}) : function(o, v) {
|
|
38
|
+
o["default"] = v;
|
|
39
|
+
});
|
|
40
|
+
// Compiler-emitted interop helper used for `import * as ns` / dynamic import of CommonJS modules.
// Modules flagged __esModule are returned untouched; any other value is wrapped in a fresh object
// that binds every own key except "default" and then sets "default" to the module itself.
var __importStar = (this && this.__importStar) || (function () {
|
|
41
|
+
// Self-replacing function: the first call picks the key-listing strategy and overwrites
// `ownKeys` with it, so later calls go straight to the chosen implementation.
var ownKeys = function(o) {
|
|
42
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
43
|
+
var ar = [];
|
|
44
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
45
|
+
return ar;
|
|
46
|
+
};
|
|
47
|
+
return ownKeys(o);
|
|
48
|
+
};
|
|
49
|
+
return function (mod) {
|
|
50
|
+
if (mod && mod.__esModule) return mod;
|
|
51
|
+
var result = {};
|
|
52
|
+
// "default" is skipped here because it is set explicitly to the whole module just below.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
53
|
+
__setModuleDefault(result, mod);
|
|
54
|
+
return result;
|
|
55
|
+
};
|
|
56
|
+
})();
|
|
57
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
58
|
+
exports.ContentExtractor = exports.HTMLScraper = exports.globalSitemapParser = exports.globalRSSDiscovery = exports.stripHTML = exports.cleanText = exports.convertToMarkdown = exports.circuitBreakers = exports.CircuitBreaker = exports.isNonEnglishLocalePath = exports.DEFAULT_ALLOW_PATHS = exports.DEFAULT_DENY_PATHS = exports.DEFAULT_QUALITY_CONFIG = exports.calculateArticleQualityScore = exports.RATE_LIMITER_PRESETS = exports.globalRateLimiter = exports.createRateLimiter = exports.ScrapingRateLimiter = exports.SourceConfigSchema = exports.CandidateArticleSchema = exports.SourceOrchestrator = exports.globalSourceOrchestrator = void 0;
|
|
59
|
+
exports.scrapeWebsite = scrapeWebsite;
|
|
60
|
+
// Main orchestrator
|
|
61
|
+
var source_orchestrator_1 = require("./source-orchestrator");
|
|
62
|
+
Object.defineProperty(exports, "globalSourceOrchestrator", { enumerable: true, get: function () { return source_orchestrator_1.globalSourceOrchestrator; } });
|
|
63
|
+
Object.defineProperty(exports, "SourceOrchestrator", { enumerable: true, get: function () { return source_orchestrator_1.SourceOrchestrator; } });
|
|
64
|
+
Object.defineProperty(exports, "CandidateArticleSchema", { enumerable: true, get: function () { return source_orchestrator_1.CandidateArticleSchema; } });
|
|
65
|
+
Object.defineProperty(exports, "SourceConfigSchema", { enumerable: true, get: function () { return source_orchestrator_1.SourceConfigSchema; } });
|
|
66
|
+
// Rate limiter with presets
|
|
67
|
+
var scraping_rate_limiter_1 = require("./scraping-rate-limiter");
|
|
68
|
+
Object.defineProperty(exports, "ScrapingRateLimiter", { enumerable: true, get: function () { return scraping_rate_limiter_1.ScrapingRateLimiter; } });
|
|
69
|
+
Object.defineProperty(exports, "createRateLimiter", { enumerable: true, get: function () { return scraping_rate_limiter_1.createRateLimiter; } });
|
|
70
|
+
Object.defineProperty(exports, "globalRateLimiter", { enumerable: true, get: function () { return scraping_rate_limiter_1.globalRateLimiter; } });
|
|
71
|
+
Object.defineProperty(exports, "RATE_LIMITER_PRESETS", { enumerable: true, get: function () { return scraping_rate_limiter_1.RATE_LIMITER_PRESETS; } });
|
|
72
|
+
// Quality scoring
|
|
73
|
+
var quality_scorer_1 = require("./quality-scorer");
|
|
74
|
+
Object.defineProperty(exports, "calculateArticleQualityScore", { enumerable: true, get: function () { return quality_scorer_1.calculateArticleQualityScore; } });
|
|
75
|
+
Object.defineProperty(exports, "DEFAULT_QUALITY_CONFIG", { enumerable: true, get: function () { return quality_scorer_1.DEFAULT_QUALITY_CONFIG; } });
|
|
76
|
+
Object.defineProperty(exports, "DEFAULT_DENY_PATHS", { enumerable: true, get: function () { return quality_scorer_1.DEFAULT_DENY_PATHS; } });
|
|
77
|
+
Object.defineProperty(exports, "DEFAULT_ALLOW_PATHS", { enumerable: true, get: function () { return quality_scorer_1.DEFAULT_ALLOW_PATHS; } });
|
|
78
|
+
Object.defineProperty(exports, "isNonEnglishLocalePath", { enumerable: true, get: function () { return quality_scorer_1.isNonEnglishLocalePath; } });
|
|
79
|
+
// Circuit breaker for resilience
|
|
80
|
+
var circuit_breaker_1 = require("./circuit-breaker");
|
|
81
|
+
Object.defineProperty(exports, "CircuitBreaker", { enumerable: true, get: function () { return circuit_breaker_1.CircuitBreaker; } });
|
|
82
|
+
Object.defineProperty(exports, "circuitBreakers", { enumerable: true, get: function () { return circuit_breaker_1.circuitBreakers; } });
|
|
83
|
+
// Formatters
|
|
84
|
+
var html_to_markdown_1 = require("./formatters/html-to-markdown");
|
|
85
|
+
Object.defineProperty(exports, "convertToMarkdown", { enumerable: true, get: function () { return html_to_markdown_1.convertToMarkdown; } });
|
|
86
|
+
var text_cleaner_1 = require("./formatters/text-cleaner");
|
|
87
|
+
Object.defineProperty(exports, "cleanText", { enumerable: true, get: function () { return text_cleaner_1.cleanText; } });
|
|
88
|
+
Object.defineProperty(exports, "stripHTML", { enumerable: true, get: function () { return text_cleaner_1.stripHTML; } });
|
|
89
|
+
// Re-export scraper components for advanced usage
|
|
90
|
+
var rss_discovery_1 = require("./web-scrapers/rss-discovery");
|
|
91
|
+
Object.defineProperty(exports, "globalRSSDiscovery", { enumerable: true, get: function () { return rss_discovery_1.globalRSSDiscovery; } });
|
|
92
|
+
var sitemap_parser_1 = require("./web-scrapers/sitemap-parser");
|
|
93
|
+
Object.defineProperty(exports, "globalSitemapParser", { enumerable: true, get: function () { return sitemap_parser_1.globalSitemapParser; } });
|
|
94
|
+
var html_scraper_1 = require("./web-scrapers/html-scraper");
|
|
95
|
+
Object.defineProperty(exports, "HTMLScraper", { enumerable: true, get: function () { return html_scraper_1.HTMLScraper; } });
|
|
96
|
+
var content_extractor_1 = require("./web-scrapers/content-extractor");
|
|
97
|
+
Object.defineProperty(exports, "ContentExtractor", { enumerable: true, get: function () { return content_extractor_1.ContentExtractor; } });
|
|
98
|
+
/**
|
|
99
|
+
* Simplified scraping function for common use cases
|
|
100
|
+
*
|
|
101
|
+
* @param url - Website URL to scrape
|
|
102
|
+
* @param options - Scraping options
|
|
103
|
+
* @returns Promise with scraped articles and metadata
|
|
104
|
+
*
|
|
105
|
+
* @example
|
|
106
|
+
* ```typescript
|
|
107
|
+
* const result = await scrapeWebsite('https://techcrunch.com', {
|
|
108
|
+
* maxArticles: 5,
|
|
109
|
+
* extractFullContent: true
|
|
110
|
+
* });
|
|
111
|
+
*
|
|
112
|
+
* for (const article of result.articles) {
|
|
113
|
+
* console.log(article.title, article.url);
|
|
114
|
+
* }
|
|
115
|
+
* ```
|
|
116
|
+
*/
|
|
117
|
+
async function scrapeWebsite(url, options = {}) {
    // Pipeline: discover articles for `url`, keep at most `maxArticles` of them, optionally
    // fetch their full content, score each one, then drop everything scoring below
    // `qualityThreshold`. Because the quality filter runs after the `maxArticles` cut, the
    // result may contain fewer than `maxArticles` entries.
    //
    // Rejects with Error('Operation cancelled') when `signal` is found aborted at any of the
    // checkpoints below. The signal is only polled here; it is not forwarded to the orchestrator.
    const { globalSourceOrchestrator } = await Promise.resolve().then(() => __importStar(require('./source-orchestrator')));
    const { calculateArticleQualityScore, DEFAULT_DENY_PATHS } = await Promise.resolve().then(() => __importStar(require('./quality-scorer')));
    const { convertToMarkdown } = await Promise.resolve().then(() => __importStar(require('./formatters/html-to-markdown')));
    const { cleanText, stripHTML } = await Promise.resolve().then(() => __importStar(require('./formatters/text-cleaner')));
    const { maxArticles = 10, extractFullContent = true, qualityThreshold = 0.5, sourceType = 'auto', allowPaths = [], denyPaths = DEFAULT_DENY_PATHS, onProgress, signal } = options;
    // Single cancellation checkpoint shared by every stage.
    const throwIfCancelled = () => {
        if (signal?.aborted) {
            throw new Error('Operation cancelled');
        }
    };
    // A negative limit would make slice(0, limit) drop items from the END of the list
    // instead of returning none, so clamp it at zero.
    const articleLimit = Math.max(0, maxArticles);
    throwIfCancelled();
    // Process source
    const result = await globalSourceOrchestrator.processSource(url, {
        sourceType,
        allowPaths,
        denyPaths,
        detectOnly: false
    });
    throwIfCancelled();
    // Enhance with full content if requested
    let articles = result.articles.slice(0, articleLimit);
    if (extractFullContent && articles.length > 0) {
        articles = await globalSourceOrchestrator.enhanceWithFullContent(articles, articleLimit, { onProgress });
        // Content extraction is the long-running step; honour an abort that arrived during it
        // instead of silently returning results.
        throwIfCancelled();
    }
    // Calculate quality scores and format output
    const scoredArticles = articles.map(article => {
        // Serialise the publication date once and reuse it for scoring and output.
        const publishedIso = article.publishedAt.toISOString();
        // NOTE(review): `textContent` receives the same value as `content`; if `content` is HTML
        // the scorer sees markup as text. Confirm this is intended before changing it.
        const qualityScore = calculateArticleQualityScore({
            title: article.title,
            excerpt: article.excerpt,
            content: article.content,
            textContent: article.content || '',
            publishedTime: publishedIso
        });
        const fullContent = extractFullContent ? article.content : undefined;
        return {
            url: article.url,
            title: article.title,
            publishedDate: publishedIso,
            description: article.excerpt,
            fullContent,
            fullContentMarkdown: fullContent ? convertToMarkdown(fullContent) : undefined,
            fullContentText: fullContent ? cleanText(stripHTML(fullContent)) : undefined,
            confidence: article.confidence,
            source: article.source,
            qualityScore,
            metadata: article.metadata
        };
    });
    // Filter by quality threshold
    const filteredArticles = scoredArticles.filter(a => a.qualityScore >= qualityThreshold);
    return {
        url,
        detectedType: result.sourceInfo.detectedType,
        articles: filteredArticles,
        stats: {
            // Count before the maxArticles cut and before quality filtering.
            totalDiscovered: result.articles.length,
            afterQualityFilter: filteredArticles.length,
            processingTime: result.processingTime
        },
        discoveredFeeds: result.sourceInfo.discoveredFeeds,
        discoveredSitemaps: result.sourceInfo.discoveredSitemaps,
        errors: result.errors
    };
}
|
|
184
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../lib/index.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAmFH,sCA8GC;AA/LD,oBAAoB;AACpB,6DAK+B;AAJ7B,+HAAA,wBAAwB,OAAA;AACxB,yHAAA,kBAAkB,OAAA;AAClB,6HAAA,sBAAsB,OAAA;AACtB,yHAAA,kBAAkB,OAAA;AASpB,4BAA4B;AAC5B,iEAOiC;AAN/B,4HAAA,mBAAmB,OAAA;AACnB,0HAAA,iBAAiB,OAAA;AACjB,0HAAA,iBAAiB,OAAA;AACjB,6HAAA,oBAAoB,OAAA;AAKtB,kBAAkB;AAClB,mDAM0B;AALxB,8HAAA,4BAA4B,OAAA;AAC5B,wHAAA,sBAAsB,OAAA;AACtB,oHAAA,kBAAkB,OAAA;AAClB,qHAAA,mBAAmB,OAAA;AACnB,wHAAA,sBAAsB,OAAA;AAGxB,iCAAiC;AACjC,qDAG2B;AAFzB,iHAAA,cAAc,OAAA;AACd,kHAAA,eAAe,OAAA;AAgBjB,aAAa;AACb,kEAAkE;AAAzD,qHAAA,iBAAiB,OAAA;AAC1B,0DAAiE;AAAxD,yGAAA,SAAS,OAAA;AAAE,yGAAA,SAAS,OAAA;AAE7B,kDAAkD;AAClD,8DAAuF;AAA9E,mHAAA,kBAAkB,OAAA;AAC3B,gEAAuF;AAA9E,qHAAA,mBAAmB,OAAA;AAC5B,4DAAsG;AAA7F,2GAAA,WAAW,OAAA;AACpB,sEAAoE;AAA3D,qHAAA,gBAAgB,OAAA;AAEzB;;;;;;;;;;;;;;;;;;GAkBG;AACI,KAAK,UAAU,aAAa,CACjC,GAAW,EACX,UAiBI,EAAE;IAEN,MAAM,EAAE,wBAAwB,EAAE,GAAG,wDAAa,uBAAuB,GAAC,CAAC;IAC3E,MAAM,EAAE,4BAA4B,EAAE,kBAAkB,EAAE,GAAG,wDAAa,kBAAkB,GAAC,CAAC;IAC9F,MAAM,EAAE,iBAAiB,EAAE,GAAG,wDAAa,+BAA+B,GAAC,CAAC;IAC5E,MAAM,EAAE,SAAS,EAAE,SAAS,EAAE,GAAG,wDAAa,2BAA2B,GAAC,CAAC;IAE3E,MAAM,EACJ,WAAW,GAAG,EAAE,EAChB,kBAAkB,GAAG,IAAI,EACzB,gBAAgB,GAAG,GAAG,EACtB,UAAU,GAAG,MAAM,EACnB,UAAU,GAAG,EAAE,EACf,SAAS,GAAG,kBAAkB,EAC9B,UAAU,EACV,MAAM,EACP,GAAG,OAAO,CAAC;IAEZ,yBAAyB;IACzB,IAAI,MAAM,EAAE,OAAO,EAAE,CAAC;QACpB,MAAM,IAAI,KAAK,CAAC,qBAAqB,CAAC,CAAC;IACzC,CAAC;IAED,iBAAiB;IACjB,MAAM,MAAM,GAAG,MAAM,wBAAwB,CAAC,aAAa,CAAC,GAAG,EAAE;QAC/D,UAAU;QACV,UAAU;QACV,SAAS;QACT,UAAU,EAAE,KAAK;KAClB,CAAC,CAAC;IAEH,yBAAyB;IACzB,IAAI,MAAM,EAAE,OAAO,EAAE,CAAC;QACpB,MAAM,IAAI,KAAK,CAAC,qBAAqB,CAAC,CAAC;IACzC,CAAC;IAED,yCAAyC;IACzC,IAAI,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,WAAW,CAAC,CAAC;IAErD,IAAI,kBAAkB,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC9C,QAAQ,GAAG,MAAM,wBAAwB,CAAC,sBAAsB,CAC9D,QAAQ,EACR,WAAW,EACX,EAAE,UAAU,EAAE,CACf,CAAC;IACJ,CAAC;IAED,6
CAA6C;IAC7C,MAAM,cAAc,GAAG,QAAQ,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE;QAC5C,MAAM,SAAS,GAAG;YAChB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,WAAW,EAAE,OAAO,CAAC,OAAO,IAAI,EAAE;YAClC,aAAa,EAAE,OAAO,CAAC,WAAW,CAAC,WAAW,EAAE;SACjD,CAAC;QAEF,MAAM,YAAY,GAAG,4BAA4B,CAAC,SAAS,CAAC,CAAC;QAC7D,MAAM,WAAW,GAAG,kBAAkB,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,SAAS,CAAC;QAErE,OAAO;YACL,GAAG,EAAE,OAAO,CAAC,GAAG;YAChB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,aAAa,EAAE,OAAO,CAAC,WAAW,CAAC,WAAW,EAAE;YAChD,WAAW,EAAE,OAAO,CAAC,OAAO;YAC5B,WAAW;YACX,mBAAmB,EAAE,WAAW,CAAC,CAAC,CAAC,iBAAiB,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,SAAS;YAC7E,eAAe,EAAE,WAAW,CAAC,CAAC,CAAC,SAAS,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS;YAC5E,UAAU,EAAE,OAAO,CAAC,UAAU;YAC9B,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,YAAY;YACZ,QAAQ,EAAE,OAAO,CAAC,QAAQ;SAC3B,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,8BAA8B;IAC9B,MAAM,gBAAgB,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,YAAY,IAAI,gBAAgB,CAAC,CAAC;IAExF,OAAO;QACL,GAAG;QACH,YAAY,EAAE,MAAM,CAAC,UAAU,CAAC,YAAY;QAC5C,QAAQ,EAAE,gBAAgB;QAC1B,KAAK,EAAE;YACL,eAAe,EAAE,MAAM,CAAC,QAAQ,CAAC,MAAM;YACvC,kBAAkB,EAAE,gBAAgB,CAAC,MAAM;YAC3C,cAAc,EAAE,MAAM,CAAC,cAAc;SACtC;QACD,eAAe,EAAE,MAAM,CAAC,UAAU,CAAC,eAAe;QAClD,kBAAkB,EAAE,MAAM,CAAC,UAAU,CAAC,kBAAkB;QACxD,MAAM,EAAE,MAAM,CAAC,MAAM;KACtB,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @package @tyroneross/scraper-testing
|
|
3
|
+
* Article quality scoring system
|
|
4
|
+
*
|
|
5
|
+
* No LLM required - uses metadata and content signals to determine article quality
|
|
6
|
+
*/
|
|
7
|
+
import { ExtractedContent, QualityScoreConfig, ContentValidation } from './types';
|
|
8
|
+
/**
|
|
9
|
+
* Default quality score configuration
|
|
10
|
+
* These weights were optimized through testing with 1,788 real articles
|
|
11
|
+
*/
|
|
12
|
+
export declare const DEFAULT_QUALITY_CONFIG: Required<QualityScoreConfig>;
|
|
13
|
+
/**
|
|
14
|
+
* Default patterns to block non-article pages
|
|
15
|
+
* These cover common non-article paths across websites
|
|
16
|
+
*/
|
|
17
|
+
export declare const DEFAULT_DENY_PATHS: string[];
|
|
18
|
+
/**
|
|
19
|
+
* Default patterns for content sections (blog, news, articles)
|
|
20
|
+
* Used for allow-listing paths when scraping
|
|
21
|
+
*/
|
|
22
|
+
export declare const DEFAULT_ALLOW_PATHS: string[];
|
|
23
|
+
/**
|
|
24
|
+
* Validate content quality (Tier 2 filtering)
|
|
25
|
+
* Checks length, title quality, and text-to-HTML ratio
|
|
26
|
+
*
|
|
27
|
+
* @param extracted - Extracted content from article
|
|
28
|
+
* @returns Validation result with score and reasons
|
|
29
|
+
*/
|
|
30
|
+
export declare function validateContent(extracted: ExtractedContent): ContentValidation;
|
|
31
|
+
/**
|
|
32
|
+
* Calculate article quality score (Tier 3 filtering)
|
|
33
|
+
*
|
|
34
|
+
* Score breakdown:
|
|
35
|
+
* - Content validation (60%): Length, title quality, text-to-HTML ratio
|
|
36
|
+
* - Publication date (12%): Articles should have timestamps
|
|
37
|
+
* - Author/byline (8%): Professional articles cite authors
|
|
38
|
+
* - Schema.org metadata (8%): Structured data indicates article pages
|
|
39
|
+
* - Reading time (12%): Substantial content (2+ min read)
|
|
40
|
+
*
|
|
41
|
+
* @param extracted - Extracted content from article
|
|
42
|
+
* @param config - Optional quality score configuration
|
|
43
|
+
* @returns Quality score between 0-1
|
|
44
|
+
*/
|
|
45
|
+
export declare function calculateArticleQualityScore(extracted: ExtractedContent, config?: QualityScoreConfig): number;
|
|
46
|
+
/**
|
|
47
|
+
* Check if a path should be filtered out (non-US-English locale)
|
|
48
|
+
*
|
|
49
|
+
* Returns true (should filter) for:
|
|
50
|
+
* - /fr-be/, /de-de/, /ja-jp/, /zh-cn/ (non-English locales)
|
|
51
|
+
* - /en-gb/, /en-au/, /en-ca/ (non-US English locales)
|
|
52
|
+
*
|
|
53
|
+
* Returns false (should keep) for:
|
|
54
|
+
* - /en-us/ (US English only)
|
|
55
|
+
* - /blog/, /news/, /articles/ (no locale prefix - default to US English)
|
|
56
|
+
*/
|
|
57
|
+
export declare function isNonEnglishLocalePath(path: string): boolean;
|
|
58
|
+
/**
|
|
59
|
+
* Check if a URL should be denied based on path patterns
|
|
60
|
+
*
|
|
61
|
+
* @param url - URL to check
|
|
62
|
+
* @param denyPaths - Patterns to deny (supports wildcards with *)
|
|
63
|
+
* @returns True if URL should be denied
|
|
64
|
+
*/
|
|
65
|
+
export declare function shouldDenyUrl(url: string, denyPaths?: string[]): boolean;
|
|
66
|
+
/**
|
|
67
|
+
* Get quality score breakdown for debugging
|
|
68
|
+
* Useful for understanding why an article scored a certain way
|
|
69
|
+
*
|
|
70
|
+
* @param extracted - Extracted content from article
|
|
71
|
+
* @param config - Optional quality score configuration
|
|
72
|
+
* @returns Breakdown of quality score components
|
|
73
|
+
*/
|
|
74
|
+
export declare function getQualityBreakdown(extracted: ExtractedContent, config?: QualityScoreConfig): {
|
|
75
|
+
contentValidation: number;
|
|
76
|
+
publishedDate: number;
|
|
77
|
+
author: number;
|
|
78
|
+
schema: number;
|
|
79
|
+
readingTime: number;
|
|
80
|
+
total: number;
|
|
81
|
+
passesThreshold: boolean;
|
|
82
|
+
};
|
|
83
|
+
//# sourceMappingURL=quality-scorer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"quality-scorer.d.ts","sourceRoot":"","sources":["../../lib/quality-scorer.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,gBAAgB,EAAE,kBAAkB,EAAE,iBAAiB,EAAE,MAAM,SAAS,CAAC;AAElF;;;GAGG;AACH,eAAO,MAAM,sBAAsB,EAAE,QAAQ,CAAC,kBAAkB,CAO/D,CAAC;AAEF;;;GAGG;AACH,eAAO,MAAM,kBAAkB,UA+H9B,CAAC;AAEF;;;GAGG;AACH,eAAO,MAAM,mBAAmB,UAc/B,CAAC;AAEF;;;;;;GAMG;AACH,wBAAgB,eAAe,CAAC,SAAS,EAAE,gBAAgB,GAAG,iBAAiB,CAsC9E;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAgB,4BAA4B,CAC1C,SAAS,EAAE,gBAAgB,EAC3B,MAAM,GAAE,kBAAuB,GAC9B,MAAM,CAiDR;AAYD;;;;;;;;;;GAUG;AACH,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAQ5D;AAED;;;;;;GAMG;AACH,wBAAgB,aAAa,CAAC,GAAG,EAAE,MAAM,EAAE,SAAS,GAAE,MAAM,EAAuB,GAAG,OAAO,CAyB5F;AAED;;;;;;;GAOG;AACH,wBAAgB,mBAAmB,CACjC,SAAS,EAAE,gBAAgB,EAC3B,MAAM,GAAE,kBAAuB,GAC9B;IACD,iBAAiB,EAAE,MAAM,CAAC;IAC1B,aAAa,EAAE,MAAM,CAAC;IACtB,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,WAAW,EAAE,MAAM,CAAC;IACpB,KAAK,EAAE,MAAM,CAAC;IACd,eAAe,EAAE,OAAO,CAAC;CAC1B,CA8CA"}
|