@tyroneross/blog-scraper 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/LICENSE +21 -0
  2. package/README.md +254 -279
  3. package/dist/lib/circuit-breaker.d.ts +29 -0
  4. package/dist/lib/circuit-breaker.d.ts.map +1 -0
  5. package/dist/lib/circuit-breaker.js +89 -0
  6. package/dist/lib/circuit-breaker.js.map +1 -0
  7. package/dist/lib/content-extractor.d.ts +13 -0
  8. package/dist/lib/content-extractor.d.ts.map +1 -0
  9. package/dist/lib/content-extractor.js +75 -0
  10. package/dist/lib/content-extractor.js.map +1 -0
  11. package/dist/lib/formatters/html-to-markdown.d.ts +21 -0
  12. package/dist/lib/formatters/html-to-markdown.d.ts.map +1 -0
  13. package/dist/lib/formatters/html-to-markdown.js +146 -0
  14. package/dist/lib/formatters/html-to-markdown.js.map +1 -0
  15. package/dist/lib/formatters/text-cleaner.d.ts +44 -0
  16. package/dist/lib/formatters/text-cleaner.d.ts.map +1 -0
  17. package/dist/lib/formatters/text-cleaner.js +143 -0
  18. package/dist/lib/formatters/text-cleaner.js.map +1 -0
  19. package/dist/lib/index.d.ts +96 -0
  20. package/dist/lib/index.d.ts.map +1 -0
  21. package/dist/lib/index.js +184 -0
  22. package/dist/lib/index.js.map +1 -0
  23. package/dist/lib/quality-scorer.d.ts +83 -0
  24. package/dist/lib/quality-scorer.d.ts.map +1 -0
  25. package/dist/lib/quality-scorer.js +376 -0
  26. package/dist/lib/quality-scorer.js.map +1 -0
  27. package/dist/lib/rss-utils.d.ts +31 -0
  28. package/dist/lib/rss-utils.d.ts.map +1 -0
  29. package/dist/lib/rss-utils.js +175 -0
  30. package/dist/lib/rss-utils.js.map +1 -0
  31. package/dist/lib/scraping-rate-limiter.d.ts +52 -0
  32. package/dist/lib/scraping-rate-limiter.d.ts.map +1 -0
  33. package/dist/lib/scraping-rate-limiter.js +238 -0
  34. package/dist/lib/scraping-rate-limiter.js.map +1 -0
  35. package/dist/lib/source-orchestrator.d.ts +306 -0
  36. package/dist/lib/source-orchestrator.d.ts.map +1 -0
  37. package/dist/lib/source-orchestrator.js +840 -0
  38. package/dist/lib/source-orchestrator.js.map +1 -0
  39. package/dist/lib/types.d.ts +143 -0
  40. package/dist/lib/types.d.ts.map +1 -0
  41. package/dist/lib/types.js +7 -0
  42. package/dist/lib/types.js.map +1 -0
  43. package/dist/lib/web-scrapers/content-extractor.d.ts +62 -0
  44. package/dist/lib/web-scrapers/content-extractor.d.ts.map +1 -0
  45. package/dist/lib/web-scrapers/content-extractor.js +531 -0
  46. package/dist/lib/web-scrapers/content-extractor.js.map +1 -0
  47. package/dist/lib/web-scrapers/html-scraper.d.ts +74 -0
  48. package/dist/lib/web-scrapers/html-scraper.d.ts.map +1 -0
  49. package/dist/lib/web-scrapers/html-scraper.js +598 -0
  50. package/dist/lib/web-scrapers/html-scraper.js.map +1 -0
  51. package/dist/lib/web-scrapers/playwright-scraper.d.ts +57 -0
  52. package/dist/lib/web-scrapers/playwright-scraper.d.ts.map +1 -0
  53. package/dist/lib/web-scrapers/playwright-scraper.js +355 -0
  54. package/dist/lib/web-scrapers/playwright-scraper.js.map +1 -0
  55. package/dist/lib/web-scrapers/robots-checker.d.ts +42 -0
  56. package/dist/lib/web-scrapers/robots-checker.d.ts.map +1 -0
  57. package/dist/lib/web-scrapers/robots-checker.js +285 -0
  58. package/dist/lib/web-scrapers/robots-checker.js.map +1 -0
  59. package/dist/lib/web-scrapers/rss-discovery.d.ts +62 -0
  60. package/dist/lib/web-scrapers/rss-discovery.d.ts.map +1 -0
  61. package/dist/lib/web-scrapers/rss-discovery.js +384 -0
  62. package/dist/lib/web-scrapers/rss-discovery.js.map +1 -0
  63. package/dist/lib/web-scrapers/sitemap-parser.d.ts +65 -0
  64. package/dist/lib/web-scrapers/sitemap-parser.d.ts.map +1 -0
  65. package/dist/lib/web-scrapers/sitemap-parser.js +430 -0
  66. package/dist/lib/web-scrapers/sitemap-parser.js.map +1 -0
  67. package/package.json +54 -33
  68. package/dist/index.d.mts +0 -949
  69. package/dist/index.d.ts +0 -949
  70. package/dist/index.js +0 -3236
  71. package/dist/index.mjs +0 -3165
@@ -0,0 +1,96 @@
1
+ /**
2
+ * @package blog-content-scraper SDK
3
+ *
4
+ * Intelligent web scraper for extracting blog/news content from any website.
5
+ * Supports RSS feeds, sitemaps, and HTML scraping with automatic detection.
6
+ *
7
+ * @example
8
+ * ```typescript
9
+ * import { scrapeWebsite, createRateLimiter } from '@tyroneross/blog-scraper';
10
+ *
11
+ * // Simple usage
12
+ * const result = await scrapeWebsite('https://techcrunch.com');
13
+ * console.log(result.articles);
14
+ *
15
+ * // With options
16
+ * const result = await scrapeWebsite('https://example.com/blog', {
17
+ * maxArticles: 10,
18
+ * extractFullContent: true,
19
+ * qualityThreshold: 0.5
20
+ * });
21
+ * ```
22
+ */
23
+ export { globalSourceOrchestrator, SourceOrchestrator, CandidateArticleSchema, SourceConfigSchema } from './source-orchestrator';
24
+ export type { CandidateArticle, SourceConfig, OrchestrationResult } from './source-orchestrator';
25
+ export { ScrapingRateLimiter, createRateLimiter, globalRateLimiter, RATE_LIMITER_PRESETS, type RateLimiterConfig, type RateLimiterPreset } from './scraping-rate-limiter';
26
+ export { calculateArticleQualityScore, DEFAULT_QUALITY_CONFIG, DEFAULT_DENY_PATHS, DEFAULT_ALLOW_PATHS, isNonEnglishLocalePath } from './quality-scorer';
27
+ export { CircuitBreaker, circuitBreakers } from './circuit-breaker';
28
+ export type { ScrapedArticle, ScraperTestResult, ScraperTestRequest, ProgressState, ProgressStage, QualityScoreConfig, ContentValidation, ExtractedContent, ScraperPlugin } from './types';
29
+ export { convertToMarkdown } from './formatters/html-to-markdown';
30
+ export { cleanText, stripHTML } from './formatters/text-cleaner';
31
+ export { globalRSSDiscovery, type DiscoveredFeed } from './web-scrapers/rss-discovery';
32
+ export { globalSitemapParser, type SitemapEntry } from './web-scrapers/sitemap-parser';
33
+ export { HTMLScraper, type ExtractedArticle, type ScrapingConfig } from './web-scrapers/html-scraper';
34
+ export { ContentExtractor } from './web-scrapers/content-extractor';
35
+ /**
36
+ * Simplified scraping function for common use cases
37
+ *
38
+ * @param url - Website URL to scrape
39
+ * @param options - Scraping options
40
+ * @returns Promise with scraped articles and metadata
41
+ *
42
+ * @example
43
+ * ```typescript
44
+ * const result = await scrapeWebsite('https://techcrunch.com', {
45
+ * maxArticles: 5,
46
+ * extractFullContent: true
47
+ * });
48
+ *
49
+ * for (const article of result.articles) {
50
+ * console.log(article.title, article.url);
51
+ * }
52
+ * ```
53
+ */
54
+ export declare function scrapeWebsite(url: string, options?: {
55
+ /** Maximum articles to process (default: 10); fewer may be returned because the quality filter is applied after this cap */
56
+ maxArticles?: number;
57
+ /** Extract full article content (default: true) */
58
+ extractFullContent?: boolean;
59
+ /** Minimum quality score 0-1 (default: 0.5) */
60
+ qualityThreshold?: number;
61
+ /** Source type: 'auto' | 'rss' | 'sitemap' | 'html' (default: 'auto') */
62
+ sourceType?: 'auto' | 'rss' | 'sitemap' | 'html';
63
+ /** URL patterns to allow (e.g., ['/blog/*', '/news/*']) */
64
+ allowPaths?: string[];
65
+ /** URL patterns to deny (e.g., ['/about', '/careers/*']) */
66
+ denyPaths?: string[];
67
+ /** Progress callback for long-running operations */
68
+ onProgress?: (completed: number, total: number) => void;
69
+ /** Abort signal for cancellation; checked only before and after source discovery and not forwarded to the orchestrator calls */
70
+ signal?: AbortSignal;
71
+ }): Promise<{
72
+ url: string;
73
+ detectedType: "rss" | "sitemap" | "html";
74
+ articles: {
75
+ url: string;
76
+ title: string;
77
+ publishedDate: string;
78
+ description: string | undefined;
79
+ fullContent: string | undefined;
80
+ fullContentMarkdown: string | undefined;
81
+ fullContentText: string | undefined;
82
+ confidence: number;
83
+ source: "rss" | "sitemap" | "html" | "discovery";
84
+ qualityScore: number;
85
+ metadata: Record<string, any> | undefined;
86
+ }[];
87
+ stats: {
88
+ totalDiscovered: number;
89
+ afterQualityFilter: number;
90
+ processingTime: number;
91
+ };
92
+ discoveredFeeds: import("./web-scrapers/rss-discovery").DiscoveredFeed[] | undefined;
93
+ discoveredSitemaps: string[] | undefined;
94
+ errors: string[];
95
+ }>;
96
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../lib/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;AAGH,OAAO,EACL,wBAAwB,EACxB,kBAAkB,EAClB,sBAAsB,EACtB,kBAAkB,EACnB,MAAM,uBAAuB,CAAC;AAE/B,YAAY,EACV,gBAAgB,EAChB,YAAY,EACZ,mBAAmB,EACpB,MAAM,uBAAuB,CAAC;AAG/B,OAAO,EACL,mBAAmB,EACnB,iBAAiB,EACjB,iBAAiB,EACjB,oBAAoB,EACpB,KAAK,iBAAiB,EACtB,KAAK,iBAAiB,EACvB,MAAM,yBAAyB,CAAC;AAGjC,OAAO,EACL,4BAA4B,EAC5B,sBAAsB,EACtB,kBAAkB,EAClB,mBAAmB,EACnB,sBAAsB,EACvB,MAAM,kBAAkB,CAAC;AAG1B,OAAO,EACL,cAAc,EACd,eAAe,EAChB,MAAM,mBAAmB,CAAC;AAG3B,YAAY,EACV,cAAc,EACd,iBAAiB,EACjB,kBAAkB,EAClB,aAAa,EACb,aAAa,EACb,kBAAkB,EAClB,iBAAiB,EACjB,gBAAgB,EAChB,aAAa,EACd,MAAM,SAAS,CAAC;AAGjB,OAAO,EAAE,iBAAiB,EAAE,MAAM,+BAA+B,CAAC;AAClE,OAAO,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,2BAA2B,CAAC;AAGjE,OAAO,EAAE,kBAAkB,EAAE,KAAK,cAAc,EAAE,MAAM,8BAA8B,CAAC;AACvF,OAAO,EAAE,mBAAmB,EAAE,KAAK,YAAY,EAAE,MAAM,+BAA+B,CAAC;AACvF,OAAO,EAAE,WAAW,EAAE,KAAK,gBAAgB,EAAE,KAAK,cAAc,EAAE,MAAM,6BAA6B,CAAC;AACtG,OAAO,EAAE,gBAAgB,EAAE,MAAM,kCAAkC,CAAC;AAEpE;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAsB,aAAa,CACjC,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IACP,+CAA+C;IAC/C,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,mDAAmD;IACnD,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAC7B,+CAA+C;IAC/C,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,yEAAyE;IACzE,UAAU,CAAC,EAAE,MAAM,GAAG,KAAK,GAAG,SAAS,GAAG,MAAM,CAAC;IACjD,2DAA2D;IAC3D,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC;IACtB,4DAA4D;IAC5D,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;IACrB,oDAAoD;IACpD,UAAU,CAAC,EAAE,CAAC,SAAS,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;IACxD,oCAAoC;IACpC,MAAM,CAAC,EAAE,WAAW,CAAC;CACjB;;;;;;;;;;;;;;;;;;;;;;;;GA2FP"}
@@ -0,0 +1,184 @@
1
+ "use strict";
2
+ /**
3
+ * @package blog-content-scraper SDK
4
+ *
5
+ * Intelligent web scraper for extracting blog/news content from any website.
6
+ * Supports RSS feeds, sitemaps, and HTML scraping with automatic detection.
7
+ *
8
+ * @example
9
+ * ```typescript
10
+ * import { scrapeWebsite, createRateLimiter } from '@tyroneross/blog-scraper';
11
+ *
12
+ * // Simple usage
13
+ * const result = await scrapeWebsite('https://techcrunch.com');
14
+ * console.log(result.articles);
15
+ *
16
+ * // With options
17
+ * const result = await scrapeWebsite('https://example.com/blog', {
18
+ * maxArticles: 10,
19
+ * extractFullContent: true,
20
+ * qualityThreshold: 0.5
21
+ * });
22
+ * ```
23
+ */
24
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
25
+ if (k2 === undefined) k2 = k;
26
+ var desc = Object.getOwnPropertyDescriptor(m, k);
27
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
28
+ desc = { enumerable: true, get: function() { return m[k]; } };
29
+ }
30
+ Object.defineProperty(o, k2, desc);
31
+ }) : (function(o, m, k, k2) {
32
+ if (k2 === undefined) k2 = k;
33
+ o[k2] = m[k];
34
+ }));
35
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
36
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
37
+ }) : function(o, v) {
38
+ o["default"] = v;
39
+ });
40
+ var __importStar = (this && this.__importStar) || (function () {
41
+ var ownKeys = function(o) {
42
+ ownKeys = Object.getOwnPropertyNames || function (o) {
43
+ var ar = [];
44
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
45
+ return ar;
46
+ };
47
+ return ownKeys(o);
48
+ };
49
+ return function (mod) {
50
+ if (mod && mod.__esModule) return mod;
51
+ var result = {};
52
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
53
+ __setModuleDefault(result, mod);
54
+ return result;
55
+ };
56
+ })();
57
+ Object.defineProperty(exports, "__esModule", { value: true });
58
+ exports.ContentExtractor = exports.HTMLScraper = exports.globalSitemapParser = exports.globalRSSDiscovery = exports.stripHTML = exports.cleanText = exports.convertToMarkdown = exports.circuitBreakers = exports.CircuitBreaker = exports.isNonEnglishLocalePath = exports.DEFAULT_ALLOW_PATHS = exports.DEFAULT_DENY_PATHS = exports.DEFAULT_QUALITY_CONFIG = exports.calculateArticleQualityScore = exports.RATE_LIMITER_PRESETS = exports.globalRateLimiter = exports.createRateLimiter = exports.ScrapingRateLimiter = exports.SourceConfigSchema = exports.CandidateArticleSchema = exports.SourceOrchestrator = exports.globalSourceOrchestrator = void 0;
59
+ exports.scrapeWebsite = scrapeWebsite;
60
+ // Main orchestrator
61
+ var source_orchestrator_1 = require("./source-orchestrator");
62
+ Object.defineProperty(exports, "globalSourceOrchestrator", { enumerable: true, get: function () { return source_orchestrator_1.globalSourceOrchestrator; } });
63
+ Object.defineProperty(exports, "SourceOrchestrator", { enumerable: true, get: function () { return source_orchestrator_1.SourceOrchestrator; } });
64
+ Object.defineProperty(exports, "CandidateArticleSchema", { enumerable: true, get: function () { return source_orchestrator_1.CandidateArticleSchema; } });
65
+ Object.defineProperty(exports, "SourceConfigSchema", { enumerable: true, get: function () { return source_orchestrator_1.SourceConfigSchema; } });
66
+ // Rate limiter with presets
67
+ var scraping_rate_limiter_1 = require("./scraping-rate-limiter");
68
+ Object.defineProperty(exports, "ScrapingRateLimiter", { enumerable: true, get: function () { return scraping_rate_limiter_1.ScrapingRateLimiter; } });
69
+ Object.defineProperty(exports, "createRateLimiter", { enumerable: true, get: function () { return scraping_rate_limiter_1.createRateLimiter; } });
70
+ Object.defineProperty(exports, "globalRateLimiter", { enumerable: true, get: function () { return scraping_rate_limiter_1.globalRateLimiter; } });
71
+ Object.defineProperty(exports, "RATE_LIMITER_PRESETS", { enumerable: true, get: function () { return scraping_rate_limiter_1.RATE_LIMITER_PRESETS; } });
72
+ // Quality scoring
73
+ var quality_scorer_1 = require("./quality-scorer");
74
+ Object.defineProperty(exports, "calculateArticleQualityScore", { enumerable: true, get: function () { return quality_scorer_1.calculateArticleQualityScore; } });
75
+ Object.defineProperty(exports, "DEFAULT_QUALITY_CONFIG", { enumerable: true, get: function () { return quality_scorer_1.DEFAULT_QUALITY_CONFIG; } });
76
+ Object.defineProperty(exports, "DEFAULT_DENY_PATHS", { enumerable: true, get: function () { return quality_scorer_1.DEFAULT_DENY_PATHS; } });
77
+ Object.defineProperty(exports, "DEFAULT_ALLOW_PATHS", { enumerable: true, get: function () { return quality_scorer_1.DEFAULT_ALLOW_PATHS; } });
78
+ Object.defineProperty(exports, "isNonEnglishLocalePath", { enumerable: true, get: function () { return quality_scorer_1.isNonEnglishLocalePath; } });
79
+ // Circuit breaker for resilience
80
+ var circuit_breaker_1 = require("./circuit-breaker");
81
+ Object.defineProperty(exports, "CircuitBreaker", { enumerable: true, get: function () { return circuit_breaker_1.CircuitBreaker; } });
82
+ Object.defineProperty(exports, "circuitBreakers", { enumerable: true, get: function () { return circuit_breaker_1.circuitBreakers; } });
83
+ // Formatters
84
+ var html_to_markdown_1 = require("./formatters/html-to-markdown");
85
+ Object.defineProperty(exports, "convertToMarkdown", { enumerable: true, get: function () { return html_to_markdown_1.convertToMarkdown; } });
86
+ var text_cleaner_1 = require("./formatters/text-cleaner");
87
+ Object.defineProperty(exports, "cleanText", { enumerable: true, get: function () { return text_cleaner_1.cleanText; } });
88
+ Object.defineProperty(exports, "stripHTML", { enumerable: true, get: function () { return text_cleaner_1.stripHTML; } });
89
+ // Re-export scraper components for advanced usage
90
+ var rss_discovery_1 = require("./web-scrapers/rss-discovery");
91
+ Object.defineProperty(exports, "globalRSSDiscovery", { enumerable: true, get: function () { return rss_discovery_1.globalRSSDiscovery; } });
92
+ var sitemap_parser_1 = require("./web-scrapers/sitemap-parser");
93
+ Object.defineProperty(exports, "globalSitemapParser", { enumerable: true, get: function () { return sitemap_parser_1.globalSitemapParser; } });
94
+ var html_scraper_1 = require("./web-scrapers/html-scraper");
95
+ Object.defineProperty(exports, "HTMLScraper", { enumerable: true, get: function () { return html_scraper_1.HTMLScraper; } });
96
+ var content_extractor_1 = require("./web-scrapers/content-extractor");
97
+ Object.defineProperty(exports, "ContentExtractor", { enumerable: true, get: function () { return content_extractor_1.ContentExtractor; } });
98
+ /**
99
+ * Simplified scraping function for common use cases
100
+ *
101
+ * @param url - Website URL to scrape
102
+ * @param options - Scraping options
103
+ * @returns Promise with scraped articles and metadata
104
+ *
105
+ * @example
106
+ * ```typescript
107
+ * const result = await scrapeWebsite('https://techcrunch.com', {
108
+ * maxArticles: 5,
109
+ * extractFullContent: true
110
+ * });
111
+ *
112
+ * for (const article of result.articles) {
113
+ * console.log(article.title, article.url);
114
+ * }
115
+ * ```
116
+ */
117
+ async function scrapeWebsite(url, options = {}) {
118
+ const { globalSourceOrchestrator } = await Promise.resolve().then(() => __importStar(require('./source-orchestrator')));
119
+ const { calculateArticleQualityScore, DEFAULT_DENY_PATHS } = await Promise.resolve().then(() => __importStar(require('./quality-scorer')));
120
+ const { convertToMarkdown } = await Promise.resolve().then(() => __importStar(require('./formatters/html-to-markdown')));
121
+ const { cleanText, stripHTML } = await Promise.resolve().then(() => __importStar(require('./formatters/text-cleaner')));
122
+ const { maxArticles = 10, extractFullContent = true, qualityThreshold = 0.5, sourceType = 'auto', allowPaths = [], denyPaths = DEFAULT_DENY_PATHS, onProgress, signal } = options;
123
+ // Check for cancellation
124
+ if (signal?.aborted) {
125
+ throw new Error('Operation cancelled');
126
+ }
127
+ // Process source
128
+ const result = await globalSourceOrchestrator.processSource(url, {
129
+ sourceType,
130
+ allowPaths,
131
+ denyPaths,
132
+ detectOnly: false
133
+ });
134
+ // Check for cancellation
135
+ if (signal?.aborted) {
136
+ throw new Error('Operation cancelled');
137
+ }
138
+ // Enhance with full content if requested
139
+ let articles = result.articles.slice(0, maxArticles);
140
+ if (extractFullContent && articles.length > 0) {
141
+ articles = await globalSourceOrchestrator.enhanceWithFullContent(articles, maxArticles, { onProgress });
142
+ }
143
+ // Calculate quality scores and format output
144
+ const scoredArticles = articles.map(article => {
145
+ const extracted = {
146
+ title: article.title,
147
+ excerpt: article.excerpt,
148
+ content: article.content,
149
+ textContent: article.content || '',
150
+ publishedTime: article.publishedAt.toISOString()
151
+ };
152
+ const qualityScore = calculateArticleQualityScore(extracted);
153
+ const fullContent = extractFullContent ? article.content : undefined;
154
+ return {
155
+ url: article.url,
156
+ title: article.title,
157
+ publishedDate: article.publishedAt.toISOString(),
158
+ description: article.excerpt,
159
+ fullContent,
160
+ fullContentMarkdown: fullContent ? convertToMarkdown(fullContent) : undefined,
161
+ fullContentText: fullContent ? cleanText(stripHTML(fullContent)) : undefined,
162
+ confidence: article.confidence,
163
+ source: article.source,
164
+ qualityScore,
165
+ metadata: article.metadata
166
+ };
167
+ });
168
+ // Filter by quality threshold
169
+ const filteredArticles = scoredArticles.filter(a => a.qualityScore >= qualityThreshold);
170
+ return {
171
+ url,
172
+ detectedType: result.sourceInfo.detectedType,
173
+ articles: filteredArticles,
174
+ stats: {
175
+ totalDiscovered: result.articles.length,
176
+ afterQualityFilter: filteredArticles.length,
177
+ processingTime: result.processingTime
178
+ },
179
+ discoveredFeeds: result.sourceInfo.discoveredFeeds,
180
+ discoveredSitemaps: result.sourceInfo.discoveredSitemaps,
181
+ errors: result.errors
182
+ };
183
+ }
184
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../lib/index.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAmFH,sCA8GC;AA/LD,oBAAoB;AACpB,6DAK+B;AAJ7B,+HAAA,wBAAwB,OAAA;AACxB,yHAAA,kBAAkB,OAAA;AAClB,6HAAA,sBAAsB,OAAA;AACtB,yHAAA,kBAAkB,OAAA;AASpB,4BAA4B;AAC5B,iEAOiC;AAN/B,4HAAA,mBAAmB,OAAA;AACnB,0HAAA,iBAAiB,OAAA;AACjB,0HAAA,iBAAiB,OAAA;AACjB,6HAAA,oBAAoB,OAAA;AAKtB,kBAAkB;AAClB,mDAM0B;AALxB,8HAAA,4BAA4B,OAAA;AAC5B,wHAAA,sBAAsB,OAAA;AACtB,oHAAA,kBAAkB,OAAA;AAClB,qHAAA,mBAAmB,OAAA;AACnB,wHAAA,sBAAsB,OAAA;AAGxB,iCAAiC;AACjC,qDAG2B;AAFzB,iHAAA,cAAc,OAAA;AACd,kHAAA,eAAe,OAAA;AAgBjB,aAAa;AACb,kEAAkE;AAAzD,qHAAA,iBAAiB,OAAA;AAC1B,0DAAiE;AAAxD,yGAAA,SAAS,OAAA;AAAE,yGAAA,SAAS,OAAA;AAE7B,kDAAkD;AAClD,8DAAuF;AAA9E,mHAAA,kBAAkB,OAAA;AAC3B,gEAAuF;AAA9E,qHAAA,mBAAmB,OAAA;AAC5B,4DAAsG;AAA7F,2GAAA,WAAW,OAAA;AACpB,sEAAoE;AAA3D,qHAAA,gBAAgB,OAAA;AAEzB;;;;;;;;;;;;;;;;;;GAkBG;AACI,KAAK,UAAU,aAAa,CACjC,GAAW,EACX,UAiBI,EAAE;IAEN,MAAM,EAAE,wBAAwB,EAAE,GAAG,wDAAa,uBAAuB,GAAC,CAAC;IAC3E,MAAM,EAAE,4BAA4B,EAAE,kBAAkB,EAAE,GAAG,wDAAa,kBAAkB,GAAC,CAAC;IAC9F,MAAM,EAAE,iBAAiB,EAAE,GAAG,wDAAa,+BAA+B,GAAC,CAAC;IAC5E,MAAM,EAAE,SAAS,EAAE,SAAS,EAAE,GAAG,wDAAa,2BAA2B,GAAC,CAAC;IAE3E,MAAM,EACJ,WAAW,GAAG,EAAE,EAChB,kBAAkB,GAAG,IAAI,EACzB,gBAAgB,GAAG,GAAG,EACtB,UAAU,GAAG,MAAM,EACnB,UAAU,GAAG,EAAE,EACf,SAAS,GAAG,kBAAkB,EAC9B,UAAU,EACV,MAAM,EACP,GAAG,OAAO,CAAC;IAEZ,yBAAyB;IACzB,IAAI,MAAM,EAAE,OAAO,EAAE,CAAC;QACpB,MAAM,IAAI,KAAK,CAAC,qBAAqB,CAAC,CAAC;IACzC,CAAC;IAED,iBAAiB;IACjB,MAAM,MAAM,GAAG,MAAM,wBAAwB,CAAC,aAAa,CAAC,GAAG,EAAE;QAC/D,UAAU;QACV,UAAU;QACV,SAAS;QACT,UAAU,EAAE,KAAK;KAClB,CAAC,CAAC;IAEH,yBAAyB;IACzB,IAAI,MAAM,EAAE,OAAO,EAAE,CAAC;QACpB,MAAM,IAAI,KAAK,CAAC,qBAAqB,CAAC,CAAC;IACzC,CAAC;IAED,yCAAyC;IACzC,IAAI,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,WAAW,CAAC,CAAC;IAErD,IAAI,kBAAkB,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC9C,QAAQ,GAAG,MAAM,wBAAwB,CAAC,sBAAsB,CAC9D,QAAQ,EACR,WAAW,EACX,EAAE,UAAU,EAAE,CACf,CAAC;IACJ,CAAC;IAED
,6CAA6C;IAC7C,MAAM,cAAc,GAAG,QAAQ,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE;QAC5C,MAAM,SAAS,GAAG;YAChB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,WAAW,EAAE,OAAO,CAAC,OAAO,IAAI,EAAE;YAClC,aAAa,EAAE,OAAO,CAAC,WAAW,CAAC,WAAW,EAAE;SACjD,CAAC;QAEF,MAAM,YAAY,GAAG,4BAA4B,CAAC,SAAS,CAAC,CAAC;QAC7D,MAAM,WAAW,GAAG,kBAAkB,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,SAAS,CAAC;QAErE,OAAO;YACL,GAAG,EAAE,OAAO,CAAC,GAAG;YAChB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,aAAa,EAAE,OAAO,CAAC,WAAW,CAAC,WAAW,EAAE;YAChD,WAAW,EAAE,OAAO,CAAC,OAAO;YAC5B,WAAW;YACX,mBAAmB,EAAE,WAAW,CAAC,CAAC,CAAC,iBAAiB,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,SAAS;YAC7E,eAAe,EAAE,WAAW,CAAC,CAAC,CAAC,SAAS,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS;YAC5E,UAAU,EAAE,OAAO,CAAC,UAAU;YAC9B,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,YAAY;YACZ,QAAQ,EAAE,OAAO,CAAC,QAAQ;SAC3B,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,8BAA8B;IAC9B,MAAM,gBAAgB,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,YAAY,IAAI,gBAAgB,CAAC,CAAC;IAExF,OAAO;QACL,GAAG;QACH,YAAY,EAAE,MAAM,CAAC,UAAU,CAAC,YAAY;QAC5C,QAAQ,EAAE,gBAAgB;QAC1B,KAAK,EAAE;YACL,eAAe,EAAE,MAAM,CAAC,QAAQ,CAAC,MAAM;YACvC,kBAAkB,EAAE,gBAAgB,CAAC,MAAM;YAC3C,cAAc,EAAE,MAAM,CAAC,cAAc;SACtC;QACD,eAAe,EAAE,MAAM,CAAC,UAAU,CAAC,eAAe;QAClD,kBAAkB,EAAE,MAAM,CAAC,UAAU,CAAC,kBAAkB;QACxD,MAAM,EAAE,MAAM,CAAC,MAAM;KACtB,CAAC;AACJ,CAAC"}
@@ -0,0 +1,83 @@
1
+ /**
2
+ * @package @tyroneross/scraper-testing
3
+ * Article quality scoring system
4
+ *
5
+ * No LLM required - uses metadata and content signals to determine article quality
6
+ */
7
+ import { ExtractedContent, QualityScoreConfig, ContentValidation } from './types';
8
+ /**
9
+ * Default quality score configuration
10
+ * These weights were optimized through testing with 1,788 real articles
11
+ */
12
+ export declare const DEFAULT_QUALITY_CONFIG: Required<QualityScoreConfig>;
13
+ /**
14
+ * Default patterns to block non-article pages
15
+ * These cover common non-article paths across websites
16
+ */
17
+ export declare const DEFAULT_DENY_PATHS: string[];
18
+ /**
19
+ * Default patterns for content sections (blog, news, articles)
20
+ * Used for allow-listing paths when scraping
21
+ */
22
+ export declare const DEFAULT_ALLOW_PATHS: string[];
23
+ /**
24
+ * Validate content quality (Tier 2 filtering)
25
+ * Checks length, title quality, and text-to-HTML ratio
26
+ *
27
+ * @param extracted - Extracted content from article
28
+ * @returns Validation result with score and reasons
29
+ */
30
+ export declare function validateContent(extracted: ExtractedContent): ContentValidation;
31
+ /**
32
+ * Calculate article quality score (Tier 3 filtering)
33
+ *
34
+ * Score breakdown:
35
+ * - Content validation (60%): Length, title quality, text-to-HTML ratio
36
+ * - Publication date (12%): Articles should have timestamps
37
+ * - Author/byline (8%): Professional articles cite authors
38
+ * - Schema.org metadata (8%): Structured data indicates article pages
39
+ * - Reading time (12%): Substantial content (2+ min read)
40
+ *
41
+ * @param extracted - Extracted content from article
42
+ * @param config - Optional quality score configuration
43
+ * @returns Quality score between 0-1
44
+ */
45
+ export declare function calculateArticleQualityScore(extracted: ExtractedContent, config?: QualityScoreConfig): number;
46
+ /**
47
+ * Check if a path should be filtered out (non-US-English locale)
48
+ *
49
+ * Returns true (should filter) for:
50
+ * - /fr-be/, /de-de/, /ja-jp/, /zh-cn/ (non-English locales)
51
+ * - /en-gb/, /en-au/, /en-ca/ (non-US English locales)
52
+ *
53
+ * Returns false (should keep) for:
54
+ * - /en-us/ (US English only)
55
+ * - /blog/, /news/, /articles/ (no locale prefix - default to US English)
56
+ */
57
+ export declare function isNonEnglishLocalePath(path: string): boolean;
58
+ /**
59
+ * Check if a URL should be denied based on path patterns
60
+ *
61
+ * @param url - URL to check
62
+ * @param denyPaths - Patterns to deny (supports wildcards with *)
63
+ * @returns True if URL should be denied
64
+ */
65
+ export declare function shouldDenyUrl(url: string, denyPaths?: string[]): boolean;
66
+ /**
67
+ * Get quality score breakdown for debugging
68
+ * Useful for understanding why an article scored a certain way
69
+ *
70
+ * @param extracted - Extracted content from article
71
+ * @param config - Optional quality score configuration
72
+ * @returns Breakdown of quality score components
73
+ */
74
+ export declare function getQualityBreakdown(extracted: ExtractedContent, config?: QualityScoreConfig): {
75
+ contentValidation: number;
76
+ publishedDate: number;
77
+ author: number;
78
+ schema: number;
79
+ readingTime: number;
80
+ total: number;
81
+ passesThreshold: boolean;
82
+ };
83
+ //# sourceMappingURL=quality-scorer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"quality-scorer.d.ts","sourceRoot":"","sources":["../../lib/quality-scorer.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,gBAAgB,EAAE,kBAAkB,EAAE,iBAAiB,EAAE,MAAM,SAAS,CAAC;AAElF;;;GAGG;AACH,eAAO,MAAM,sBAAsB,EAAE,QAAQ,CAAC,kBAAkB,CAO/D,CAAC;AAEF;;;GAGG;AACH,eAAO,MAAM,kBAAkB,UA+H9B,CAAC;AAEF;;;GAGG;AACH,eAAO,MAAM,mBAAmB,UAc/B,CAAC;AAEF;;;;;;GAMG;AACH,wBAAgB,eAAe,CAAC,SAAS,EAAE,gBAAgB,GAAG,iBAAiB,CAsC9E;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAgB,4BAA4B,CAC1C,SAAS,EAAE,gBAAgB,EAC3B,MAAM,GAAE,kBAAuB,GAC9B,MAAM,CAiDR;AAYD;;;;;;;;;;GAUG;AACH,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAQ5D;AAED;;;;;;GAMG;AACH,wBAAgB,aAAa,CAAC,GAAG,EAAE,MAAM,EAAE,SAAS,GAAE,MAAM,EAAuB,GAAG,OAAO,CAyB5F;AAED;;;;;;;GAOG;AACH,wBAAgB,mBAAmB,CACjC,SAAS,EAAE,gBAAgB,EAC3B,MAAM,GAAE,kBAAuB,GAC9B;IACD,iBAAiB,EAAE,MAAM,CAAC;IAC1B,aAAa,EAAE,MAAM,CAAC;IACtB,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,WAAW,EAAE,MAAM,CAAC;IACpB,KAAK,EAAE,MAAM,CAAC;IACd,eAAe,EAAE,OAAO,CAAC;CAC1B,CA8CA"}