@memvid/maw 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/README.md +188 -0
  2. package/dist/bin/maw.d.ts +6 -0
  3. package/dist/bin/maw.d.ts.map +1 -0
  4. package/dist/bin/maw.js +275 -0
  5. package/dist/bin/maw.js.map +1 -0
  6. package/dist/src/crawler/index.d.ts +71 -0
  7. package/dist/src/crawler/index.d.ts.map +1 -0
  8. package/dist/src/crawler/index.js +249 -0
  9. package/dist/src/crawler/index.js.map +1 -0
  10. package/dist/src/crawler/robots.d.ts +26 -0
  11. package/dist/src/crawler/robots.d.ts.map +1 -0
  12. package/dist/src/crawler/robots.js +179 -0
  13. package/dist/src/crawler/robots.js.map +1 -0
  14. package/dist/src/crawler/sitemap.d.ts +36 -0
  15. package/dist/src/crawler/sitemap.d.ts.map +1 -0
  16. package/dist/src/crawler/sitemap.js +209 -0
  17. package/dist/src/crawler/sitemap.js.map +1 -0
  18. package/dist/src/engine/detector.d.ts +18 -0
  19. package/dist/src/engine/detector.d.ts.map +1 -0
  20. package/dist/src/engine/detector.js +155 -0
  21. package/dist/src/engine/detector.js.map +1 -0
  22. package/dist/src/engine/fetch.d.ts +18 -0
  23. package/dist/src/engine/fetch.d.ts.map +1 -0
  24. package/dist/src/engine/fetch.js +53 -0
  25. package/dist/src/engine/fetch.js.map +1 -0
  26. package/dist/src/engine/index.d.ts +39 -0
  27. package/dist/src/engine/index.d.ts.map +1 -0
  28. package/dist/src/engine/index.js +116 -0
  29. package/dist/src/engine/index.js.map +1 -0
  30. package/dist/src/engine/playwright.d.ts +23 -0
  31. package/dist/src/engine/playwright.d.ts.map +1 -0
  32. package/dist/src/engine/playwright.js +88 -0
  33. package/dist/src/engine/playwright.js.map +1 -0
  34. package/dist/src/engine/rebrowser.d.ts +22 -0
  35. package/dist/src/engine/rebrowser.d.ts.map +1 -0
  36. package/dist/src/engine/rebrowser.js +142 -0
  37. package/dist/src/engine/rebrowser.js.map +1 -0
  38. package/dist/src/extractor/cleaner.d.ts +13 -0
  39. package/dist/src/extractor/cleaner.d.ts.map +1 -0
  40. package/dist/src/extractor/cleaner.js +122 -0
  41. package/dist/src/extractor/cleaner.js.map +1 -0
  42. package/dist/src/extractor/index.d.ts +29 -0
  43. package/dist/src/extractor/index.d.ts.map +1 -0
  44. package/dist/src/extractor/index.js +162 -0
  45. package/dist/src/extractor/index.js.map +1 -0
  46. package/dist/src/extractor/links.d.ts +22 -0
  47. package/dist/src/extractor/links.d.ts.map +1 -0
  48. package/dist/src/extractor/links.js +92 -0
  49. package/dist/src/extractor/links.js.map +1 -0
  50. package/dist/src/extractor/markdown.d.ts +13 -0
  51. package/dist/src/extractor/markdown.d.ts.map +1 -0
  52. package/dist/src/extractor/markdown.js +94 -0
  53. package/dist/src/extractor/markdown.js.map +1 -0
  54. package/dist/src/git/index.d.ts +40 -0
  55. package/dist/src/git/index.d.ts.map +1 -0
  56. package/dist/src/git/index.js +303 -0
  57. package/dist/src/git/index.js.map +1 -0
  58. package/dist/src/index.d.ts +103 -0
  59. package/dist/src/index.d.ts.map +1 -0
  60. package/dist/src/index.js +229 -0
  61. package/dist/src/index.js.map +1 -0
  62. package/dist/src/ingestor/index.d.ts +95 -0
  63. package/dist/src/ingestor/index.d.ts.map +1 -0
  64. package/dist/src/ingestor/index.js +471 -0
  65. package/dist/src/ingestor/index.js.map +1 -0
  66. package/dist/src/utils/dedup.d.ts +66 -0
  67. package/dist/src/utils/dedup.d.ts.map +1 -0
  68. package/dist/src/utils/dedup.js +296 -0
  69. package/dist/src/utils/dedup.js.map +1 -0
  70. package/dist/src/utils/index.d.ts +3 -0
  71. package/dist/src/utils/index.d.ts.map +1 -0
  72. package/dist/src/utils/index.js +3 -0
  73. package/dist/src/utils/index.js.map +1 -0
  74. package/dist/src/utils/logger.d.ts +12 -0
  75. package/dist/src/utils/logger.d.ts.map +1 -0
  76. package/dist/src/utils/logger.js +49 -0
  77. package/dist/src/utils/logger.js.map +1 -0
  78. package/dist/src/utils/ui.d.ts +126 -0
  79. package/dist/src/utils/ui.d.ts.map +1 -0
  80. package/dist/src/utils/ui.js +357 -0
  81. package/dist/src/utils/ui.js.map +1 -0
  82. package/dist/src/utils/url.d.ts +21 -0
  83. package/dist/src/utils/url.d.ts.map +1 -0
  84. package/dist/src/utils/url.js +107 -0
  85. package/dist/src/utils/url.js.map +1 -0
  86. package/package.json +71 -0
@@ -0,0 +1,209 @@
1
+ /**
2
+ * Sitemap.xml parser using sitemapper
3
+ */
4
+ import * as cheerio from 'cheerio';
5
+ import Sitemapper from 'sitemapper';
6
/**
 * Discovers page URLs for a site by fetching and parsing its XML sitemaps.
 *
 * `parse` and `parseUrl` return plain URL strings (at most 200 per call);
 * `parseWithMetadata` returns `{ loc, lastmod?, changefreq?, priority? }`
 * entries (at most 1000). Network and parse failures never propagate to the
 * caller: every public method falls back to an empty array.
 */
export class SitemapParser {
    /** Results of earlier `parse()` calls, keyed by URL origin. */
    cache = new Map();
    /** sitemapper client used by `parseWithMetadata()` as a fallback. */
    sitemapper;

    constructor() {
        this.sitemapper = new Sitemapper({
            timeout: 10000,
            requestHeaders: {
                'User-Agent': 'maw/1.0 (sitemap crawler)',
            },
        });
    }

    /**
     * Parse the sitemap of the site that `url` belongs to and return its URLs.
     *
     * Common sitemap locations are probed in order; the first one that yields
     * any URL wins. The result (even an empty one) is cached per origin.
     *
     * @param {string} url - Any absolute URL on the target site.
     * @returns {Promise<string[]>} Discovered URLs, or `[]` on any failure.
     */
    async parse(url) {
        try {
            // Key the cache by origin (scheme + host + port): the sitemap URLs
            // below are built from the origin, so two origins that merely share
            // a hostname must not share a cache entry.
            const { origin } = new URL(url);
            if (this.cache.has(origin)) {
                return this.cache.get(origin);
            }
            const urls = [];
            const candidates = [
                `${origin}/sitemap.xml`,
                `${origin}/sitemap_index.xml`,
                `${origin}/sitemap/sitemap.xml`,
            ];
            for (const sitemapUrl of candidates) {
                try {
                    const sitemapContent = await this.fetchSitemap(sitemapUrl);
                    if (!sitemapContent) {
                        continue;
                    }
                    const parsed = await this.parseSitemapContent(sitemapContent, origin);
                    urls.push(...parsed);
                    if (urls.length > 0) {
                        break;
                    }
                }
                catch {
                    // Best effort: a malformed sitemap just means we try the next location.
                }
            }
            this.cache.set(origin, urls);
            return urls;
        }
        catch {
            // Invalid input URL (or an unexpected failure): report "no URLs".
            return [];
        }
    }

    /**
     * Parse the sitemap found at one specific URL.
     *
     * @param {string} sitemapUrl - Absolute URL of a sitemap or sitemap index.
     * @returns {Promise<string[]>} Discovered URLs, or `[]` on any failure.
     */
    async parseUrl(sitemapUrl) {
        try {
            const content = await this.fetchSitemap(sitemapUrl);
            if (!content) {
                return [];
            }
            const { origin } = new URL(sitemapUrl);
            return await this.parseSitemapContent(content, origin);
        }
        catch {
            return [];
        }
    }

    /**
     * Download a sitemap document.
     *
     * @param {string} url - Sitemap URL to fetch.
     * @returns {Promise<string|null>} Response body, or `null` when the request
     *   fails, times out (5 s) or answers with a non-2xx status.
     */
    async fetchSitemap(url) {
        try {
            const response = await fetch(url, {
                headers: {
                    'User-Agent': 'maw/1.0',
                    'Accept': 'application/xml, text/xml, */*',
                },
                signal: AbortSignal.timeout(5000), // 5s timeout - fail fast
            });
            if (!response.ok) {
                return null;
            }
            return await response.text();
        }
        catch {
            return null;
        }
    }

    /**
     * Extract up to 200 URLs from sitemap XML.
     *
     * A `<sitemapindex>` document is expanded by fetching only its first two
     * child sitemaps (speed over completeness); anything else is treated as a
     * regular `<urlset>`.
     *
     * @param {string} content - Raw sitemap XML.
     * @param {string} origin - Origin of the site; currently unused, kept for
     *   signature compatibility.
     * @returns {Promise<string[]>} At most 200 URLs.
     */
    async parseSitemapContent(content, origin) {
        const MAX_URLS = 200; // Don't parse more than we need
        if (!content.includes('<sitemapindex')) {
            return this.parseUrlset(content).slice(0, MAX_URLS);
        }
        const urls = [];
        for (const indexUrl of this.parseSitemapIndex(content).slice(0, 2)) {
            if (urls.length >= MAX_URLS) {
                break;
            }
            try {
                const subContent = await this.fetchSitemap(indexUrl);
                if (subContent) {
                    const subUrls = this.parseUrlset(subContent);
                    urls.push(...subUrls.slice(0, MAX_URLS - urls.length));
                }
            }
            catch {
                // Skip child sitemaps that fail to parse.
            }
        }
        return urls.slice(0, MAX_URLS);
    }

    /**
     * List the child sitemap locations of a sitemap index.
     *
     * @param {string} content - Sitemap index XML.
     * @returns {string[]} Non-empty, trimmed `<sitemap><loc>` values.
     */
    parseSitemapIndex(content) {
        const $ = cheerio.load(content, { xmlMode: true });
        const urls = [];
        $('sitemap loc').each((_, el) => {
            const loc = $(el).text().trim();
            if (loc) {
                urls.push(loc);
            }
        });
        return urls;
    }

    /**
     * List the page locations of a regular sitemap.
     *
     * @param {string} content - `<urlset>` XML.
     * @returns {string[]} Non-empty, trimmed `<url><loc>` values.
     */
    parseUrlset(content) {
        const $ = cheerio.load(content, { xmlMode: true });
        const urls = [];
        $('url loc').each((_, el) => {
            const loc = $(el).text().trim();
            if (loc) {
                urls.push(loc);
            }
        });
        return urls;
    }

    /**
     * Parse the site's sitemaps and return URLs together with their metadata.
     *
     * News sitemaps are tried first (more recent content); if none yields
     * entries, sitemapper is used on `/sitemap.xml` and then on
     * `/sitemap_index.xml`, in which case entries carry only `loc`.
     *
     * @param {string} url - Any absolute URL on the target site.
     * @returns {Promise<Array<{loc: string, lastmod?: string, changefreq?: string, priority?: number}>>}
     *   At most 1000 entries, or `[]` on any failure.
     */
    async parseWithMetadata(url) {
        try {
            const { origin } = new URL(url);
            const newsSitemapUrls = [
                `${origin}/sitemap/news.xml`,
                `${origin}/news-sitemap.xml`,
                `${origin}/sitemap-news.xml`,
                `${origin}/sitemaps/news.xml`,
            ];
            for (const sitemapUrl of newsSitemapUrls) {
                try {
                    const content = await this.fetchSitemap(sitemapUrl);
                    if (content && content.includes('<url')) {
                        const entries = this.parseWithMeta(content);
                        if (entries.length > 0) {
                            return entries.slice(0, 1000);
                        }
                    }
                }
                catch {
                    // Malformed news sitemap: try the next candidate.
                }
            }
            // Fall back to sitemapper for regular sitemaps
            const result = await this.sitemapper.fetch(`${origin}/sitemap.xml`);
            if (result.sites && result.sites.length > 0) {
                return result.sites.slice(0, 1000).map((loc) => ({ loc }));
            }
            // Try sitemap index
            const indexResult = await this.sitemapper.fetch(`${origin}/sitemap_index.xml`);
            if (indexResult.sites && indexResult.sites.length > 0) {
                return indexResult.sites.slice(0, 1000).map((loc) => ({ loc }));
            }
            return [];
        }
        catch {
            return [];
        }
    }

    /**
     * Parse `<url>` entries including their optional metadata.
     *
     * @param {string} content - `<urlset>` XML.
     * @returns {Array<{loc: string, lastmod?: string, changefreq?: string, priority?: number}>}
     *   One entry per `<url>` that has a non-empty `<loc>`; optional fields are
     *   present only when the corresponding element has usable text.
     */
    parseWithMeta(content) {
        const $ = cheerio.load(content, { xmlMode: true });
        const urls = [];
        $('url').each((_, el) => {
            const $el = $(el);
            const loc = $el.find('loc').text().trim();
            if (!loc) {
                return;
            }
            const entry = { loc };
            const lastmod = $el.find('lastmod').text().trim();
            if (lastmod) {
                entry.lastmod = lastmod;
            }
            const changefreq = $el.find('changefreq').text().trim();
            if (changefreq) {
                entry.changefreq = changefreq;
            }
            // Only keep a priority that is a real number; garbage such as
            // "high" would otherwise be stored as NaN.
            const priority = Number.parseFloat($el.find('priority').text().trim());
            if (Number.isFinite(priority)) {
                entry.priority = priority;
            }
            urls.push(entry);
        });
        return urls;
    }
}
209
+ //# sourceMappingURL=sitemap.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"sitemap.js","sourceRoot":"","sources":["../../../src/crawler/sitemap.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AACnC,OAAO,UAAU,MAAM,YAAY,CAAC;AASpC,MAAM,OAAO,aAAa;IAChB,KAAK,GAA0B,IAAI,GAAG,EAAE,CAAC;IACzC,UAAU,CAAa;IAE/B;QACE,IAAI,CAAC,UAAU,GAAG,IAAI,UAAU,CAAC;YAC/B,OAAO,EAAE,KAAK;YACd,cAAc,EAAE;gBACd,YAAY,EAAE,2BAA2B;aAC1C;SACF,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,KAAK,CAAC,GAAW;QACrB,IAAI,CAAC;YACH,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;YAC/B,MAAM,IAAI,GAAG,SAAS,CAAC,QAAQ,CAAC;YAEhC,cAAc;YACd,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;gBACzB,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAE,CAAC;YAC/B,CAAC;YAED,MAAM,IAAI,GAAa,EAAE,CAAC;YAE1B,+BAA+B;YAC/B,MAAM,WAAW,GAAG;gBAClB,GAAG,SAAS,CAAC,MAAM,cAAc;gBACjC,GAAG,SAAS,CAAC,MAAM,oBAAoB;gBACvC,GAAG,SAAS,CAAC,MAAM,sBAAsB;aAC1C,CAAC;YAEF,KAAK,MAAM,UAAU,IAAI,WAAW,EAAE,CAAC;gBACrC,IAAI,CAAC;oBACH,MAAM,cAAc,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,UAAU,CAAC,CAAC;oBAC3D,IAAI,cAAc,EAAE,CAAC;wBACnB,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,mBAAmB,CAAC,cAAc,EAAE,SAAS,CAAC,MAAM,CAAC,CAAC;wBAChF,IAAI,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,CAAC;wBACrB,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC;4BAAE,MAAM;oBAC7B,CAAC;gBACH,CAAC;gBAAC,MAAM,CAAC;oBACP,oBAAoB;gBACtB,CAAC;YACH,CAAC;YAED,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;YAC3B,OAAO,IAAI,CAAC;QACd,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,QAAQ,CAAC,UAAkB;QAC/B,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,UAAU,CAAC,CAAC;YACpD,IAAI,CAAC,OAAO;gBAAE,OAAO,EAAE,CAAC;YAExB,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC;YAC1C,OAAO,IAAI,CAAC,mBAAmB,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;QACnD,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,YAAY,CAAC,GAAW;QACpC,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;gBAChC,OAAO,EAAE;oBACP,YAAY,EAAE,SAAS;oBACvB,QAAQ,EAAE,gCAAgC;iBAC3C;gBACD,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,yBAAyB;aAC7D,CAAC,CAAC;YAEH,IAA
I,CAAC,QAAQ,CAAC,EAAE;gBAAE,OAAO,IAAI,CAAC;YAE9B,OAAO,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QAC/B,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,mBAAmB,CAAC,OAAe,EAAE,MAAc;QAC/D,MAAM,IAAI,GAAa,EAAE,CAAC;QAC1B,MAAM,QAAQ,GAAG,GAAG,CAAC,CAAC,gCAAgC;QAEtD,gCAAgC;QAChC,IAAI,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAC,EAAE,CAAC;YACtC,MAAM,SAAS,GAAG,IAAI,CAAC,iBAAiB,CAAC,OAAO,CAAC,CAAC;YAElD,4DAA4D;YAC5D,KAAK,MAAM,QAAQ,IAAI,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC;gBAC7C,IAAI,IAAI,CAAC,MAAM,IAAI,QAAQ;oBAAE,MAAM,CAAC,aAAa;gBACjD,IAAI,CAAC;oBACH,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,CAAC;oBACrD,IAAI,UAAU,EAAE,CAAC;wBACf,MAAM,OAAO,GAAG,IAAI,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC;wBAC7C,IAAI,CAAC,IAAI,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC;oBACzD,CAAC;gBACH,CAAC;gBAAC,MAAM,CAAC;oBACP,uBAAuB;gBACzB,CAAC;YACH,CAAC;QACH,CAAC;aAAM,CAAC;YACN,sCAAsC;YACtC,MAAM,MAAM,GAAG,IAAI,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;YACzC,IAAI,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC,CAAC;QAC1C,CAAC;QAED,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;IACjC,CAAC;IAEO,iBAAiB,CAAC,OAAe;QACvC,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;QACnD,MAAM,IAAI,GAAa,EAAE,CAAC;QAE1B,CAAC,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;YAC9B,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YAChC,IAAI,GAAG;gBAAE,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC1B,CAAC,CAAC,CAAC;QAEH,OAAO,IAAI,CAAC;IACd,CAAC;IAEO,WAAW,CAAC,OAAe;QACjC,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;QACnD,MAAM,IAAI,GAAa,EAAE,CAAC;QAE1B,CAAC,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;YAC1B,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YAChC,IAAI,GAAG;gBAAE,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC1B,CAAC,CAAC,CAAC;QAEH,OAAO,IAAI,CAAC;IACd,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,iBAAiB,CAAC,GAAW;QACjC,IAAI,CAAC;YACH,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,G
AAG,CAAC,CAAC;YAC/B,MAAM,MAAM,GAAG,SAAS,CAAC,MAAM,CAAC;YAEhC,gDAAgD;YAChD,MAAM,eAAe,GAAG;gBACtB,GAAG,MAAM,mBAAmB;gBAC5B,GAAG,MAAM,mBAAmB;gBAC5B,GAAG,MAAM,mBAAmB;gBAC5B,GAAG,MAAM,oBAAoB;aAC9B,CAAC;YAEF,0BAA0B;YAC1B,KAAK,MAAM,UAAU,IAAI,eAAe,EAAE,CAAC;gBACzC,IAAI,CAAC;oBACH,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,UAAU,CAAC,CAAC;oBACpD,IAAI,OAAO,IAAI,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;wBACxC,MAAM,IAAI,GAAG,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;wBACzC,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;4BACpB,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;wBAC7B,CAAC;oBACH,CAAC;gBACH,CAAC;gBAAC,MAAM,CAAC;oBACP,WAAW;gBACb,CAAC;YACH,CAAC;YAED,+CAA+C;YAC/C,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,GAAG,MAAM,cAAc,CAAC,CAAC;YACpE,IAAI,MAAM,CAAC,KAAK,IAAI,MAAM,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC5C,OAAO,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC;YAC3D,CAAC;YAED,oBAAoB;YACpB,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,GAAG,MAAM,oBAAoB,CAAC,CAAC;YAC/E,IAAI,WAAW,CAAC,KAAK,IAAI,WAAW,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACtD,OAAO,WAAW,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC;YAChE,CAAC;YAED,OAAO,EAAE,CAAC;QACZ,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IAED;;OAEG;IACH,aAAa,CAAC,OAAe;QAC3B,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;QACnD,MAAM,IAAI,GAAiB,EAAE,CAAC;QAE9B,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;YACtB,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;YAClB,MAAM,GAAG,GAAG,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YAE1C,IAAI,GAAG,EAAE,CAAC;gBACR,MAAM,KAAK,GAAe,EAAE,GAAG,EAAE,CAAC;gBAElC,MAAM,OAAO,GAAG,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;gBAClD,IAAI,OAAO;oBAAE,KAAK,CAAC,OAAO,GAAG,OAAO,CAAC;gBAErC,MAAM,UAAU,GAAG,GAAG,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;gBACxD,IAAI
,UAAU;oBAAE,KAAK,CAAC,UAAU,GAAG,UAAU,CAAC;gBAE9C,MAAM,QAAQ,GAAG,GAAG,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;gBACpD,IAAI,QAAQ;oBAAE,KAAK,CAAC,QAAQ,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC;gBAEpD,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACnB,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,OAAO,IAAI,CAAC;IACd,CAAC;CACF"}
@@ -0,0 +1,18 @@
1
+ /**
2
+ * Block detection heuristics for anti-bot systems
3
+ */
4
/**
 * Machine-readable reason attached to a positive block verdict.
 */
export type BlockReason =
    | 'cloudflare'
    | 'datadome'
    | 'captcha'
    | 'rate_limit'
    | 'forbidden'
    | 'empty_body'
    | 'javascript_required'
    | 'access_denied';
/**
 * Verdict produced by {@link isBlocked}.
 */
export interface BlockCheck {
    /** Whether the response looks like an anti-bot block page. */
    blocked: boolean;
    /** Which heuristic fired; set only when `blocked` is true. */
    reason?: BlockReason;
    /** Heuristic confidence; `0` when not blocked. */
    confidence: number;
}
/**
 * Check if the response indicates blocking.
 *
 * @param html - Response body.
 * @param statusCode - HTTP status code of the response.
 * @param url - URL that was requested.
 */
export declare function isBlocked(html: string, statusCode: number, url: string): BlockCheck;
/**
 * Quick pre-check based on known protected domains.
 *
 * @param url - URL about to be fetched.
 * @returns `true` when the host is, or is a subdomain of, a known protected domain.
 */
export declare function needsBrowser(url: string): boolean;
18
+ //# sourceMappingURL=detector.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"detector.d.ts","sourceRoot":"","sources":["../../../src/engine/detector.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,MAAM,MAAM,WAAW,GACnB,YAAY,GACZ,UAAU,GACV,SAAS,GACT,YAAY,GACZ,WAAW,GACX,YAAY,GACZ,qBAAqB,GACrB,eAAe,CAAC;AAEpB,MAAM,WAAW,UAAU;IACzB,OAAO,EAAE,OAAO,CAAC;IACjB,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;CACpB;AA8DD;;GAEG;AACH,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,UAAU,CAuEnF;AAgBD;;GAEG;AACH,wBAAgB,YAAY,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAoBjD"}
@@ -0,0 +1,155 @@
1
+ /**
2
+ * Block detection heuristics for anti-bot systems
3
+ */
4
// Cloudflare challenge patterns
const CLOUDFLARE_PATTERNS = [
    /Checking your browser/i,
    /cf-browser-verification/i,
    /cloudflare/i,
    /_cf_chl_opt/,
    /challenge-platform/i,
    /Just a moment\.\.\./i,
    /ray id:/i,
    /cf-turnstile/i,
];
// DataDome patterns
const DATADOME_PATTERNS = [
    /datadome/i,
    /dd\.js/,
    /captcha-delivery\.com/,
    /geo\.captcha-delivery\.com/,
];
// PerimeterX / HUMAN patterns
const PERIMETERX_PATTERNS = [
    /perimeterx/i,
    /px-captcha/i,
    /_pxhd/,
    /human challenge/i,
];
// Akamai Bot Manager patterns
const AKAMAI_PATTERNS = [
    /akamai/i,
    /ak_bmsc/,
    /_abck/,
];
// Generic bot detection patterns
const BOT_DETECTION_PATTERNS = [
    /access denied/i,
    /please verify you are human/i,
    /enable javascript/i,
    /browser.*not supported/i,
    /automated access/i,
    /bot detected/i,
    /please complete the security check/i,
    /unusual traffic/i,
    /blocked/i,
    /forbidden/i,
    /not allowed/i,
];
// Captcha patterns
const CAPTCHA_PATTERNS = [
    /recaptcha/i,
    /hcaptcha/i,
    /g-recaptcha/,
    /h-captcha/,
    /captcha/i,
    /turnstile/i,
];
/**
 * Count how many of `patterns` occur in `text`.
 * None of the patterns use the `g`/`y` flags, so `test()` is stateless here.
 */
function countMatches(patterns, text) {
    return patterns.filter((pattern) => pattern.test(text)).length;
}
/**
 * Check if the response indicates blocking.
 *
 * Heuristics run in a fixed order and the first one that fires wins:
 * status codes (403, 429, 503 + Cloudflare marker), tiny challenge pages,
 * then vendor-specific markers (Cloudflare, DataDome, PerimeterX, Akamai),
 * captcha markers, generic "access denied" wording and finally
 * "JavaScript required" shells. Most body heuristics only apply to short
 * pages, because real block pages are small while long pages that merely
 * mention these terms are usually legitimate content.
 *
 * @param {string|null|undefined} html - Response body; a missing body is treated as empty.
 * @param {number} statusCode - HTTP status code of the response.
 * @param {string} url - Requested URL (currently unused by the heuristics).
 * @returns {{blocked: boolean, reason?: string, confidence: number}}
 *   `blocked: false` with confidence 0 when nothing matched.
 */
export function isBlocked(html, statusCode, url) {
    // Normalise once: a null/undefined body previously slipped past the
    // `!html` guard and crashed on `html.includes(...)` further down.
    const body = html ?? '';
    // Status code checks
    if (statusCode === 403) {
        return { blocked: true, reason: 'forbidden', confidence: 0.9 };
    }
    if (statusCode === 429) {
        return { blocked: true, reason: 'rate_limit', confidence: 0.95 };
    }
    // 503 could be a Cloudflare challenge or a genuine server error.
    if (statusCode === 503 && countMatches(CLOUDFLARE_PATTERNS, body) > 0) {
        return { blocked: true, reason: 'cloudflare', confidence: 0.95 };
    }
    // Empty or minimal body: some pages are legitimately small, so only flag
    // them when they carry tell-tale challenge wording.
    if (body.length < 500 && (body.includes('challenge') || body.includes('captcha'))) {
        return { blocked: true, reason: 'empty_body', confidence: 0.7 };
    }
    // Cloudflare
    if (countMatches(CLOUDFLARE_PATTERNS, body) >= 2) {
        return { blocked: true, reason: 'cloudflare', confidence: 0.9 };
    }
    // DataDome - only flag if page is SHORT (actual block pages are small)
    if (countMatches(DATADOME_PATTERNS, body) >= 2 && body.length < 50000) {
        return { blocked: true, reason: 'datadome', confidence: 0.9 };
    }
    // PerimeterX - only flag if page is short
    if (countMatches(PERIMETERX_PATTERNS, body) > 0 && body.length < 50000) {
        return { blocked: true, reason: 'access_denied', confidence: 0.85 };
    }
    // Akamai - only flag if page is short
    if (countMatches(AKAMAI_PATTERNS, body) >= 2 && body.length < 50000) {
        return { blocked: true, reason: 'access_denied', confidence: 0.8 };
    }
    // Captcha - long pages with captcha markers are probably just forms,
    // a short page with several of them is likely a challenge page.
    if (countMatches(CAPTCHA_PATTERNS, body) >= 2 && body.length < 10000) {
        return { blocked: true, reason: 'captcha', confidence: 0.85 };
    }
    // Generic bot detection - only if page is very short
    if (countMatches(BOT_DETECTION_PATTERNS, body) > 0 && body.length < 3000) {
        return { blocked: true, reason: 'access_denied', confidence: 0.7 };
    }
    // JavaScript required (no actual content)
    const asksForJavascript = body.includes('enable javascript') || body.includes('JavaScript is required');
    const hasContentContainer = body.includes('<article') || body.includes('<main');
    if (body.includes('noscript') && asksForJavascript && !hasContentContainer && body.length < 5000) {
        return { blocked: true, reason: 'javascript_required', confidence: 0.6 };
    }
    return { blocked: false, confidence: 0 };
}
120
// Known protected domains that typically need browser
const KNOWN_PROTECTED_DOMAINS = new Set([
    'linkedin.com',
    'instagram.com',
    'facebook.com',
    'twitter.com',
    'x.com',
    'tiktok.com',
    'indeed.com',
    'glassdoor.com',
    'zillow.com',
    'yelp.com',
]);
/**
 * Quick pre-check based on known protected domains.
 *
 * @param {string} url - Absolute URL about to be fetched.
 * @returns {boolean} `true` when the URL's host is one of the known protected
 *   domains or any subdomain of one (including `www.`); `false` otherwise and
 *   for input that is not a valid URL.
 */
export function needsBrowser(url) {
    let hostname;
    try {
        hostname = new URL(url).hostname;
    }
    catch {
        // Not a parseable URL: nothing to pre-classify.
        return false;
    }
    // Exact match
    if (KNOWN_PROTECTED_DOMAINS.has(hostname)) {
        return true;
    }
    // Subdomain match. This already covers "www.<domain>", so the hostname is
    // deliberately NOT rewritten first: removing the first "www." anywhere in
    // the host turned e.g. "x.www.com" into "x.com" (false positive) and
    // "wwww.linkedin.com" into "wlinkedin.com" (false negative).
    for (const domain of KNOWN_PROTECTED_DOMAINS) {
        if (hostname.endsWith(`.${domain}`)) {
            return true;
        }
    }
    return false;
}
155
+ //# sourceMappingURL=detector.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"detector.js","sourceRoot":"","sources":["../../../src/engine/detector.ts"],"names":[],"mappings":"AAAA;;GAEG;AAkBH,gCAAgC;AAChC,MAAM,mBAAmB,GAAG;IAC1B,wBAAwB;IACxB,0BAA0B;IAC1B,aAAa;IACb,aAAa;IACb,qBAAqB;IACrB,sBAAsB;IACtB,UAAU;IACV,eAAe;CAChB,CAAC;AAEF,oBAAoB;AACpB,MAAM,iBAAiB,GAAG;IACxB,WAAW;IACX,QAAQ;IACR,uBAAuB;IACvB,4BAA4B;CAC7B,CAAC;AAEF,8BAA8B;AAC9B,MAAM,mBAAmB,GAAG;IAC1B,aAAa;IACb,aAAa;IACb,OAAO;IACP,kBAAkB;CACnB,CAAC;AAEF,8BAA8B;AAC9B,MAAM,eAAe,GAAG;IACtB,SAAS;IACT,SAAS;IACT,OAAO;CACR,CAAC;AAEF,iCAAiC;AACjC,MAAM,sBAAsB,GAAG;IAC7B,gBAAgB;IAChB,8BAA8B;IAC9B,oBAAoB;IACpB,yBAAyB;IACzB,mBAAmB;IACnB,eAAe;IACf,qCAAqC;IACrC,kBAAkB;IAClB,UAAU;IACV,YAAY;IACZ,cAAc;CACf,CAAC;AAEF,mBAAmB;AACnB,MAAM,gBAAgB,GAAG;IACvB,YAAY;IACZ,WAAW;IACX,aAAa;IACb,WAAW;IACX,UAAU;IACV,YAAY;CACb,CAAC;AAEF;;GAEG;AACH,MAAM,UAAU,SAAS,CAAC,IAAY,EAAE,UAAkB,EAAE,GAAW;IACrE,qBAAqB;IACrB,IAAI,UAAU,KAAK,GAAG,EAAE,CAAC;QACvB,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,WAAW,EAAE,UAAU,EAAE,GAAG,EAAE,CAAC;IACjE,CAAC;IAED,IAAI,UAAU,KAAK,GAAG,EAAE,CAAC;QACvB,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,YAAY,EAAE,UAAU,EAAE,IAAI,EAAE,CAAC;IACnE,CAAC;IAED,IAAI,UAAU,KAAK,GAAG,EAAE,CAAC;QACvB,uDAAuD;QACvD,IAAI,mBAAmB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;YAChD,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,YAAY,EAAE,UAAU,EAAE,IAAI,EAAE,CAAC;QACnE,CAAC;IACH,CAAC;IAED,wBAAwB;IACxB,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;QAC/B,+DAA+D;QAC/D,IAAI,IAAI,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,EAAE,CAAC;YACrE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,YAAY,EAAE,UAAU,EAAE,GAAG,EAAE,CAAC;QAClE,CAAC;IACH,CAAC;IAED,aAAa;IACb,MAAM,SAAS,GAAG,mBAAmB,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;IAChE,IAAI,SAAS,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;QAC1B,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,YAAY,EAAE,UAAU,EAAE,GAAG,EAAE,CAAC;IAClE,CAAC;IAED,uEAAuE;IACvE,IAAI,iBAAiB,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAA
C,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,IAAI,IAAI,CAAC,MAAM,GAAG,KAAK,EAAE,CAAC;QACnF,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,UAAU,EAAE,GAAG,EAAE,CAAC;IAChE,CAAC;IAED,0CAA0C;IAC1C,IAAI,mBAAmB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,IAAI,CAAC,MAAM,GAAG,KAAK,EAAE,CAAC;QACvE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,eAAe,EAAE,UAAU,EAAE,IAAI,EAAE,CAAC;IACtE,CAAC;IAED,sCAAsC;IACtC,IAAI,eAAe,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,IAAI,IAAI,CAAC,MAAM,GAAG,KAAK,EAAE,CAAC;QACjF,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,eAAe,EAAE,UAAU,EAAE,GAAG,EAAE,CAAC;IACrE,CAAC;IAED,oEAAoE;IACpE,mEAAmE;IACnE,MAAM,cAAc,GAAG,gBAAgB,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC;IACzE,IAAI,cAAc,IAAI,CAAC,IAAI,IAAI,CAAC,MAAM,GAAG,KAAK,EAAE,CAAC;QAC/C,yEAAyE;QACzE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,UAAU,EAAE,IAAI,EAAE,CAAC;IAChE,CAAC;IAED,qDAAqD;IACrD,IAAI,sBAAsB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,IAAI,CAAC,MAAM,GAAG,IAAI,EAAE,CAAC;QACzE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,eAAe,EAAE,UAAU,EAAE,GAAG,EAAE,CAAC;IACrE,CAAC;IAED,0CAA0C;IAC1C,IACE,IAAI,CAAC,QAAQ,CAAC,UAAU,CAAC;QACzB,CAAC,IAAI,CAAC,QAAQ,CAAC,mBAAmB,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,wBAAwB,CAAC,CAAC;QAC/E,CAAC,IAAI,CAAC,QAAQ,CAAC,UAAU,CAAC;QAC1B,CAAC,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC;QACvB,IAAI,CAAC,MAAM,GAAG,IAAI,EAClB,CAAC;QACD,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,qBAAqB,EAAE,UAAU,EAAE,GAAG,EAAE,CAAC;IAC3E,CAAC;IAED,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,UAAU,EAAE,CAAC,EAAE,CAAC;AAC3C,CAAC;AAED,sDAAsD;AACtD,MAAM,uBAAuB,GAAG,IAAI,GAAG,CAAC;IACtC,cAAc;IACd,eAAe;IACf,cAAc;IACd,aAAa;IACb,OAAO;IACP,YAAY;IACZ,YAAY;IACZ,eAAe;IACf,YAAY;IACZ,UAAU;CACX,CAAC,CAAC;AAEH;;GAEG;AACH,MAAM,UAAU,YAAY,CAAC,GAAW;IACtC,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;QAE3D,oBAAoB;QACpB,IAAI,uBAAuB,CAAC,GAAG,CAAC,QAA
Q,CAAC,EAAE,CAAC;YAC1C,OAAO,IAAI,CAAC;QACd,CAAC;QAED,yCAAyC;QACzC,KAAK,MAAM,MAAM,IAAI,uBAAuB,EAAE,CAAC;YAC7C,IAAI,QAAQ,CAAC,QAAQ,CAAC,IAAI,MAAM,EAAE,CAAC,EAAE,CAAC;gBACpC,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC"}
@@ -0,0 +1,18 @@
1
+ /**
2
+ * Fast HTTP fetch engine - default for most sites
3
+ */
4
/**
 * Per-request settings accepted by {@link FetchEngine.fetch}.
 */
export interface FetchOptions {
    /** Abort the request after this many milliseconds (5000 when omitted). */
    timeout?: number;
    /** Value for the `User-Agent` header; a desktop Chrome string is used when omitted. */
    userAgent?: string;
    /** Extra request headers, merged over the engine's built-in browser-like defaults. */
    headers?: Record<string, string>;
}
/**
 * What {@link FetchEngine.fetch} resolves with.
 */
export interface FetchResult {
    /** Response body read as text. */
    html: string;
    /** HTTP status code of the final response. */
    statusCode: number;
    /** Response headers as a plain object. */
    headers: Record<string, string>;
    /** URL of the final response after redirects were followed. */
    finalUrl: string;
}
/**
 * Fast HTTP fetch engine - default for most sites.
 */
export declare class FetchEngine {
    /**
     * Fetch `url` with browser-like headers.
     *
     * Rejects with `Timeout after <n>ms` when the timeout elapses; other
     * network errors are rethrown unchanged.
     */
    fetch(url: string, options?: FetchOptions): Promise<FetchResult>;
}
18
+ //# sourceMappingURL=fetch.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fetch.d.ts","sourceRoot":"","sources":["../../../src/engine/fetch.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,MAAM,WAAW,YAAY;IAC3B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CAClC;AAED,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAChC,QAAQ,EAAE,MAAM,CAAC;CAClB;AAqBD,qBAAa,WAAW;IAChB,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,YAAiB,GAAG,OAAO,CAAC,WAAW,CAAC;CAoC3E"}
@@ -0,0 +1,53 @@
1
+ /**
2
+ * Fast HTTP fetch engine - default for most sites
3
+ */
4
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
// Browser-like request headers sent with every request unless overridden.
const DEFAULT_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Cache-Control': 'no-cache',
    'Pragma': 'no-cache',
    'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"macOS"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
};
/**
 * Fast HTTP fetch engine - default for most sites.
 */
export class FetchEngine {
    /**
     * Fetch a page over plain HTTP(S) with browser-like headers.
     *
     * @param {string} url - URL to request; redirects are followed.
     * @param {{timeout?: number, userAgent?: string, headers?: Record<string, string>}} [options]
     *   `timeout` in milliseconds (default 5000), `userAgent` override, and
     *   extra `headers` merged over the defaults.
     * @returns {Promise<{html: string, statusCode: number, headers: Record<string, string>, finalUrl: string}>}
     *   Body text, status, response headers and the post-redirect URL.
     * @throws {Error} `Timeout after <n>ms` when the timeout elapses before the
     *   body has been fully read; any other fetch error is rethrown unchanged.
     */
    async fetch(url, options = {}) {
        const controller = new AbortController();
        const timeout = options.timeout || 5000;
        const timeoutId = setTimeout(() => controller.abort(), timeout);
        try {
            const response = await fetch(url, {
                headers: {
                    'User-Agent': options.userAgent || DEFAULT_USER_AGENT,
                    ...DEFAULT_HEADERS,
                    ...options.headers,
                },
                redirect: 'follow',
                signal: controller.signal,
            });
            // The timer must stay armed while the body streams in: clearing it
            // right after the headers arrive would let a stalled download hang
            // forever.
            const html = await response.text();
            return {
                html,
                statusCode: response.status,
                headers: Object.fromEntries(response.headers.entries()),
                finalUrl: response.url,
            };
        }
        catch (error) {
            if (error?.name === 'AbortError') {
                throw new Error(`Timeout after ${timeout}ms`, { cause: error });
            }
            throw error;
        }
        finally {
            clearTimeout(timeoutId);
        }
    }
}
53
+ //# sourceMappingURL=fetch.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fetch.js","sourceRoot":"","sources":["../../../src/engine/fetch.ts"],"names":[],"mappings":"AAAA;;GAEG;AAeH,MAAM,kBAAkB,GACtB,uHAAuH,CAAC;AAE1H,MAAM,eAAe,GAAG;IACtB,QAAQ,EAAE,kGAAkG;IAC5G,iBAAiB,EAAE,gBAAgB;IACnC,iBAAiB,EAAE,mBAAmB;IACtC,eAAe,EAAE,UAAU;IAC3B,QAAQ,EAAE,UAAU;IACpB,WAAW,EAAE,kEAAkE;IAC/E,kBAAkB,EAAE,IAAI;IACxB,oBAAoB,EAAE,SAAS;IAC/B,gBAAgB,EAAE,UAAU;IAC5B,gBAAgB,EAAE,UAAU;IAC5B,gBAAgB,EAAE,MAAM;IACxB,gBAAgB,EAAE,IAAI;IACtB,2BAA2B,EAAE,GAAG;CACjC,CAAC;AAEF,MAAM,OAAO,WAAW;IACtB,KAAK,CAAC,KAAK,CAAC,GAAW,EAAE,UAAwB,EAAE;QACjD,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;QACzC,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,IAAI,CAAC;QACxC,MAAM,SAAS,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,OAAO,CAAC,CAAC;QAEhE,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;gBAChC,OAAO,EAAE;oBACP,YAAY,EAAE,OAAO,CAAC,SAAS,IAAI,kBAAkB;oBACrD,GAAG,eAAe;oBAClB,GAAG,OAAO,CAAC,OAAO;iBACnB;gBACD,QAAQ,EAAE,QAAQ;gBAClB,MAAM,EAAE,UAAU,CAAC,MAAM;aAC1B,CAAC,CAAC;YAEH,YAAY,CAAC,SAAS,CAAC,CAAC;YAExB,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YAEnC,OAAO;gBACL,IAAI;gBACJ,UAAU,EAAE,QAAQ,CAAC,MAAM;gBAC3B,OAAO,EAAE,MAAM,CAAC,WAAW,CAAC,QAAQ,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;gBACvD,QAAQ,EAAE,QAAQ,CAAC,GAAG;aACvB,CAAC;QACJ,CAAC;QAAC,OAAO,KAAU,EAAE,CAAC;YACpB,YAAY,CAAC,SAAS,CAAC,CAAC;YAExB,IAAI,KAAK,CAAC,IAAI,KAAK,YAAY,EAAE,CAAC;gBAChC,MAAM,IAAI,KAAK,CAAC,iBAAiB,OAAO,IAAI,CAAC,CAAC;YAChD,CAAC;YAED,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;CACF"}
@@ -0,0 +1,39 @@
1
+ /**
2
+ * Engine Waterfall - orchestrates fetch engines
3
+ * Tries fast fetch first, escalates to browser only when blocked
4
+ */
5
+ import { type BlockReason } from './detector.js';
6
+ export interface EngineResult { // Outcome of fetching one URL with one engine
7
+ html: string; // Body text the engine returned
8
+ statusCode: number; // HTTP status code reported by the engine
9
+ engine: 'fetch' | 'playwright' | 'rebrowser'; // Name of the engine that produced this result
10
+ blocked: boolean; // Whether the block detector flagged this response
11
+ blockReason?: BlockReason; // Detector's reason; presumably set only when blocked - TODO confirm in detector
12
+ finalUrl: string; // URL the engine reported for the completed request
13
+ }
14
+ export interface EngineOptions { // Per-request options accepted by EngineWaterfall.fetch
15
+ timeout?: number; // When truthy, replaces each engine's default timeout (milliseconds in the fetch engine)
16
+ userAgent?: string; // Forwarded to the engine; presumably replaces its default User-Agent - verify in the engine
17
+ headers?: Record<string, string>; // Forwarded to the engine; presumably extra request headers - verify in the engine
18
+ forceEngine?: 'fetch' | 'playwright' | 'rebrowser'; // Use only this engine and skip the waterfall
19
+ }
20
+ export interface EngineStats { // Counters returned by EngineWaterfall.getStats
21
+ fetch: number; // Waterfall requests answered unblocked by the fetch engine
22
+ playwright: number; // Waterfall requests answered unblocked by the playwright engine
23
+ rebrowser: number; // Waterfall requests answered unblocked by the rebrowser engine
24
+ blocked: number; // Waterfall requests for which every engine failed or was blocked
25
+ }
26
+ export declare class EngineWaterfall { // Tries engines in priority order until one returns an unblocked response
27
+ private engines; // Engine instances created so far, keyed by engine name
28
+ private stats; // Running counters exposed through getStats()
29
+ private browserInstallPromise; // Shared result of the one-time browser install attempt
30
+ fetch(url: string, options?: EngineOptions): Promise<EngineResult>; // Rejects when every engine fails or is blocked
31
+ private fetchWithEngine; // Fetches with one named engine and runs block detection on the result
32
+ private ensureBrowserInstalled; // Resolves true when a browser is available, installing it if needed
33
+ private installBrowser; // Runs the browser install command; resolves false on failure
34
+ getStats(): EngineStats; // Returns a copy of the counters
35
+ close(): Promise<void>; // Closes every created engine that has a close() method and forgets them
36
+ }
37
+ export { isBlocked, needsBrowser } from './detector.js';
38
+ export type { BlockReason, BlockCheck } from './detector.js';
39
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/engine/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAKH,OAAO,EAA2B,KAAK,WAAW,EAAE,MAAM,eAAe,CAAC;AAG1E,MAAM,WAAW,YAAY;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,OAAO,GAAG,YAAY,GAAG,WAAW,CAAC;IAC7C,OAAO,EAAE,OAAO,CAAC;IACjB,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,aAAa;IAC5B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,WAAW,CAAC,EAAE,OAAO,GAAG,YAAY,GAAG,WAAW,CAAC;CACpD;AAED,MAAM,WAAW,WAAW;IAC1B,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,EAAE,MAAM,CAAC;IACnB,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;CACjB;AAmBD,qBAAa,eAAe;IAC1B,OAAO,CAAC,OAAO,CAA4E;IAC3F,OAAO,CAAC,KAAK,CAAsE;IACnF,OAAO,CAAC,qBAAqB,CAAiC;IAExD,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,aAAkB,GAAG,OAAO,CAAC,YAAY,CAAC;YAmC9D,eAAe;YAqCf,sBAAsB;YAatB,cAAc;IAc5B,QAAQ,IAAI,WAAW;IAIjB,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;CAQ7B;AAED,OAAO,EAAE,SAAS,EAAE,YAAY,EAAE,MAAM,eAAe,CAAC;AACxD,YAAY,EAAE,WAAW,EAAE,UAAU,EAAE,MAAM,eAAe,CAAC"}
@@ -0,0 +1,116 @@
1
+ /**
2
+ * Engine Waterfall - orchestrates fetch engines
3
+ * Tries fast fetch first, escalates to browser only when blocked
4
+ */
5
+ import { FetchEngine } from './fetch.js';
6
+ import { PlaywrightEngine, isPlaywrightInstalled } from './playwright.js';
7
+ import { RebrowserEngine } from './rebrowser.js';
8
+ import { isBlocked, needsBrowser } from './detector.js';
9
+ import { createLogger } from '../utils/logger.js';
10
+ const log = createLogger();
11
+ // Engine priority order: tried front to back; `timeout` is each engine's default and `create` builds the instance on first use
12
+ const ENGINES = [
13
+ { name: 'fetch', timeout: 5000, create: () => new FetchEngine() },
14
+ { name: 'playwright', timeout: 15000, create: () => new PlaywrightEngine() },
15
+ { name: 'rebrowser', timeout: 30000, create: () => new RebrowserEngine() },
16
+ ];
17
/**
 * Orchestrates the fetch engines listed in ENGINES.
 *
 * A request is tried with each engine in priority order until one returns a
 * response that the block detector does not flag. Engine instances are created
 * on first use and reused until close() is called.
 */
export class EngineWaterfall {
    /** Engine instances created so far, keyed by engine name. */
    engines = new Map();
    /** Per-engine success counters plus the number of requests no engine could serve. */
    stats = { fetch: 0, playwright: 0, rebrowser: 0, blocked: 0 };
    /** Promise of the single browser install attempt; null until one is started. */
    browserInstallPromise = null;

    /**
     * Fetch a URL, escalating through the engines until one is not blocked.
     *
     * @param {string} url - Address to fetch.
     * @param {object} [options] - Request options; `forceEngine` bypasses the
     *   waterfall and `timeout` (when truthy) replaces each engine's default.
     * @returns {Promise<object>} Result with html, statusCode, engine, blocked,
     *   blockReason and finalUrl.
     * @throws {Error} "All engines failed for <url>" when no engine produced an
     *   unblocked response; `cause` holds the last engine error, if any.
     */
    async fetch(url, options = {}) {
        // Force specific engine if requested
        if (options.forceEngine) {
            return this.fetchWithEngine(url, options.forceEngine, options);
        }
        // Skip fetch for known protected sites
        const startIndex = needsBrowser(url) ? 1 : 0;
        const enginesToTry = ENGINES.slice(startIndex);
        // Remember the most recent failure so the final error can explain itself.
        let lastError;
        // Try engines in order until success
        for (const engineConfig of enginesToTry) {
            try {
                const result = await this.fetchWithEngine(url, engineConfig.name, {
                    ...options,
                    timeout: options.timeout || engineConfig.timeout,
                });
                // Check if response looks blocked
                if (!result.blocked) {
                    this.stats[engineConfig.name]++;
                    return result;
                }
                log.dim(` ${engineConfig.name} blocked: ${result.blockReason}`);
            }
            catch (error) {
                lastError = error;
                // Guard the property access: anything can be thrown, including null/undefined.
                log.dim(` ${engineConfig.name} failed: ${error?.message ?? error}`);
            }
        }
        // All engines failed
        this.stats.blocked++;
        throw new Error(`All engines failed for ${url}`, lastError === undefined ? undefined : { cause: lastError });
    }

    /**
     * Fetch a URL with one named engine and run block detection on the result.
     *
     * @param {string} url - Address to fetch.
     * @param {string} engineName - 'fetch', 'playwright' or 'rebrowser'.
     * @param {object} options - Options forwarded unchanged to the engine.
     * @returns {Promise<object>} Engine result annotated with the block check.
     * @throws {Error} When a browser engine is requested but no browser is
     *   available, or when engineName is not listed in ENGINES.
     */
    async fetchWithEngine(url, engineName, options) {
        // Lazy-load browser engines
        if (engineName === 'playwright' || engineName === 'rebrowser') {
            if (!await this.ensureBrowserInstalled()) {
                throw new Error('Browser not available');
            }
        }
        // Get or create engine instance
        let engine = this.engines.get(engineName);
        if (!engine) {
            const config = ENGINES.find(e => e.name === engineName);
            if (!config)
                throw new Error(`Unknown engine: ${engineName}`);
            engine = config.create();
            this.engines.set(engineName, engine);
        }
        // Fetch
        const result = await engine.fetch(url, options);
        // Detect blocking
        const blockCheck = isBlocked(result.html, result.statusCode, url);
        return {
            html: result.html,
            statusCode: result.statusCode,
            engine: engineName,
            blocked: blockCheck.blocked,
            blockReason: blockCheck.reason,
            finalUrl: result.finalUrl,
        };
    }

    /**
     * Make sure a browser is available for the browser-based engines.
     *
     * @returns {Promise<boolean>} true when already installed or the install
     *   attempt succeeded, false when the install attempt failed.
     */
    async ensureBrowserInstalled() {
        if (await isPlaywrightInstalled()) {
            return true;
        }
        // Only show install prompt once: every caller shares the same attempt.
        if (!this.browserInstallPromise) {
            this.browserInstallPromise = this.installBrowser();
        }
        return this.browserInstallPromise;
    }

    /**
     * Install Chromium through the Playwright CLI.
     *
     * @returns {Promise<boolean>} true on success, false when the command failed.
     */
    async installBrowser() {
        log.info('\n Protected site detected. Installing browser...');
        try {
            const { execSync } = await import('child_process');
            // Synchronous on purpose so the installer output is shown inline.
            execSync('npx playwright install chromium', { stdio: 'inherit' });
            log.success(' Browser installed successfully.\n');
            return true;
        }
        catch (error) {
            log.error(' Failed to install browser. Some sites may not work.');
            // Surface the reason instead of discarding it.
            log.dim(` ${error?.message ?? error}`);
            return false;
        }
    }

    /**
     * @returns {object} A copy of the counters; mutating it does not affect the instance.
     */
    getStats() {
        return { ...this.stats };
    }

    /**
     * Close every engine that exposes a close() method and forget all instances.
     *
     * Every engine is attempted even if an earlier one fails, and the instance
     * map is always cleared, so one failing engine cannot leak the others.
     *
     * @returns {Promise<void>}
     * @throws The engine's own error when exactly one close() failed, or an
     *   AggregateError when several did.
     */
    async close() {
        const failures = [];
        for (const engine of this.engines.values()) {
            if ('close' in engine && typeof engine.close === 'function') {
                try {
                    await engine.close();
                }
                catch (error) {
                    failures.push(error);
                }
            }
        }
        this.engines.clear();
        if (failures.length === 1) {
            throw failures[0];
        }
        if (failures.length > 1) {
            throw new AggregateError(failures, 'Failed to close one or more engines');
        }
    }
}
115
+ export { isBlocked, needsBrowser } from './detector.js';
116
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/engine/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AACzC,OAAO,EAAE,gBAAgB,EAAE,qBAAqB,EAAE,MAAM,iBAAiB,CAAC;AAC1E,OAAO,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,EAAE,SAAS,EAAE,YAAY,EAAoB,MAAM,eAAe,CAAC;AAC1E,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAyBlD,MAAM,GAAG,GAAG,YAAY,EAAE,CAAC;AAU3B,wBAAwB;AACxB,MAAM,OAAO,GAAmB;IAC9B,EAAE,IAAI,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,CAAC,IAAI,WAAW,EAAE,EAAE;IACjE,EAAE,IAAI,EAAE,YAAY,EAAE,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,EAAE,CAAC,IAAI,gBAAgB,EAAE,EAAE;IAC5E,EAAE,IAAI,EAAE,WAAW,EAAE,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,EAAE,CAAC,IAAI,eAAe,EAAE,EAAE;CAC3E,CAAC;AAEF,MAAM,OAAO,eAAe;IAClB,OAAO,GAAkE,IAAI,GAAG,EAAE,CAAC;IACnF,KAAK,GAAgB,EAAE,KAAK,EAAE,CAAC,EAAE,UAAU,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC;IAC3E,qBAAqB,GAA4B,IAAI,CAAC;IAE9D,KAAK,CAAC,KAAK,CAAC,GAAW,EAAE,UAAyB,EAAE;QAClD,qCAAqC;QACrC,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC;YACxB,OAAO,IAAI,CAAC,eAAe,CAAC,GAAG,EAAE,OAAO,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;QACjE,CAAC;QAED,uCAAuC;QACvC,MAAM,UAAU,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC7C,MAAM,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC;QAE/C,qCAAqC;QACrC,KAAK,MAAM,YAAY,IAAI,YAAY,EAAE,CAAC;YACxC,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,GAAG,EAAE,YAAY,CAAC,IAAI,EAAE;oBAChE,GAAG,OAAO;oBACV,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,YAAY,CAAC,OAAO;iBACjD,CAAC,CAAC;gBAEH,kCAAkC;gBAClC,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;oBACpB,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,IAAI,CAAC,EAAE,CAAC;oBAChC,OAAO,MAAM,CAAC;gBAChB,CAAC;gBAED,GAAG,CAAC,GAAG,CAAC,KAAK,YAAY,CAAC,IAAI,aAAa,MAAM,CAAC,WAAW,EAAE,CAAC,CAAC;YACnE,CAAC;YAAC,OAAO,KAAU,EAAE,CAAC;gBACpB,GAAG,CAAC,GAAG,CAAC,KAAK,YAAY,CAAC,IAAI,YAAY,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;YAC7D,CAAC;QACH,CAAC;QAED,qBAAqB;QACrB,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;QACrB,MAAM,IAAI,KAAK,CAAC,0BAA0B,GAAG,EAAE,CAAC,CAAC;IACnD,CAAC;IAEO,KAAK,CAAC,eAAe,C
AC3B,GAAW,EACX,UAAsB,EACtB,OAAsB;QAEtB,4BAA4B;QAC5B,IAAI,UAAU,KAAK,YAAY,IAAI,UAAU,KAAK,WAAW,EAAE,CAAC;YAC9D,IAAI,CAAC,MAAM,IAAI,CAAC,sBAAsB,EAAE,EAAE,CAAC;gBACzC,MAAM,IAAI,KAAK,CAAC,uBAAuB,CAAC,CAAC;YAC3C,CAAC;QACH,CAAC;QAED,gCAAgC;QAChC,IAAI,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;QAC1C,IAAI,CAAC,MAAM,EAAE,CAAC;YACZ,MAAM,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,UAAU,CAAC,CAAC;YACxD,IAAI,CAAC,MAAM;gBAAE,MAAM,IAAI,KAAK,CAAC,mBAAmB,UAAU,EAAE,CAAC,CAAC;YAC9D,MAAM,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC;YACzB,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,EAAE,MAAM,CAAC,CAAC;QACvC,CAAC;QAED,QAAQ;QACR,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,KAAK,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;QAEhD,kBAAkB;QAClB,MAAM,UAAU,GAAG,SAAS,CAAC,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,UAAU,EAAE,GAAG,CAAC,CAAC;QAElE,OAAO;YACL,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,UAAU,EAAE,MAAM,CAAC,UAAU;YAC7B,MAAM,EAAE,UAAU;YAClB,OAAO,EAAE,UAAU,CAAC,OAAO;YAC3B,WAAW,EAAE,UAAU,CAAC,MAAM;YAC9B,QAAQ,EAAE,MAAM,CAAC,QAAQ;SAC1B,CAAC;IACJ,CAAC;IAEO,KAAK,CAAC,sBAAsB;QAClC,IAAI,MAAM,qBAAqB,EAAE,EAAE,CAAC;YAClC,OAAO,IAAI,CAAC;QACd,CAAC;QAED,gCAAgC;QAChC,IAAI,CAAC,IAAI,CAAC,qBAAqB,EAAE,CAAC;YAChC,IAAI,CAAC,qBAAqB,GAAG,IAAI,CAAC,cAAc,EAAE,CAAC;QACrD,CAAC;QAED,OAAO,IAAI,CAAC,qBAAqB,CAAC;IACpC,CAAC;IAEO,KAAK,CAAC,cAAc;QAC1B,GAAG,CAAC,IAAI,CAAC,oDAAoD,CAAC,CAAC;QAE/D,IAAI,CAAC;YACH,MAAM,EAAE,QAAQ,EAAE,GAAG,MAAM,MAAM,CAAC,eAAe,CAAC,CAAC;YACnD,QAAQ,CAAC,iCAAiC,EAAE,EAAE,KAAK,EAAE,SAAS,EAAE,CAAC,CAAC;YAClE,GAAG,CAAC,OAAO,CAAC,qCAAqC,CAAC,CAAC;YACnD,OAAO,IAAI,CAAC;QACd,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,GAAG,CAAC,KAAK,CAAC,uDAAuD,CAAC,CAAC;YACnE,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAED,QAAQ;QACN,OAAO,EAAE,GAAG,IAAI,CAAC,KAAK,EAAE,CAAC;IAC3B,CAAC;IAED,KAAK,CAAC,KAAK;QACT,KAAK,MAAM,MAAM,IAAI,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;YAC3C,IAAI,OAAO,IAAI,MAAM,IAAI,OAAO,MAAM,CAAC,KAAK,KAAK,UAAU,EAAE,CAAC;gBAC5D,MAAM,MAAM,CAAC,KAAK,EAAE,CAAC;YACvB,CAAC;QACH,CAAC;QACD,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;IACvB,CAAC;CACF;AAED,OAAO,EAAE,SAAS,EAAE,YAAY,
EAAE,MAAM,eAAe,CAAC"}