@memvid/maw 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/README.md +188 -0
  2. package/dist/bin/maw.d.ts +6 -0
  3. package/dist/bin/maw.d.ts.map +1 -0
  4. package/dist/bin/maw.js +275 -0
  5. package/dist/bin/maw.js.map +1 -0
  6. package/dist/src/crawler/index.d.ts +71 -0
  7. package/dist/src/crawler/index.d.ts.map +1 -0
  8. package/dist/src/crawler/index.js +249 -0
  9. package/dist/src/crawler/index.js.map +1 -0
  10. package/dist/src/crawler/robots.d.ts +26 -0
  11. package/dist/src/crawler/robots.d.ts.map +1 -0
  12. package/dist/src/crawler/robots.js +179 -0
  13. package/dist/src/crawler/robots.js.map +1 -0
  14. package/dist/src/crawler/sitemap.d.ts +36 -0
  15. package/dist/src/crawler/sitemap.d.ts.map +1 -0
  16. package/dist/src/crawler/sitemap.js +209 -0
  17. package/dist/src/crawler/sitemap.js.map +1 -0
  18. package/dist/src/engine/detector.d.ts +18 -0
  19. package/dist/src/engine/detector.d.ts.map +1 -0
  20. package/dist/src/engine/detector.js +155 -0
  21. package/dist/src/engine/detector.js.map +1 -0
  22. package/dist/src/engine/fetch.d.ts +18 -0
  23. package/dist/src/engine/fetch.d.ts.map +1 -0
  24. package/dist/src/engine/fetch.js +53 -0
  25. package/dist/src/engine/fetch.js.map +1 -0
  26. package/dist/src/engine/index.d.ts +39 -0
  27. package/dist/src/engine/index.d.ts.map +1 -0
  28. package/dist/src/engine/index.js +116 -0
  29. package/dist/src/engine/index.js.map +1 -0
  30. package/dist/src/engine/playwright.d.ts +23 -0
  31. package/dist/src/engine/playwright.d.ts.map +1 -0
  32. package/dist/src/engine/playwright.js +88 -0
  33. package/dist/src/engine/playwright.js.map +1 -0
  34. package/dist/src/engine/rebrowser.d.ts +22 -0
  35. package/dist/src/engine/rebrowser.d.ts.map +1 -0
  36. package/dist/src/engine/rebrowser.js +142 -0
  37. package/dist/src/engine/rebrowser.js.map +1 -0
  38. package/dist/src/extractor/cleaner.d.ts +13 -0
  39. package/dist/src/extractor/cleaner.d.ts.map +1 -0
  40. package/dist/src/extractor/cleaner.js +122 -0
  41. package/dist/src/extractor/cleaner.js.map +1 -0
  42. package/dist/src/extractor/index.d.ts +29 -0
  43. package/dist/src/extractor/index.d.ts.map +1 -0
  44. package/dist/src/extractor/index.js +162 -0
  45. package/dist/src/extractor/index.js.map +1 -0
  46. package/dist/src/extractor/links.d.ts +22 -0
  47. package/dist/src/extractor/links.d.ts.map +1 -0
  48. package/dist/src/extractor/links.js +92 -0
  49. package/dist/src/extractor/links.js.map +1 -0
  50. package/dist/src/extractor/markdown.d.ts +13 -0
  51. package/dist/src/extractor/markdown.d.ts.map +1 -0
  52. package/dist/src/extractor/markdown.js +94 -0
  53. package/dist/src/extractor/markdown.js.map +1 -0
  54. package/dist/src/git/index.d.ts +40 -0
  55. package/dist/src/git/index.d.ts.map +1 -0
  56. package/dist/src/git/index.js +303 -0
  57. package/dist/src/git/index.js.map +1 -0
  58. package/dist/src/index.d.ts +103 -0
  59. package/dist/src/index.d.ts.map +1 -0
  60. package/dist/src/index.js +229 -0
  61. package/dist/src/index.js.map +1 -0
  62. package/dist/src/ingestor/index.d.ts +95 -0
  63. package/dist/src/ingestor/index.d.ts.map +1 -0
  64. package/dist/src/ingestor/index.js +471 -0
  65. package/dist/src/ingestor/index.js.map +1 -0
  66. package/dist/src/utils/dedup.d.ts +66 -0
  67. package/dist/src/utils/dedup.d.ts.map +1 -0
  68. package/dist/src/utils/dedup.js +296 -0
  69. package/dist/src/utils/dedup.js.map +1 -0
  70. package/dist/src/utils/index.d.ts +3 -0
  71. package/dist/src/utils/index.d.ts.map +1 -0
  72. package/dist/src/utils/index.js +3 -0
  73. package/dist/src/utils/index.js.map +1 -0
  74. package/dist/src/utils/logger.d.ts +12 -0
  75. package/dist/src/utils/logger.d.ts.map +1 -0
  76. package/dist/src/utils/logger.js +49 -0
  77. package/dist/src/utils/logger.js.map +1 -0
  78. package/dist/src/utils/ui.d.ts +126 -0
  79. package/dist/src/utils/ui.d.ts.map +1 -0
  80. package/dist/src/utils/ui.js +357 -0
  81. package/dist/src/utils/ui.js.map +1 -0
  82. package/dist/src/utils/url.d.ts +21 -0
  83. package/dist/src/utils/url.d.ts.map +1 -0
  84. package/dist/src/utils/url.js +107 -0
  85. package/dist/src/utils/url.js.map +1 -0
  86. package/package.json +71 -0
@@ -0,0 +1,23 @@
1
+ /**
2
+ * Playwright engine for JavaScript rendering
3
+ * Lazy-loaded only when needed
4
+ */
5
+ /**
6
+ * Check if playwright is installed
7
+ */
8
+ export declare function isPlaywrightInstalled(): Promise<boolean>;
9
+ export interface PlaywrightOptions {
10
+ timeout?: number;
11
+ userAgent?: string;
12
+ }
13
+ export declare class PlaywrightEngine {
14
+ private browser;
15
+ private context;
16
+ fetch(url: string, options?: PlaywrightOptions): Promise<{
17
+ html: string;
18
+ statusCode: number;
19
+ finalUrl: string;
20
+ }>;
21
+ close(): Promise<void>;
22
+ }
23
+ //# sourceMappingURL=playwright.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"playwright.d.ts","sourceRoot":"","sources":["../../../src/engine/playwright.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAMH;;GAEG;AACH,wBAAsB,qBAAqB,IAAI,OAAO,CAAC,OAAO,CAAC,CAO9D;AAED,MAAM,WAAW,iBAAiB;IAChC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,qBAAa,gBAAgB;IAC3B,OAAO,CAAC,OAAO,CAAwB;IACvC,OAAO,CAAC,OAAO,CAA+B;IAExC,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,iBAAsB,GAAG,OAAO,CAAC;QACjE,IAAI,EAAE,MAAM,CAAC;QACb,UAAU,EAAE,MAAM,CAAC;QACnB,QAAQ,EAAE,MAAM,CAAC;KAClB,CAAC;IA+DI,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;CAU7B"}
@@ -0,0 +1,88 @@
1
+ /**
2
+ * Playwright engine for JavaScript rendering
3
+ * Lazy-loaded only when needed
4
+ */
5
+ let playwrightModule = null;
6
/**
 * Probe whether the optional `playwright` dependency can be loaded.
 *
 * On success the loaded module is cached in the module-level
 * `playwrightModule` binding so later engine calls can reuse it.
 *
 * @returns {Promise<boolean>} `true` when the dynamic import succeeds,
 *   `false` when it fails for any reason.
 */
export async function isPlaywrightInstalled() {
    let installed = true;
    try {
        const loaded = await import('playwright');
        playwrightModule = loaded;
    }
    catch {
        installed = false;
    }
    return installed;
}
18
+ export class PlaywrightEngine {
19
+ browser = null;
20
+ context = null;
21
+ async fetch(url, options = {}) {
22
+ if (!playwrightModule) {
23
+ playwrightModule = await import('playwright');
24
+ }
25
+ if (!this.browser) {
26
+ this.browser = await playwrightModule.chromium.launch({
27
+ headless: true,
28
+ args: [
29
+ '--disable-blink-features=AutomationControlled',
30
+ '--disable-dev-shm-usage',
31
+ '--no-sandbox',
32
+ '--disable-setuid-sandbox',
33
+ '--disable-infobars',
34
+ '--window-position=0,0',
35
+ '--ignore-certifcate-errors',
36
+ '--ignore-certifcate-errors-spki-list',
37
+ ],
38
+ });
39
+ this.context = await this.browser.newContext({
40
+ userAgent: options.userAgent ||
41
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
42
+ viewport: { width: 1920, height: 1080 },
43
+ locale: 'en-US',
44
+ timezoneId: 'America/New_York',
45
+ deviceScaleFactor: 1,
46
+ hasTouch: false,
47
+ isMobile: false,
48
+ });
49
+ // Block unnecessary resources for speed
50
+ await this.context.route('**/*', (route) => {
51
+ const type = route.request().resourceType();
52
+ if (['image', 'stylesheet', 'font', 'media'].includes(type)) {
53
+ route.abort();
54
+ }
55
+ else {
56
+ route.continue();
57
+ }
58
+ });
59
+ }
60
+ const page = await this.context.newPage();
61
+ try {
62
+ const response = await page.goto(url, {
63
+ waitUntil: 'networkidle',
64
+ timeout: options.timeout || 15000,
65
+ });
66
+ // Wait a bit for dynamic content
67
+ await page.waitForTimeout(1000);
68
+ const html = await page.content();
69
+ const statusCode = response?.status() || 200;
70
+ const finalUrl = page.url();
71
+ return { html, statusCode, finalUrl };
72
+ }
73
+ finally {
74
+ await page.close();
75
+ }
76
+ }
77
+ async close() {
78
+ if (this.context) {
79
+ await this.context.close();
80
+ this.context = null;
81
+ }
82
+ if (this.browser) {
83
+ await this.browser.close();
84
+ this.browser = null;
85
+ }
86
+ }
87
+ }
88
+ //# sourceMappingURL=playwright.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"playwright.js","sourceRoot":"","sources":["../../../src/engine/playwright.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAIH,IAAI,gBAAgB,GAAuC,IAAI,CAAC;AAEhE;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,qBAAqB;IACzC,IAAI,CAAC;QACH,gBAAgB,GAAG,MAAM,MAAM,CAAC,YAAY,CAAC,CAAC;QAC9C,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AAOD,MAAM,OAAO,gBAAgB;IACnB,OAAO,GAAmB,IAAI,CAAC;IAC/B,OAAO,GAA0B,IAAI,CAAC;IAE9C,KAAK,CAAC,KAAK,CAAC,GAAW,EAAE,UAA6B,EAAE;QAKtD,IAAI,CAAC,gBAAgB,EAAE,CAAC;YACtB,gBAAgB,GAAG,MAAM,MAAM,CAAC,YAAY,CAAC,CAAC;QAChD,CAAC;QAED,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YAClB,IAAI,CAAC,OAAO,GAAG,MAAM,gBAAgB,CAAC,QAAQ,CAAC,MAAM,CAAC;gBACpD,QAAQ,EAAE,IAAI;gBACd,IAAI,EAAE;oBACJ,+CAA+C;oBAC/C,yBAAyB;oBACzB,cAAc;oBACd,0BAA0B;oBAC1B,oBAAoB;oBACpB,uBAAuB;oBACvB,4BAA4B;oBAC5B,sCAAsC;iBACvC;aACF,CAAC,CAAC;YAEH,IAAI,CAAC,OAAO,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC;gBAC3C,SAAS,EAAE,OAAO,CAAC,SAAS;oBAC1B,uHAAuH;gBACzH,QAAQ,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE;gBACvC,MAAM,EAAE,OAAO;gBACf,UAAU,EAAE,kBAAkB;gBAC9B,iBAAiB,EAAE,CAAC;gBACpB,QAAQ,EAAE,KAAK;gBACf,QAAQ,EAAE,KAAK;aAChB,CAAC,CAAC;YAEH,wCAAwC;YACxC,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,KAAK,EAAE,EAAE;gBACzC,MAAM,IAAI,GAAG,KAAK,CAAC,OAAO,EAAE,CAAC,YAAY,EAAE,CAAC;gBAC5C,IAAI,CAAC,OAAO,EAAE,YAAY,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;oBAC5D,KAAK,CAAC,KAAK,EAAE,CAAC;gBAChB,CAAC;qBAAM,CAAC;oBACN,KAAK,CAAC,QAAQ,EAAE,CAAC;gBACnB,CAAC;YACH,CAAC,CAAC,CAAC;QACL,CAAC;QAED,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAQ,CAAC,OAAO,EAAE,CAAC;QAE3C,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE;gBACpC,SAAS,EAAE,aAAa;gBACxB,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,KAAK;aAClC,CAAC,CAAC;YAEH,iCAAiC;YACjC,MAAM,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;YAEhC,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;YAClC,MAAM,UAAU,GAAG,QAAQ,EAAE,MAAM,EAAE,IAAI,GAAG,CAAC;YAC7C,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YAE5B,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,QAAQ,EAAE,CAAC;QACxC,CAAC;gBAAS,
CAAC;YACT,MAAM,IAAI,CAAC,KAAK,EAAE,CAAC;QACrB,CAAC;IACH,CAAC;IAED,KAAK,CAAC,KAAK;QACT,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YAC3B,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;QACD,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YAC3B,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;IACH,CAAC;CACF"}
@@ -0,0 +1,22 @@
1
+ /**
2
+ * Stealth engine for anti-bot bypass
3
+ * Uses playwright with stealth patches and human-like behavior
4
+ * Lazy-loaded only when needed
5
+ */
6
+ export interface RebrowserOptions {
7
+ timeout?: number;
8
+ userAgent?: string;
9
+ }
10
+ export declare class RebrowserEngine {
11
+ private browser;
12
+ private context;
13
+ fetch(url: string, options?: RebrowserOptions): Promise<{
14
+ html: string;
15
+ statusCode: number;
16
+ finalUrl: string;
17
+ }>;
18
+ private humanize;
19
+ private waitForChallenge;
20
+ close(): Promise<void>;
21
+ }
22
+ //# sourceMappingURL=rebrowser.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"rebrowser.d.ts","sourceRoot":"","sources":["../../../src/engine/rebrowser.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAMH,MAAM,WAAW,gBAAgB;IAC/B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,qBAAa,eAAe;IAC1B,OAAO,CAAC,OAAO,CAAwB;IACvC,OAAO,CAAC,OAAO,CAA+B;IAExC,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,gBAAqB,GAAG,OAAO,CAAC;QAChE,IAAI,EAAE,MAAM,CAAC;QACb,UAAU,EAAE,MAAM,CAAC;QACnB,QAAQ,EAAE,MAAM,CAAC;KAClB,CAAC;YAsGY,QAAQ;YAWR,gBAAgB;IA4BxB,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;CAU7B"}
@@ -0,0 +1,142 @@
1
+ /**
2
+ * Stealth engine for anti-bot bypass
3
+ * Uses playwright with stealth patches and human-like behavior
4
+ * Lazy-loaded only when needed
5
+ */
6
+ let playwrightModule = null;
7
+ export class RebrowserEngine {
8
+ browser = null;
9
+ context = null;
10
+ async fetch(url, options = {}) {
11
+ if (!playwrightModule) {
12
+ playwrightModule = await import('playwright');
13
+ }
14
+ if (!this.browser) {
15
+ // Use full browser with stealth settings
16
+ this.browser = await playwrightModule.chromium.launch({
17
+ headless: true,
18
+ args: [
19
+ '--disable-blink-features=AutomationControlled',
20
+ '--disable-dev-shm-usage',
21
+ '--no-sandbox',
22
+ '--disable-setuid-sandbox',
23
+ '--disable-web-security',
24
+ '--disable-features=IsolateOrigins,site-per-process',
25
+ '--disable-infobars',
26
+ '--window-position=0,0',
27
+ '--ignore-certifcate-errors',
28
+ ],
29
+ });
30
+ this.context = await this.browser.newContext({
31
+ userAgent: options.userAgent ||
32
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
33
+ viewport: { width: 1920, height: 1080 },
34
+ locale: 'en-US',
35
+ timezoneId: 'America/New_York',
36
+ geolocation: { latitude: 40.7128, longitude: -74.0060 },
37
+ permissions: ['geolocation'],
38
+ deviceScaleFactor: 1,
39
+ hasTouch: false,
40
+ isMobile: false,
41
+ });
42
+ // Add stealth scripts
43
+ await this.context.addInitScript(() => {
44
+ // Hide webdriver
45
+ Object.defineProperty(navigator, 'webdriver', { get: () => false });
46
+ // Fake plugins
47
+ Object.defineProperty(navigator, 'plugins', {
48
+ get: () => [1, 2, 3, 4, 5],
49
+ });
50
+ // Fake languages
51
+ Object.defineProperty(navigator, 'languages', {
52
+ get: () => ['en-US', 'en'],
53
+ });
54
+ // Override permissions
55
+ const originalQuery = window.navigator.permissions.query;
56
+ // @ts-ignore
57
+ window.navigator.permissions.query = (parameters) => parameters.name === 'notifications'
58
+ ? Promise.resolve({ state: Notification.permission })
59
+ : originalQuery(parameters);
60
+ // Hide automation indicators
61
+ // @ts-ignore
62
+ delete window.cdc_adoQpoasnfa76pfcZLmcfl_Array;
63
+ // @ts-ignore
64
+ delete window.cdc_adoQpoasnfa76pfcZLmcfl_Promise;
65
+ // @ts-ignore
66
+ delete window.cdc_adoQpoasnfa76pfcZLmcfl_Symbol;
67
+ });
68
+ // Block heavy resources
69
+ await this.context.route('**/*', (route) => {
70
+ const type = route.request().resourceType();
71
+ if (['image', 'stylesheet', 'font', 'media'].includes(type)) {
72
+ route.abort();
73
+ }
74
+ else {
75
+ route.continue();
76
+ }
77
+ });
78
+ }
79
+ const page = await this.context.newPage();
80
+ try {
81
+ // Simulate human-like behavior
82
+ await this.humanize(page);
83
+ const response = await page.goto(url, {
84
+ waitUntil: 'networkidle',
85
+ timeout: options.timeout || 30000,
86
+ });
87
+ // Wait for Cloudflare challenge if present
88
+ await this.waitForChallenge(page);
89
+ const html = await page.content();
90
+ const statusCode = response?.status() || 200;
91
+ const finalUrl = page.url();
92
+ return { html, statusCode, finalUrl };
93
+ }
94
+ finally {
95
+ await page.close();
96
+ }
97
+ }
98
+ async humanize(page) {
99
+ // Random mouse movements
100
+ await page.mouse.move(100 + Math.random() * 500, 100 + Math.random() * 300);
101
+ // Random delay
102
+ await page.waitForTimeout(500 + Math.random() * 1000);
103
+ }
104
+ async waitForChallenge(page) {
105
+ // Wait for Cloudflare/other challenge to complete
106
+ const challengeSelectors = [
107
+ '#challenge-running',
108
+ '.cf-browser-verification',
109
+ '#challenge-form',
110
+ '[data-testid="challenge-running"]',
111
+ '.challenge-running',
112
+ ];
113
+ for (const selector of challengeSelectors) {
114
+ try {
115
+ const element = await page.$(selector);
116
+ if (element) {
117
+ // Challenge detected, wait for it to resolve
118
+ await page.waitForSelector(selector, {
119
+ state: 'detached',
120
+ timeout: 15000,
121
+ });
122
+ await page.waitForTimeout(2000);
123
+ break;
124
+ }
125
+ }
126
+ catch {
127
+ // Selector not found, continue
128
+ }
129
+ }
130
+ }
131
+ async close() {
132
+ if (this.context) {
133
+ await this.context.close();
134
+ this.context = null;
135
+ }
136
+ if (this.browser) {
137
+ await this.browser.close();
138
+ this.browser = null;
139
+ }
140
+ }
141
+ }
142
+ //# sourceMappingURL=rebrowser.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"rebrowser.js","sourceRoot":"","sources":["../../../src/engine/rebrowser.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAIH,IAAI,gBAAgB,GAAuC,IAAI,CAAC;AAOhE,MAAM,OAAO,eAAe;IAClB,OAAO,GAAmB,IAAI,CAAC;IAC/B,OAAO,GAA0B,IAAI,CAAC;IAE9C,KAAK,CAAC,KAAK,CAAC,GAAW,EAAE,UAA4B,EAAE;QAKrD,IAAI,CAAC,gBAAgB,EAAE,CAAC;YACtB,gBAAgB,GAAG,MAAM,MAAM,CAAC,YAAY,CAAC,CAAC;QAChD,CAAC;QAED,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YAClB,yCAAyC;YACzC,IAAI,CAAC,OAAO,GAAG,MAAM,gBAAgB,CAAC,QAAQ,CAAC,MAAM,CAAC;gBACpD,QAAQ,EAAE,IAAI;gBACd,IAAI,EAAE;oBACJ,+CAA+C;oBAC/C,yBAAyB;oBACzB,cAAc;oBACd,0BAA0B;oBAC1B,wBAAwB;oBACxB,oDAAoD;oBACpD,oBAAoB;oBACpB,uBAAuB;oBACvB,4BAA4B;iBAC7B;aACF,CAAC,CAAC;YAEH,IAAI,CAAC,OAAO,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC;gBAC3C,SAAS,EAAE,OAAO,CAAC,SAAS;oBAC1B,uHAAuH;gBACzH,QAAQ,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE;gBACvC,MAAM,EAAE,OAAO;gBACf,UAAU,EAAE,kBAAkB;gBAC9B,WAAW,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,SAAS,EAAE,CAAC,OAAO,EAAE;gBACvD,WAAW,EAAE,CAAC,aAAa,CAAC;gBAC5B,iBAAiB,EAAE,CAAC;gBACpB,QAAQ,EAAE,KAAK;gBACf,QAAQ,EAAE,KAAK;aAChB,CAAC,CAAC;YAEH,sBAAsB;YACtB,MAAM,IAAI,CAAC,OAAO,CAAC,aAAa,CAAC,GAAG,EAAE;gBACpC,iBAAiB;gBACjB,MAAM,CAAC,cAAc,CAAC,SAAS,EAAE,WAAW,EAAE,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC,KAAK,EAAE,CAAC,CAAC;gBAEpE,eAAe;gBACf,MAAM,CAAC,cAAc,CAAC,SAAS,EAAE,SAAS,EAAE;oBAC1C,GAAG,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC;iBAC3B,CAAC,CAAC;gBAEH,iBAAiB;gBACjB,MAAM,CAAC,cAAc,CAAC,SAAS,EAAE,WAAW,EAAE;oBAC5C,GAAG,EAAE,GAAG,EAAE,CAAC,CAAC,OAAO,EAAE,IAAI,CAAC;iBAC3B,CAAC,CAAC;gBAEH,uBAAuB;gBACvB,MAAM,aAAa,GAAG,MAAM,CAAC,SAAS,CAAC,WAAW,CAAC,KAAK,CAAC;gBACzD,aAAa;gBACb,MAAM,CAAC,SAAS,CAAC,WAAW,CAAC,KAAK,GAAG,CAAC,UAAe,EAAE,EAAE,CACvD,UAAU,CAAC,IAAI,KAAK,eAAe;oBACjC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,EAAE,KAAK,EAAE,YAAY,CAAC,UAAU,EAAsB,CAAC;oBACzE,CAAC,CAAC,aAAa,CAAC,UAAU,CAAC,CAAC;gBAEhC,6BAA6B;gBAC7B,aAAa;gBACb,OAAO,MAAM,CAAC,gCAAgC,CAAC;gBAC/C,aAAa;gBACb,OAAO,MAAM,CAAC,kCAAkC,CAAC;gBACjD,aAAa;gBACb,OAAO,MAAM,CAAC,iCAAiC,CAAC;Y
AClD,CAAC,CAAC,CAAC;YAEH,wBAAwB;YACxB,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,KAAK,EAAE,EAAE;gBACzC,MAAM,IAAI,GAAG,KAAK,CAAC,OAAO,EAAE,CAAC,YAAY,EAAE,CAAC;gBAC5C,IAAI,CAAC,OAAO,EAAE,YAAY,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;oBAC5D,KAAK,CAAC,KAAK,EAAE,CAAC;gBAChB,CAAC;qBAAM,CAAC;oBACN,KAAK,CAAC,QAAQ,EAAE,CAAC;gBACnB,CAAC;YACH,CAAC,CAAC,CAAC;QACL,CAAC;QAED,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAQ,CAAC,OAAO,EAAE,CAAC;QAE3C,IAAI,CAAC;YACH,+BAA+B;YAC/B,MAAM,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;YAE1B,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE;gBACpC,SAAS,EAAE,aAAa;gBACxB,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,KAAK;aAClC,CAAC,CAAC;YAEH,2CAA2C;YAC3C,MAAM,IAAI,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC;YAElC,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;YAClC,MAAM,UAAU,GAAG,QAAQ,EAAE,MAAM,EAAE,IAAI,GAAG,CAAC;YAC7C,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YAE5B,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,QAAQ,EAAE,CAAC;QACxC,CAAC;gBAAS,CAAC;YACT,MAAM,IAAI,CAAC,KAAK,EAAE,CAAC;QACrB,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,QAAQ,CAAC,IAAU;QAC/B,yBAAyB;QACzB,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,CACnB,GAAG,GAAG,IAAI,CAAC,MAAM,EAAE,GAAG,GAAG,EACzB,GAAG,GAAG,IAAI,CAAC,MAAM,EAAE,GAAG,GAAG,CAC1B,CAAC;QAEF,eAAe;QACf,MAAM,IAAI,CAAC,cAAc,CAAC,GAAG,GAAG,IAAI,CAAC,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC;IACxD,CAAC;IAEO,KAAK,CAAC,gBAAgB,CAAC,IAAU;QACvC,kDAAkD;QAClD,MAAM,kBAAkB,GAAG;YACzB,oBAAoB;YACpB,0BAA0B;YAC1B,iBAAiB;YACjB,mCAAmC;YACnC,oBAAoB;SACrB,CAAC;QAEF,KAAK,MAAM,QAAQ,IAAI,kBAAkB,EAAE,CAAC;YAC1C,IAAI,CAAC;gBACH,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;gBACvC,IAAI,OAAO,EAAE,CAAC;oBACZ,6CAA6C;oBAC7C,MAAM,IAAI,CAAC,eAAe,CAAC,QAAQ,EAAE;wBACnC,KAAK,EAAE,UAAU;wBACjB,OAAO,EAAE,KAAK;qBACf,CAAC,CAAC;oBACH,MAAM,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;oBAChC,MAAM;gBACR,CAAC;YACH,CAAC;YAAC,MAAM,CAAC;gBACP,+BAA+B;YACjC,CAAC;QACH,CAAC;IACH,CAAC;IAED,KAAK,CAAC,KAAK;QACT,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YAC3B,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;QACD,IAAI,IAAI,CAAC,OAAO,EAAE
,CAAC;YACjB,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YAC3B,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;IACH,CAAC;CACF"}
@@ -0,0 +1,13 @@
1
+ /**
2
+ * HTML cleaner - removes unwanted elements before extraction
3
+ */
4
+ import type { CheerioAPI } from 'cheerio';
5
+ /**
6
+ * Clean HTML by removing unwanted elements
7
+ */
8
+ export declare function cleanHtml($: CheerioAPI): void;
9
+ /**
10
+ * Extract the main content area if identifiable
11
+ */
12
+ export declare function findMainContent($: CheerioAPI): string | null;
13
+ //# sourceMappingURL=cleaner.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cleaner.d.ts","sourceRoot":"","sources":["../../../src/extractor/cleaner.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AA4D1C;;GAEG;AACH,wBAAgB,SAAS,CAAC,CAAC,EAAE,UAAU,GAAG,IAAI,CA+C7C;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,CAAC,EAAE,UAAU,GAAG,MAAM,GAAG,IAAI,CAyB5D"}
@@ -0,0 +1,122 @@
1
+ /**
2
+ * HTML cleaner - removes unwanted elements before extraction
3
+ */
4
+ // Elements to remove before extraction
5
+ const REMOVE_SELECTORS = [
6
+ // Scripts and styles
7
+ 'script', 'style', 'noscript', 'iframe', 'svg', 'canvas',
8
+ // Navigation and layout
9
+ 'nav', 'header', 'footer', 'aside',
10
+ '[role="banner"]', '[role="navigation"]', '[role="contentinfo"]',
11
+ '[role="complementary"]', '[role="menu"]', '[role="menubar"]',
12
+ // Common class patterns for navigation/layout
13
+ '.nav', '.navbar', '.navigation', '.header', '.footer', '.sidebar',
14
+ '.menu', '.top-bar', '.bottom-bar', '.site-header', '.site-footer',
15
+ '.page-header', '.page-footer', '.masthead',
16
+ // Ads and tracking
17
+ '.ad', '.ads', '.advertisement', '.advert', '.sponsored',
18
+ '[class*="ad-"]', '[class*="ads-"]', '[id*="google_ads"]',
19
+ '.tracking', '.analytics',
20
+ // Popups and overlays
21
+ '.popup', '.modal', '.overlay', '.lightbox', '.dialog',
22
+ '.cookie-banner', '.cookie-consent', '.cookie-notice',
23
+ '.gdpr', '.consent-banner',
24
+ // Social and sharing
25
+ '.social-share', '.share-buttons', '.share-links', '.social-links',
26
+ '.follow-us', '.social-icons',
27
+ // Comments
28
+ '.comments', '.comment-section', '.comment-form', '#comments', '#disqus',
29
+ '.discuss', '.discussion',
30
+ // Related/recommended content
31
+ '.related-posts', '.related-articles', '.recommended', '.suggestions',
32
+ '.more-from', '.you-may-like', '.also-read',
33
+ // Newsletter and subscriptions
34
+ '.newsletter', '.subscribe', '.subscription', '.signup-form',
35
+ '.email-signup', '.mailing-list',
36
+ // Breadcrumbs and pagination
37
+ '.breadcrumb', '.breadcrumbs', '.pagination', '.pager',
38
+ // Author bio (usually after article)
39
+ '.author-bio', '.author-box', '.about-author',
40
+ // Hidden elements
41
+ '[aria-hidden="true"]', '[hidden]', '.hidden', '.visually-hidden',
42
+ '.screen-reader-text', '.sr-only',
43
+ // Forms (except search)
44
+ 'form:not([role="search"])',
45
+ // Print-only elements
46
+ '.print-only', '.no-screen',
47
+ ];
48
+ /**
49
+ * Clean HTML by removing unwanted elements
50
+ */
51
+ export function cleanHtml($) {
52
+ // Remove unwanted elements
53
+ $(REMOVE_SELECTORS.join(', ')).remove();
54
+ // Remove empty elements (but preserve br, hr, img, etc.)
55
+ const preserveTags = new Set(['br', 'hr', 'img', 'input', 'meta', 'link', 'area', 'base', 'col', 'embed', 'source', 'track', 'wbr']);
56
+ $('*').each((_, el) => {
57
+ if (el.type !== 'tag')
58
+ return;
59
+ const $el = $(el);
60
+ const tagName = el.tagName?.toLowerCase();
61
+ if (preserveTags.has(tagName))
62
+ return;
63
+ // Remove if empty (no children and no text)
64
+ if (!$el.children().length && !$el.text().trim()) {
65
+ $el.remove();
66
+ }
67
+ });
68
+ // Remove data attributes (reduce noise, but keep data-language for code blocks)
69
+ $('*').each((_, el) => {
70
+ if (el.type !== 'tag')
71
+ return;
72
+ const attribs = el.attribs || {};
73
+ Object.keys(attribs).forEach(attr => {
74
+ if (attr.startsWith('data-') && attr !== 'data-language' && attr !== 'data-lang') {
75
+ $(el).removeAttr(attr);
76
+ }
77
+ });
78
+ });
79
+ // Remove inline event handlers
80
+ $('*').each((_, el) => {
81
+ if (el.type !== 'tag')
82
+ return;
83
+ const attribs = el.attribs || {};
84
+ Object.keys(attribs).forEach(attr => {
85
+ if (attr.startsWith('on')) {
86
+ $(el).removeAttr(attr);
87
+ }
88
+ });
89
+ });
90
+ // Remove style attributes
91
+ $('[style]').removeAttr('style');
92
+ // Remove class attributes (optional - can help reduce noise)
93
+ // $('[class]').removeAttr('class');
94
+ }
95
+ /**
96
+ * Extract the main content area if identifiable
97
+ */
98
+ export function findMainContent($) {
99
+ // Priority order for main content
100
+ const selectors = [
101
+ 'main',
102
+ 'article',
103
+ '[role="main"]',
104
+ '#main-content',
105
+ '#content',
106
+ '.main-content',
107
+ '.content',
108
+ '.post-content',
109
+ '.article-content',
110
+ '.entry-content',
111
+ '.post-body',
112
+ '.article-body',
113
+ ];
114
+ for (const selector of selectors) {
115
+ const $el = $(selector).first();
116
+ if ($el.length && $el.text().trim().length > 200) {
117
+ return $el.html() || null;
118
+ }
119
+ }
120
+ return null;
121
+ }
122
+ //# sourceMappingURL=cleaner.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cleaner.js","sourceRoot":"","sources":["../../../src/extractor/cleaner.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH,uCAAuC;AACvC,MAAM,gBAAgB,GAAG;IACvB,qBAAqB;IACrB,QAAQ,EAAE,OAAO,EAAE,UAAU,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ;IAExD,wBAAwB;IACxB,KAAK,EAAE,QAAQ,EAAE,QAAQ,EAAE,OAAO;IAClC,iBAAiB,EAAE,qBAAqB,EAAE,sBAAsB;IAChE,wBAAwB,EAAE,eAAe,EAAE,kBAAkB;IAE7D,8CAA8C;IAC9C,MAAM,EAAE,SAAS,EAAE,aAAa,EAAE,SAAS,EAAE,SAAS,EAAE,UAAU;IAClE,OAAO,EAAE,UAAU,EAAE,aAAa,EAAE,cAAc,EAAE,cAAc;IAClE,cAAc,EAAE,cAAc,EAAE,WAAW;IAE3C,mBAAmB;IACnB,KAAK,EAAE,MAAM,EAAE,gBAAgB,EAAE,SAAS,EAAE,YAAY;IACxD,gBAAgB,EAAE,iBAAiB,EAAE,oBAAoB;IACzD,WAAW,EAAE,YAAY;IAEzB,sBAAsB;IACtB,QAAQ,EAAE,QAAQ,EAAE,UAAU,EAAE,WAAW,EAAE,SAAS;IACtD,gBAAgB,EAAE,iBAAiB,EAAE,gBAAgB;IACrD,OAAO,EAAE,iBAAiB;IAE1B,qBAAqB;IACrB,eAAe,EAAE,gBAAgB,EAAE,cAAc,EAAE,eAAe;IAClE,YAAY,EAAE,eAAe;IAE7B,WAAW;IACX,WAAW,EAAE,kBAAkB,EAAE,eAAe,EAAE,WAAW,EAAE,SAAS;IACxE,UAAU,EAAE,aAAa;IAEzB,8BAA8B;IAC9B,gBAAgB,EAAE,mBAAmB,EAAE,cAAc,EAAE,cAAc;IACrE,YAAY,EAAE,eAAe,EAAE,YAAY;IAE3C,+BAA+B;IAC/B,aAAa,EAAE,YAAY,EAAE,eAAe,EAAE,cAAc;IAC5D,eAAe,EAAE,eAAe;IAEhC,6BAA6B;IAC7B,aAAa,EAAE,cAAc,EAAE,aAAa,EAAE,QAAQ;IAEtD,qCAAqC;IACrC,aAAa,EAAE,aAAa,EAAE,eAAe;IAE7C,kBAAkB;IAClB,sBAAsB,EAAE,UAAU,EAAE,SAAS,EAAE,kBAAkB;IACjE,qBAAqB,EAAE,UAAU;IAEjC,wBAAwB;IACxB,2BAA2B;IAE3B,sBAAsB;IACtB,aAAa,EAAE,YAAY;CAC5B,CAAC;AAEF;;GAEG;AACH,MAAM,UAAU,SAAS,CAAC,CAAa;IACrC,2BAA2B;IAC3B,CAAC,CAAC,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC;IAExC,yDAAyD;IACzD,MAAM,YAAY,GAAG,IAAI,GAAG,CAAC,CAAC,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC;IAErI,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACpB,IAAI,EAAE,CAAC,IAAI,KAAK,KAAK;YAAE,OAAO;QAC9B,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,OAAO,GAAG,EAAE,CAAC,OAAO,EAAE,WAAW,EAAE,CAAC;QAE1C,IAAI,YAAY,CAAC,GAAG,CAAC,OAAO,CAAC;YAAE,OAAO;QAEtC,4CAA4C;QAC5C,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC,MAAM,IAA
I,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,EAAE,CAAC;YACjD,GAAG,CAAC,MAAM,EAAE,CAAC;QACf,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,gFAAgF;IAChF,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACpB,IAAI,EAAE,CAAC,IAAI,KAAK,KAAK;YAAE,OAAO;QAC9B,MAAM,OAAO,GAAG,EAAE,CAAC,OAAO,IAAI,EAAE,CAAC;QACjC,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE;YAClC,IAAI,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,IAAI,IAAI,KAAK,eAAe,IAAI,IAAI,KAAK,WAAW,EAAE,CAAC;gBACjF,CAAC,CAAC,EAAE,CAAC,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;YACzB,CAAC;QACH,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,+BAA+B;IAC/B,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACpB,IAAI,EAAE,CAAC,IAAI,KAAK,KAAK;YAAE,OAAO;QAC9B,MAAM,OAAO,GAAG,EAAE,CAAC,OAAO,IAAI,EAAE,CAAC;QACjC,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE;YAClC,IAAI,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC1B,CAAC,CAAC,EAAE,CAAC,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;YACzB,CAAC;QACH,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,0BAA0B;IAC1B,CAAC,CAAC,SAAS,CAAC,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC;IAEjC,6DAA6D;IAC7D,oCAAoC;AACtC,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,eAAe,CAAC,CAAa;IAC3C,kCAAkC;IAClC,MAAM,SAAS,GAAG;QAChB,MAAM;QACN,SAAS;QACT,eAAe;QACf,eAAe;QACf,UAAU;QACV,eAAe;QACf,UAAU;QACV,eAAe;QACf,kBAAkB;QAClB,gBAAgB;QAChB,YAAY;QACZ,eAAe;KAChB,CAAC;IAEF,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,MAAM,GAAG,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC,KAAK,EAAE,CAAC;QAChC,IAAI,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;YACjD,OAAO,GAAG,CAAC,IAAI,EAAE,IAAI,IAAI,CAAC;QAC5B,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC"}
@@ -0,0 +1,29 @@
1
+ /**
2
+ * Content extraction pipeline
3
+ * Uses Readability for main content extraction + node-html-markdown for conversion
4
+ */
5
+ export interface ExtractResult {
6
+ title: string;
7
+ markdown: string;
8
+ description: string;
9
+ author: string | null;
10
+ publishedDate: string | null;
11
+ links: string[];
12
+ wordCount: number;
13
+ byteSize: number;
14
+ }
15
+ export interface ExtractOptions {
16
+ includeLinks?: boolean;
17
+ maxContentLength?: number;
18
+ }
19
+ export declare class Extractor {
20
+ extract(html: string, url: string, options?: ExtractOptions): ExtractResult;
21
+ private extractTitle;
22
+ private extractDescription;
23
+ private extractAuthor;
24
+ private extractDate;
25
+ }
26
+ export { cleanHtml, findMainContent } from './cleaner.js';
27
+ export { extractLinks, extractInternalLinks, extractLinksWithMeta } from './links.js';
28
+ export { htmlToMarkdown, markdownToPlainText } from './markdown.js';
29
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/extractor/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AASH,MAAM,WAAW,aAAa;IAC5B,KAAK,EAAE,MAAM,CAAC;IACd,QAAQ,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACtB,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,KAAK,EAAE,MAAM,EAAE,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,cAAc;IAC7B,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAOD,qBAAa,SAAS;IACpB,OAAO,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,cAAmB,GAAG,aAAa;IAkE/E,OAAO,CAAC,YAAY;IAkCpB,OAAO,CAAC,kBAAkB;IAgB1B,OAAO,CAAC,aAAa;IAmBrB,OAAO,CAAC,WAAW;CAuBpB;AAED,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAC1D,OAAO,EAAE,YAAY,EAAE,oBAAoB,EAAE,oBAAoB,EAAE,MAAM,YAAY,CAAC;AACtF,OAAO,EAAE,cAAc,EAAE,mBAAmB,EAAE,MAAM,eAAe,CAAC"}
@@ -0,0 +1,162 @@
1
+ /**
2
+ * Content extraction pipeline
3
+ * Uses Readability for main content extraction + node-html-markdown for conversion
4
+ */
5
+ import { Readability } from '@mozilla/readability';
6
+ import { JSDOM } from 'jsdom';
7
+ import * as cheerio from 'cheerio';
8
+ import { cleanHtml, findMainContent } from './cleaner.js';
9
+ import { extractLinks } from './links.js';
10
+ import { htmlToMarkdown } from './markdown.js';
11
+ const DEFAULT_OPTIONS = {
12
+ includeLinks: true,
13
+ maxContentLength: 500000, // 500KB max content
14
+ };
15
+ export class Extractor {
16
+ extract(html, url, options = {}) {
17
+ const opts = { ...DEFAULT_OPTIONS, ...options };
18
+ const $ = cheerio.load(html);
19
+ // Extract metadata before cleaning
20
+ const title = this.extractTitle($, url);
21
+ const description = this.extractDescription($);
22
+ const author = this.extractAuthor($);
23
+ const publishedDate = this.extractDate($);
24
+ const links = opts.includeLinks ? extractLinks($, url) : [];
25
+ // Clean HTML
26
+ cleanHtml($);
27
+ // Try Readability first for article extraction
28
+ let markdown;
29
+ try {
30
+ const cleanedHtml = $.html();
31
+ const dom = new JSDOM(cleanedHtml, { url });
32
+ const reader = new Readability(dom.window.document, {
33
+ charThreshold: 50,
34
+ });
35
+ const article = reader.parse();
36
+ if (article?.content && article.textContent && article.textContent.length > 100) {
37
+ // Use node-html-markdown (faster than Turndown)
38
+ markdown = htmlToMarkdown(article.content);
39
+ // Use Readability's title if better
40
+ if (article.title && article.title.length > title.length) {
41
+ // title = article.title; // Uncomment if preferred
42
+ }
43
+ }
44
+ else {
45
+ // Fallback: try to find main content area
46
+ const mainContent = findMainContent($);
47
+ if (mainContent) {
48
+ markdown = htmlToMarkdown(mainContent);
49
+ }
50
+ else {
51
+ // Last resort: convert body
52
+ markdown = htmlToMarkdown($('body').html() || cleanedHtml);
53
+ }
54
+ }
55
+ }
56
+ catch {
57
+ // Fallback if Readability fails
58
+ markdown = htmlToMarkdown($.html());
59
+ }
60
+ // Truncate if too long
61
+ if (opts.maxContentLength && markdown.length > opts.maxContentLength) {
62
+ markdown = markdown.slice(0, opts.maxContentLength) + '\n\n[Content truncated...]';
63
+ }
64
+ const wordCount = markdown.split(/\s+/).filter(Boolean).length;
65
+ return {
66
+ title,
67
+ markdown,
68
+ description,
69
+ author,
70
+ publishedDate,
71
+ links,
72
+ wordCount,
73
+ byteSize: Buffer.byteLength(markdown, 'utf8'),
74
+ };
75
+ }
76
+ extractTitle($, url) {
77
+ const sources = [
78
+ $('meta[property="og:title"]').attr('content'),
79
+ $('meta[name="twitter:title"]').attr('content'),
80
+ $('title').text(),
81
+ $('h1').first().text(),
82
+ ];
83
+ for (const source of sources) {
84
+ if (source?.trim()) {
85
+ // Remove site name suffix (e.g., "Page Title | Site Name")
86
+ const cleaned = source.split(/\s*[|\-–—]\s*/)[0].trim();
87
+ if (cleaned.length > 0) {
88
+ return cleaned;
89
+ }
90
+ }
91
+ }
92
+ // Fallback to URL path
93
+ try {
94
+ const pathname = new URL(url).pathname;
95
+ if (pathname && pathname !== '/') {
96
+ return pathname
97
+ .replace(/\/$/, '')
98
+ .split('/')
99
+ .pop()
100
+ .replace(/[-_]/g, ' ')
101
+ .replace(/\.\w+$/, '');
102
+ }
103
+ }
104
+ catch { }
105
+ return url;
106
+ }
107
+ extractDescription($) {
108
+ const sources = [
109
+ $('meta[property="og:description"]').attr('content'),
110
+ $('meta[name="description"]').attr('content'),
111
+ $('meta[name="twitter:description"]').attr('content'),
112
+ ];
113
+ for (const source of sources) {
114
+ if (source?.trim()) {
115
+ return source.slice(0, 300);
116
+ }
117
+ }
118
+ return '';
119
+ }
120
+ extractAuthor($) {
121
+ const sources = [
122
+ $('meta[name="author"]').attr('content'),
123
+ $('meta[property="article:author"]').attr('content'),
124
+ $('[rel="author"]').first().text(),
125
+ $('[itemprop="author"]').first().text(),
126
+ $('.author').first().text(),
127
+ $('[class*="author-name"]').first().text(),
128
+ ];
129
+ for (const source of sources) {
130
+ if (source?.trim()) {
131
+ return source.trim().slice(0, 100);
132
+ }
133
+ }
134
+ return null;
135
+ }
136
+ extractDate($) {
137
+ const sources = [
138
+ $('meta[property="article:published_time"]').attr('content'),
139
+ $('meta[name="date"]').attr('content'),
140
+ $('meta[name="publish-date"]').attr('content'),
141
+ $('time[datetime]').attr('datetime'),
142
+ $('[itemprop="datePublished"]').attr('content'),
143
+ $('[itemprop="datePublished"]').attr('datetime'),
144
+ ];
145
+ for (const source of sources) {
146
+ if (source?.trim()) {
147
+ try {
148
+ const date = new Date(source);
149
+ if (!isNaN(date.getTime())) {
150
+ return date.toISOString();
151
+ }
152
+ }
153
+ catch { }
154
+ }
155
+ }
156
+ return null;
157
+ }
158
+ }
159
+ export { cleanHtml, findMainContent } from './cleaner.js';
160
+ export { extractLinks, extractInternalLinks, extractLinksWithMeta } from './links.js';
161
+ export { htmlToMarkdown, markdownToPlainText } from './markdown.js';
162
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/extractor/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AACnD,OAAO,EAAE,KAAK,EAAE,MAAM,OAAO,CAAC;AAC9B,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AACnC,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAC1D,OAAO,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAC1C,OAAO,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AAkB/C,MAAM,eAAe,GAAmB;IACtC,YAAY,EAAE,IAAI;IAClB,gBAAgB,EAAE,MAAM,EAAE,oBAAoB;CAC/C,CAAC;AAEF,MAAM,OAAO,SAAS;IACpB,OAAO,CAAC,IAAY,EAAE,GAAW,EAAE,UAA0B,EAAE;QAC7D,MAAM,IAAI,GAAG,EAAE,GAAG,eAAe,EAAE,GAAG,OAAO,EAAE,CAAC;QAChD,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE7B,mCAAmC;QACnC,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;QACxC,MAAM,WAAW,GAAG,IAAI,CAAC,kBAAkB,CAAC,CAAC,CAAC,CAAC;QAC/C,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,aAAa,GAAG,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC;QAC1C,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAE5D,aAAa;QACb,SAAS,CAAC,CAAC,CAAC,CAAC;QAEb,+CAA+C;QAC/C,IAAI,QAAgB,CAAC;QACrB,IAAI,CAAC;YACH,MAAM,WAAW,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;YAC7B,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,WAAW,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;YAC5C,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE;gBAClD,aAAa,EAAE,EAAE;aAClB,CAAC,CAAC;YACH,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;YAE/B,IAAI,OAAO,EAAE,OAAO,IAAI,OAAO,CAAC,WAAW,IAAI,OAAO,CAAC,WAAW,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;gBAChF,gDAAgD;gBAChD,QAAQ,GAAG,cAAc,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;gBAE3C,oCAAoC;gBACpC,IAAI,OAAO,CAAC,KAAK,IAAI,OAAO,CAAC,KAAK,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;oBACzD,mDAAmD;gBACrD,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,0CAA0C;gBAC1C,MAAM,WAAW,GAAG,eAAe,CAAC,CAAC,CAAC,CAAC;gBACvC,IAAI,WAAW,EAAE,CAAC;oBAChB,QAAQ,GAAG,cAAc,CAAC,WAAW,CAAC,CAAC;gBACzC,CAAC;qBAAM,CAAC;oBACN,4BAA4B;oBAC5B,QAAQ,GAAG,cAAc,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,IAAI,WAAW,CAAC,CAAC;gBAC7D,CAAC;YACH,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,gCAAgC;YAChC,QAAQ,GAAG,c
AAc,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;QACtC,CAAC;QAED,uBAAuB;QACvB,IAAI,IAAI,CAAC,gBAAgB,IAAI,QAAQ,CAAC,MAAM,GAAG,IAAI,CAAC,gBAAgB,EAAE,CAAC;YACrE,QAAQ,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,gBAAgB,CAAC,GAAG,4BAA4B,CAAC;QACrF,CAAC;QAED,MAAM,SAAS,GAAG,QAAQ,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC;QAE/D,OAAO;YACL,KAAK;YACL,QAAQ;YACR,WAAW;YACX,MAAM;YACN,aAAa;YACb,KAAK;YACL,SAAS;YACT,QAAQ,EAAE,MAAM,CAAC,UAAU,CAAC,QAAQ,EAAE,MAAM,CAAC;SAC9C,CAAC;IACJ,CAAC;IAEO,YAAY,CAAC,CAAqB,EAAE,GAAW;QACrD,MAAM,OAAO,GAAG;YACd,CAAC,CAAC,2BAA2B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;YAC9C,CAAC,CAAC,4BAA4B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;YAC/C,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE;YACjB,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE;SACvB,CAAC;QAEF,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;YAC7B,IAAI,MAAM,EAAE,IAAI,EAAE,EAAE,CAAC;gBACnB,2DAA2D;gBAC3D,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBACxD,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACvB,OAAO,OAAO,CAAC;gBACjB,CAAC;YACH,CAAC;QACH,CAAC;QAED,uBAAuB;QACvB,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YACvC,IAAI,QAAQ,IAAI,QAAQ,KAAK,GAAG,EAAE,CAAC;gBACjC,OAAO,QAAQ;qBACZ,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC;qBAClB,KAAK,CAAC,GAAG,CAAC;qBACV,GAAG,EAAG;qBACN,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC;qBACrB,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;YAC3B,CAAC;QACH,CAAC;QAAC,MAAM,CAAC,CAAA,CAAC;QAEV,OAAO,GAAG,CAAC;IACb,CAAC;IAEO,kBAAkB,CAAC,CAAqB;QAC9C,MAAM,OAAO,GAAG;YACd,CAAC,CAAC,iCAAiC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;YACpD,CAAC,CAAC,0BAA0B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;YAC7C,CAAC,CAAC,kCAAkC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;SACtD,CAAC;QAEF,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;YAC7B,IAAI,MAAM,EAAE,IAAI,EAAE,EAAE,CAAC;gBACnB,OAAO,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;YAC9B,CAAC;QACH,CAAC;QAED,OAAO,EAAE,CAAC;IACZ,CAAC;IAEO,aAAa,CAAC,CAAqB;QACzC,MAAM,OAAO,GAAG;YACd,CAAC,CAAC,qBAAqB,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;YACxC,CAAC,CAAC,iCAAiC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;YACpD,CAAC,CAAC,gBAAgB,CAAC,CAAC,K
AAK,EAAE,CAAC,IAAI,EAAE;YAClC,CAAC,CAAC,qBAAqB,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE;YACvC,CAAC,CAAC,SAAS,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE;YAC3B,CAAC,CAAC,wBAAwB,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE;SAC3C,CAAC;QAEF,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;YAC7B,IAAI,MAAM,EAAE,IAAI,EAAE,EAAE,CAAC;gBACnB,OAAO,MAAM,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;YACrC,CAAC;QACH,CAAC;QAED,OAAO,IAAI,CAAC;IACd,CAAC;IAEO,WAAW,CAAC,CAAqB;QACvC,MAAM,OAAO,GAAG;YACd,CAAC,CAAC,yCAAyC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;YAC5D,CAAC,CAAC,mBAAmB,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;YACtC,CAAC,CAAC,2BAA2B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;YAC9C,CAAC,CAAC,gBAAgB,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC;YACpC,CAAC,CAAC,4BAA4B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;YAC/C,CAAC,CAAC,4BAA4B,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC;SACjD,CAAC;QAEF,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;YAC7B,IAAI,MAAM,EAAE,IAAI,EAAE,EAAE,CAAC;gBACnB,IAAI,CAAC;oBACH,MAAM,IAAI,GAAG,IAAI,IAAI,CAAC,MAAM,CAAC,CAAC;oBAC9B,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,EAAE,CAAC;wBAC3B,OAAO,IAAI,CAAC,WAAW,EAAE,CAAC;oBAC5B,CAAC;gBACH,CAAC;gBAAC,MAAM,CAAC,CAAA,CAAC;YACZ,CAAC;QACH,CAAC;QAED,OAAO,IAAI,CAAC;IACd,CAAC;CACF;AAED,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAC1D,OAAO,EAAE,YAAY,EAAE,oBAAoB,EAAE,oBAAoB,EAAE,MAAM,YAAY,CAAC;AACtF,OAAO,EAAE,cAAc,EAAE,mBAAmB,EAAE,MAAM,eAAe,CAAC"}