arcfetch 1.1.0 → 1.2.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ export const QualityConfigSchema = z.object({
6
6
  });
7
7
 
8
8
  export const PathsConfigSchema = z.object({
9
- tempDir: z.string().default('.tmp'),
9
+ tempDir: z.string().default('.tmp/arcfetch'),
10
10
  docsDir: z.string().default('docs/ai/references'),
11
11
  });
12
12
 
@@ -15,20 +15,13 @@ export const PlaywrightConfigSchema = z.object({
15
15
  waitStrategy: z.enum(['networkidle', 'domcontentloaded', 'load']).default('networkidle'),
16
16
  });
17
17
 
18
- export const RetryConfigSchema = z.object({
19
- maxAttempts: z.number().default(2),
20
- backoffMs: z.number().default(1000),
21
- });
22
-
23
18
  export const FetchiConfigSchema = z.object({
24
19
  quality: QualityConfigSchema.default({}),
25
20
  paths: PathsConfigSchema.default({}),
26
21
  playwright: PlaywrightConfigSchema.default({}),
27
- retry: RetryConfigSchema.default({}),
28
22
  });
29
23
 
30
24
  export type FetchiConfig = z.infer<typeof FetchiConfigSchema>;
31
25
  export type QualityConfig = z.infer<typeof QualityConfigSchema>;
32
26
  export type PathsConfig = z.infer<typeof PathsConfigSchema>;
33
27
  export type PlaywrightConfig = z.infer<typeof PlaywrightConfigSchema>;
34
- export type RetryConfig = z.infer<typeof RetryConfigSchema>;
package/src/core/cache.ts CHANGED
@@ -1,6 +1,6 @@
1
1
  import { existsSync, mkdirSync, readdirSync, readFileSync, unlinkSync, writeFileSync } from 'node:fs';
2
2
  import { writeFile } from 'node:fs/promises';
3
- import { join, basename } from 'node:path';
3
+ import { basename, join } from 'node:path';
4
4
  import type { FetchiConfig } from '../config/schema';
5
5
 
6
6
  export interface CachedReference {
@@ -43,7 +43,7 @@ export interface DeleteResult {
43
43
  */
44
44
  export function findByUrl(config: FetchiConfig, url: string): CachedReference | null {
45
45
  const { references } = listCached(config);
46
- return references.find(r => r.url === url) || null;
46
+ return references.find((r) => r.url === url) || null;
47
47
  }
48
48
 
49
49
  /**
@@ -89,9 +89,10 @@ export async function saveToTemp(
89
89
  const filepath = existing && refetch ? existing.filepath : join(tempDir, filename);
90
90
 
91
91
  const today = new Date().toISOString().split('T')[0];
92
+ const sanitizedUrl = url.replace(/[\r\n]/g, '');
92
93
  let fileContent = `---\n`;
93
94
  fileContent += `title: "${title.replace(/"/g, '\\"')}"\n`;
94
- fileContent += `source_url: ${url}\n`;
95
+ fileContent += `source_url: ${sanitizedUrl}\n`;
95
96
  fileContent += `fetched_date: ${today}\n`;
96
97
  fileContent += `type: web\n`;
97
98
  fileContent += `status: temporary\n`;
@@ -121,7 +122,7 @@ export function listCached(config: FetchiConfig): ListResult {
121
122
  return { references: [] };
122
123
  }
123
124
 
124
- const files = readdirSync(tempDir).filter(f => f.endsWith('.md'));
125
+ const files = readdirSync(tempDir).filter((f) => f.endsWith('.md'));
125
126
  const references: CachedReference[] = [];
126
127
 
127
128
  for (const file of files) {
@@ -135,7 +136,12 @@ export function listCached(config: FetchiConfig): ListResult {
135
136
 
136
137
  const getValue = (key: string): string => {
137
138
  const match = frontmatter.match(new RegExp(`^${key}:\\s*(.+)$`, 'm'));
138
- return match ? match[1].trim().replace(/^["']|["']$/g, '').trim() : '';
139
+ return match
140
+ ? match[1]
141
+ .trim()
142
+ .replace(/^["']|["']$/g, '')
143
+ .trim()
144
+ : '';
139
145
  };
140
146
 
141
147
  // Use filename (without .md) as refId
@@ -168,7 +174,7 @@ export function listCached(config: FetchiConfig): ListResult {
168
174
  */
169
175
  export function findCached(config: FetchiConfig, refId: string): CachedReference | null {
170
176
  const { references } = listCached(config);
171
- return references.find(r => r.refId === refId) || null;
177
+ return references.find((r) => r.refId === refId) || null;
172
178
  }
173
179
 
174
180
  /**
@@ -249,13 +255,6 @@ export function deleteCached(config: FetchiConfig, refId: string): DeleteResult
249
255
  }
250
256
  }
251
257
 
252
- /**
253
- * Get cache root (for backwards compatibility)
254
- */
255
- export function findCacheRoot(): string {
256
- return process.cwd();
257
- }
258
-
259
258
  // ============================================================================
260
259
  // LINK EXTRACTION
261
260
  // ============================================================================
@@ -1,7 +1,7 @@
1
- import TurndownService from 'turndown';
2
- import { gfm } from 'turndown-plugin-gfm';
3
1
  import { Readability } from '@mozilla/readability';
4
2
  import { parseHTML } from 'linkedom';
3
+ import TurndownService from 'turndown';
4
+ import { gfm } from 'turndown-plugin-gfm';
5
5
  import { cleanMarkdownComplete } from '../utils/markdown-cleaner';
6
6
 
7
7
  export interface ExtractionResult {
@@ -29,11 +29,7 @@ turndown.addRule('removeComments', {
29
29
  replacement: () => '',
30
30
  });
31
31
 
32
- export async function processHtmlToMarkdown(
33
- html: string,
34
- url: string,
35
- verbose = false
36
- ): Promise<ExtractionResult> {
32
+ export async function processHtmlToMarkdown(html: string, url: string, verbose = false): Promise<ExtractionResult> {
37
33
  try {
38
34
  if (verbose) {
39
35
  console.error(`📝 Processing HTML (${html.length} chars)`);
@@ -0,0 +1,85 @@
1
+ import type { FetchiConfig } from '../config/schema';
2
+ import { extractLinksFromCached, saveToTemp } from './cache';
3
+ import { closeBrowser, fetchUrl } from './pipeline';
4
+
5
+ export interface FetchLinkResult {
6
+ url: string;
7
+ status: 'new' | 'cached' | 'failed';
8
+ refId?: string;
9
+ error?: string;
10
+ }
11
+
12
+ export interface FetchLinksFromRefResult {
13
+ results: FetchLinkResult[];
14
+ summary: { new: number; cached: number; failed: number };
15
+ error?: string;
16
+ }
17
+
18
+ export async function fetchLinksFromRef(
19
+ config: FetchiConfig,
20
+ refId: string,
21
+ options?: { refetch?: boolean; verbose?: boolean; onProgress?: (result: FetchLinkResult) => void }
22
+ ): Promise<FetchLinksFromRefResult> {
23
+ const linksResult = extractLinksFromCached(config, refId);
24
+
25
+ if (linksResult.error) {
26
+ return { results: [], summary: { new: 0, cached: 0, failed: 0 }, error: linksResult.error };
27
+ }
28
+
29
+ if (linksResult.count === 0) {
30
+ return { results: [], summary: { new: 0, cached: 0, failed: 0 } };
31
+ }
32
+
33
+ const results: FetchLinkResult[] = [];
34
+ const concurrency = 5;
35
+ const urls = linksResult.links.map((l) => l.href);
36
+ const verbose = options?.verbose ?? false;
37
+ const refetch = options?.refetch ?? false;
38
+
39
+ for (let i = 0; i < urls.length; i += concurrency) {
40
+ const batch = urls.slice(i, i + concurrency);
41
+ const batchPromises = batch.map(async (url): Promise<FetchLinkResult> => {
42
+ try {
43
+ const fetchResult = await fetchUrl(url, config, verbose);
44
+
45
+ if (!fetchResult.success) {
46
+ return { url, status: 'failed', error: fetchResult.error };
47
+ }
48
+
49
+ const saveResult = await saveToTemp(config, fetchResult.title!, url, fetchResult.markdown!, undefined, refetch);
50
+
51
+ if (saveResult.error) {
52
+ return { url, status: 'failed', error: saveResult.error };
53
+ }
54
+
55
+ if (saveResult.alreadyExists) {
56
+ return { url, status: 'cached', refId: saveResult.refId };
57
+ }
58
+
59
+ return { url, status: 'new', refId: saveResult.refId };
60
+ } catch (error) {
61
+ const message = error instanceof Error ? error.message : String(error);
62
+ return { url, status: 'failed', error: message };
63
+ }
64
+ });
65
+
66
+ const batchResults = await Promise.all(batchPromises);
67
+ results.push(...batchResults);
68
+
69
+ if (options?.onProgress) {
70
+ for (const r of batchResults) {
71
+ options.onProgress(r);
72
+ }
73
+ }
74
+ }
75
+
76
+ await closeBrowser();
77
+
78
+ const summary = {
79
+ new: results.filter((r) => r.status === 'new').length,
80
+ cached: results.filter((r) => r.status === 'cached').length,
81
+ failed: results.filter((r) => r.status === 'failed').length,
82
+ };
83
+
84
+ return { results, summary };
85
+ }
package/src/core/index.ts CHANGED
@@ -1,4 +1,5 @@
1
- export * from './pipeline';
2
- export * from './extractor';
3
1
  export * from './cache';
2
+ export * from './extractor';
3
+ export * from './fetch-links';
4
+ export * from './pipeline';
4
5
  export * from './playwright/index';
@@ -1,7 +1,7 @@
1
1
  import type { FetchiConfig } from '../config/schema';
2
- import { validateMarkdown, type ValidationResult } from '../utils/markdown-validator';
3
- import { fetchWithBrowser, closeBrowser } from './playwright/manager';
2
+ import { type ValidationResult, validateMarkdown } from '../utils/markdown-validator';
4
3
  import { processHtmlToMarkdown } from './extractor';
4
+ import { closeBrowser, fetchWithBrowser } from './playwright/manager';
5
5
 
6
6
  export interface FetchResult {
7
7
  success: boolean;
@@ -31,8 +31,9 @@ async function simpleFetch(url: string, verbose = false): Promise<SimpleFetchRes
31
31
  const response = await fetch(url, {
32
32
  redirect: 'follow',
33
33
  headers: {
34
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
35
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
34
+ 'User-Agent':
35
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
36
+ Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
36
37
  'Accept-Language': 'en-US,en;q=0.5',
37
38
  },
38
39
  });
@@ -54,12 +55,7 @@ async function simpleFetch(url: string, verbose = false): Promise<SimpleFetchRes
54
55
  }
55
56
  }
56
57
 
57
- async function tryPlaywright(
58
- url: string,
59
- config: FetchiConfig,
60
- reason: string,
61
- verbose = false
62
- ): Promise<FetchResult> {
58
+ async function tryPlaywright(url: string, config: FetchiConfig, reason: string, verbose = false): Promise<FetchResult> {
63
59
  if (verbose) {
64
60
  console.error(`🎭 Trying Playwright (reason: ${reason})`);
65
61
  }
@@ -82,7 +78,7 @@ async function tryPlaywright(
82
78
  };
83
79
  }
84
80
 
85
- const quality = validateMarkdown(extracted.markdown!);
81
+ const quality = validateMarkdown(extracted.markdown!, { sourceHtmlLength: browserResult.html.length });
86
82
 
87
83
  if (quality.score < config.quality.minScore) {
88
84
  return {
@@ -114,9 +110,24 @@ export async function fetchUrl(
114
110
  verbose = false,
115
111
  forcePlaywright = false
116
112
  ): Promise<FetchResult> {
113
+ try {
114
+ const parsed = new URL(url);
115
+ if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
116
+ return {
117
+ success: false,
118
+ error: `Invalid URL protocol: ${parsed.protocol} — only http and https are supported`,
119
+ };
120
+ }
121
+ } catch {
122
+ return {
123
+ success: false,
124
+ error: `Invalid URL: ${url}`,
125
+ };
126
+ }
127
+
117
128
  if (forcePlaywright) {
118
129
  if (verbose) {
119
- console.error(`⚡ Force Playwright mode enabled`);
130
+ console.error('⚡ Force Playwright mode enabled');
120
131
  }
121
132
  return tryPlaywright(url, config, 'forced', verbose);
122
133
  }
@@ -139,10 +150,15 @@ export async function fetchUrl(
139
150
  return tryPlaywright(url, config, 'extraction_failed', verbose);
140
151
  }
141
152
 
142
- const quality = validateMarkdown(extracted.markdown!);
153
+ const quality = validateMarkdown(extracted.markdown!, { sourceHtmlLength: simpleResult.html.length });
143
154
 
144
155
  if (verbose) {
145
156
  console.error(`📊 Quality score: ${quality.score}/100`);
157
+ if (quality.issues.length > 0) {
158
+ for (const issue of quality.issues) {
159
+ console.error(` ⚠ ${issue}`);
160
+ }
161
+ }
146
162
  }
147
163
 
148
164
  if (quality.score >= config.quality.jsRetryThreshold) {
@@ -1,2 +1,2 @@
1
- export * from './types';
2
1
  export * from './manager';
2
+ export * from './types';
@@ -1,5 +1,5 @@
1
- import { chromium } from 'playwright-extra';
2
1
  import type { Browser } from 'playwright';
2
+ import { chromium } from 'playwright-extra';
3
3
  import stealth from 'puppeteer-extra-plugin-stealth';
4
4
  import type { PlaywrightConfig } from '../../config/schema';
5
5
  import type { BrowserManager } from './types';
@@ -10,28 +10,37 @@ let browserInstance: Browser | null = null;
10
10
 
11
11
  export class LocalBrowserManager implements BrowserManager {
12
12
  private config: PlaywrightConfig;
13
-
13
+
14
14
  constructor(config: PlaywrightConfig) {
15
15
  this.config = config;
16
16
  }
17
-
17
+
18
18
  async getBrowser(): Promise<Browser> {
19
19
  if (!browserInstance) {
20
- browserInstance = await chromium.launch({
20
+ browserInstance = await chromium.launch({
21
21
  headless: true,
22
22
  timeout: this.config.timeout,
23
+ args: [
24
+ '--disable-blink-features=AutomationControlled',
25
+ '--disable-features=IsolateOrigins,site-per-process',
26
+ '--disable-infobars',
27
+ '--no-first-run',
28
+ '--no-default-browser-check',
29
+ '--disable-background-networking',
30
+ '--disable-dev-shm-usage',
31
+ ],
23
32
  });
24
33
  }
25
34
  return browserInstance;
26
35
  }
27
-
36
+
28
37
  async closeBrowser(): Promise<void> {
29
38
  if (browserInstance) {
30
39
  await browserInstance.close();
31
40
  browserInstance = null;
32
41
  }
33
42
  }
34
-
43
+
35
44
  isDocker(): boolean {
36
45
  return false;
37
46
  }
@@ -1,6 +1,6 @@
1
1
  import type { PlaywrightConfig } from '../../config/schema';
2
- import type { BrowserManager, FetchWithBrowserResult } from './types';
3
2
  import { LocalBrowserManager } from './local';
3
+ import type { BrowserManager, FetchWithBrowserResult } from './types';
4
4
 
5
5
  let currentManager: BrowserManager | null = null;
6
6
 
@@ -14,6 +14,23 @@ export async function getBrowserManager(config: PlaywrightConfig): Promise<Brows
14
14
  return currentManager;
15
15
  }
16
16
 
17
+ /** Common desktop viewport sizes to rotate through for fingerprint diversity */
18
+ const VIEWPORTS = [
19
+ { width: 1920, height: 1080 },
20
+ { width: 1536, height: 864 },
21
+ { width: 1440, height: 900 },
22
+ { width: 1366, height: 768 },
23
+ { width: 1280, height: 720 },
24
+ ];
25
+
26
+ const TIMEZONES = ['America/New_York', 'America/Chicago', 'America/Denver', 'America/Los_Angeles'];
27
+
28
+ const LOCALES = ['en-US', 'en-US', 'en-US', 'en-GB'];
29
+
30
+ function pick<T>(arr: T[]): T {
31
+ return arr[Math.floor(Math.random() * arr.length)];
32
+ }
33
+
17
34
  export async function fetchWithBrowser(
18
35
  url: string,
19
36
  config: PlaywrightConfig,
@@ -21,35 +38,90 @@ export async function fetchWithBrowser(
21
38
  ): Promise<FetchWithBrowserResult> {
22
39
  const manager = await getBrowserManager(config);
23
40
  const browser = await manager.getBrowser();
24
-
41
+
42
+ const viewport = pick(VIEWPORTS);
43
+ const locale = pick(LOCALES);
44
+ const timezone = pick(TIMEZONES);
45
+
46
+ const context = await browser.newContext({
47
+ viewport,
48
+ locale,
49
+ timezoneId: timezone,
50
+ userAgent:
51
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
52
+ // Realistic browser headers
53
+ extraHTTPHeaders: {
54
+ 'Accept-Language': `${locale},en;q=0.9`,
55
+ 'Accept-Encoding': 'gzip, deflate, br',
56
+ 'Sec-CH-UA': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
57
+ 'Sec-CH-UA-Mobile': '?0',
58
+ 'Sec-CH-UA-Platform': '"macOS"',
59
+ 'Sec-Fetch-Dest': 'document',
60
+ 'Sec-Fetch-Mode': 'navigate',
61
+ 'Sec-Fetch-Site': 'none',
62
+ 'Sec-Fetch-User': '?1',
63
+ 'Upgrade-Insecure-Requests': '1',
64
+ },
65
+ // Pretend we have granted permissions a real user would have
66
+ permissions: ['geolocation'],
67
+ deviceScaleFactor: 2,
68
+ isMobile: false,
69
+ hasTouch: false,
70
+ javaScriptEnabled: true,
71
+ });
72
+
73
+ const page = await context.newPage();
74
+
25
75
  try {
26
- const context = await browser.newContext({
27
- userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
28
- });
29
- const page = await context.newPage();
30
-
76
+ // Override navigator properties that leak headless signals
77
+ await page.addInitScript(`
78
+ Object.defineProperty(navigator, 'webdriver', { get: () => false });
79
+ Object.defineProperty(navigator, 'plugins', {
80
+ get: () => [
81
+ { name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer' },
82
+ { name: 'Chrome PDF Viewer', filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai' },
83
+ { name: 'Native Client', filename: 'internal-nacl-plugin' },
84
+ ],
85
+ });
86
+ Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
87
+ Object.defineProperty(navigator, 'maxTouchPoints', { get: () => 0 });
88
+ if (typeof Notification !== 'undefined') {
89
+ Object.defineProperty(Notification, 'permission', { get: () => 'default' });
90
+ }
91
+ window.chrome = window.chrome || {};
92
+ window.chrome.runtime = window.chrome.runtime || {};
93
+ `);
94
+
31
95
  if (verbose) {
32
- console.error(`🎭 Playwright: Navigating to ${url}`);
96
+ console.error(
97
+ `🎭 Playwright: Navigating to ${url} (${viewport.width}x${viewport.height}, ${locale}, ${timezone})`
98
+ );
33
99
  }
34
-
100
+
101
+ // Small random delay to avoid machine-like timing patterns
102
+ await page.waitForTimeout(200 + Math.floor(Math.random() * 300));
103
+
35
104
  await page.goto(url, {
36
105
  waitUntil: config.waitStrategy,
37
106
  timeout: config.timeout,
38
107
  });
39
-
108
+
109
+ // Wait a bit after load for lazy-loaded content / hydration
110
+ await page.waitForTimeout(500 + Math.floor(Math.random() * 500));
111
+
40
112
  const html = await page.content();
41
-
113
+
42
114
  if (verbose) {
43
115
  console.error(`🎭 Playwright: Got ${html.length} chars of HTML`);
44
116
  }
45
-
46
- await page.close();
47
- await context.close();
48
117
 
49
118
  return { html };
50
119
  } catch (error) {
51
120
  const message = error instanceof Error ? error.message : String(error);
52
121
  return { html: '', error: message };
122
+ } finally {
123
+ await page.close();
124
+ await context.close();
53
125
  }
54
126
  }
55
127
 
@@ -9,44 +9,44 @@ export function cleanMarkdown(markdown: string): string {
9
9
  return markdown;
10
10
  }
11
11
 
12
- let cleaned = markdown
12
+ let cleaned = markdown;
13
13
 
14
- cleaned = cleaned.replace(/\r\n/g, '\n')
15
- cleaned = cleaned.replace(/\n{3,}/g, '\n\n')
16
- cleaned = cleaned.replace(/[ \t]+$/gm, '')
17
- cleaned = cleaned.trim()
18
- cleaned = cleaned.replace(/([^\n])\n(#{1,6} )/g, '$1\n\n$2')
19
- cleaned = cleaned.replace(/(#{1,6} .+)\n([^#\n])/g, '$1\n\n$2')
20
- cleaned = cleaned.replace(/([^\n])\n([-*+] |\d+\. )/g, '$1\n\n$2')
21
- cleaned = cleaned.replace(/(\*|_) +/g, '$1')
22
- cleaned = cleaned.replace(/ +(\*|_)/g, '$1')
23
- cleaned = cleaned.replace(/([^\n])\n```/g, '$1\n\n```')
24
- cleaned = cleaned.replace(/```\n([^`])/g, '```\n\n$1')
25
- cleaned = cleaned.replace(/<!--[\s\S]*?-->/g, ' ')
26
- cleaned = cleaned.replace(/ {2,}/g, ' ')
14
+ cleaned = cleaned.replace(/\r\n/g, '\n');
15
+ cleaned = cleaned.replace(/\n{3,}/g, '\n\n');
16
+ cleaned = cleaned.replace(/[ \t]+$/gm, '');
17
+ cleaned = cleaned.trim();
18
+ cleaned = cleaned.replace(/([^\n])\n(#{1,6} )/g, '$1\n\n$2');
19
+ cleaned = cleaned.replace(/(#{1,6} .+)\n([^#\n])/g, '$1\n\n$2');
20
+ cleaned = cleaned.replace(/([^\n])\n([-*+] |\d+\. )/g, '$1\n\n$2');
21
+ cleaned = cleaned.replace(/(\*|_) +/g, '$1');
22
+ cleaned = cleaned.replace(/ +(\*|_)/g, '$1');
23
+ cleaned = cleaned.replace(/([^\n])\n```/g, '$1\n\n```');
24
+ cleaned = cleaned.replace(/```\n([^`])/g, '```\n\n$1');
25
+ cleaned = cleaned.replace(/<!--[\s\S]*?-->/g, ' ');
26
+ cleaned = cleaned.replace(/ {2,}/g, ' ');
27
27
 
28
- return cleaned
28
+ return cleaned;
29
29
  }
30
30
 
31
31
  export function advancedClean(markdown: string): string {
32
- let cleaned = markdown
32
+ let cleaned = markdown;
33
33
 
34
- cleaned = cleaned.replace(/\[([^\]]+)\]\(\)/g, '$1')
35
- cleaned = cleaned.replace(/<[^>]+>/g, '')
36
- cleaned = cleaned.replace(/\*\*\*\*/g, '')
37
- cleaned = cleaned.replace(/(?<!\*)\*\*(?!\*)/g, '')
38
- cleaned = cleaned.replace(/__/g, '')
39
- cleaned = cleaned.replace(/!\[\]\(([^)]+)\)/g, '![]($1)')
40
- cleaned = cleaned.replace(/[\u200B-\u200D\uFEFF]/g, '')
41
- cleaned = cleaned.replace(/[\u201C\u201D]/g, '"')
42
- cleaned = cleaned.replace(/[\u2018\u2019]/g, "'")
43
- cleaned = cleaned.replace(/[\u2013\u2014]/g, '-')
34
+ cleaned = cleaned.replace(/\[([^\]]+)\]\(\)/g, '$1');
35
+ cleaned = cleaned.replace(/<[^>]+>/g, '');
36
+ cleaned = cleaned.replace(/\*\*\*\*/g, '');
37
+ cleaned = cleaned.replace(/(?<!\*)\*\*(?!\*)/g, '');
38
+ cleaned = cleaned.replace(/__/g, '');
39
+ cleaned = cleaned.replace(/!\[\]\(([^)]+)\)/g, '![]($1)');
40
+ cleaned = cleaned.replace(/[\u200B-\u200D\uFEFF]/g, '');
41
+ cleaned = cleaned.replace(/[\u201C\u201D]/g, '"');
42
+ cleaned = cleaned.replace(/[\u2018\u2019]/g, "'");
43
+ cleaned = cleaned.replace(/[\u2013\u2014]/g, '-');
44
44
 
45
45
  cleaned = cleaned.replace(/^(?!```)[^\n]*$/gm, (line) => {
46
- return line.replace(/ {2,}/g, ' ')
47
- })
46
+ return line.replace(/ {2,}/g, ' ');
47
+ });
48
48
 
49
- return cleaned
49
+ return cleaned;
50
50
  }
51
51
 
52
52
  export function finalCleanup(markdown: string): string {
@@ -54,16 +54,16 @@ export function finalCleanup(markdown: string): string {
54
54
  return markdown;
55
55
  }
56
56
 
57
- let cleaned = markdown
57
+ let cleaned = markdown;
58
58
 
59
- cleaned = cleaned.replace(/^(\s*)[*+] /gm, '$1- ')
60
- cleaned = cleaned.replace(/_([^_]+)_/g, '*$1*')
61
- cleaned = cleaned.replace(/^~~~(\w*)\n/gm, '```$1\n')
62
- cleaned = cleaned.replace(/^~~~$/gm, '```')
63
- cleaned = cleaned.replace(/\n{3,}/g, '\n\n')
64
- cleaned = `${cleaned.trim()}\n`
59
+ cleaned = cleaned.replace(/^(\s*)[*+] /gm, '$1- ');
60
+ cleaned = cleaned.replace(/_([^_]+)_/g, '*$1*');
61
+ cleaned = cleaned.replace(/^~~~(\w*)\n/gm, '```$1\n');
62
+ cleaned = cleaned.replace(/^~~~$/gm, '```');
63
+ cleaned = cleaned.replace(/\n{3,}/g, '\n\n');
64
+ cleaned = `${cleaned.trim()}\n`;
65
65
 
66
- return cleaned
66
+ return cleaned;
67
67
  }
68
68
 
69
69
  export function cleanMarkdownComplete(markdown: string): string {
@@ -71,9 +71,9 @@ export function cleanMarkdownComplete(markdown: string): string {
71
71
  return markdown;
72
72
  }
73
73
 
74
- let cleaned = cleanMarkdown(markdown)
75
- cleaned = advancedClean(cleaned)
76
- cleaned = finalCleanup(cleaned)
74
+ let cleaned = cleanMarkdown(markdown);
75
+ cleaned = advancedClean(cleaned);
76
+ cleaned = finalCleanup(cleaned);
77
77
 
78
- return cleaned
78
+ return cleaned;
79
79
  }