extract-from-sitemap 0.0.17 → 0.0.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. package/README.md +98 -2
  2. package/cli.js +89 -272
  3. package/mod.js +307 -0
  4. package/package.json +1 -1
package/README.md CHANGED
@@ -4,15 +4,111 @@ This repo allows you to create a static markdown bundle based on one or multiple
4
4
 
5
5
  1. Create a `llmtext.json` file in the root of your project. This is where you define the sources to extract from (a minimal sketch follows this list). For an example combining multiple sources, see [this example](https://github.com/janwilmake/parallel-llmtext/blob/main/llmtext.json).
6
6
  2. Run `npx extract-from-sitemap` (or add it to your `package.json` scripts, [like this](https://github.com/janwilmake/parallel-llmtext/blob/main/package.json))
7
- 3. Set up CI/CD in your repo to automatically update your extracted static files as often as needed. **Example coming soon**
7
+ 3. Set up CI/CD in your repo to automatically update your extracted static files as often as needed. See [CI/CD Setup](#cicd-setup) below.
8
8
  4. Use an agent-rewriter such as [next-agent-rewriter](../next-agent-rewriter) to rewrite agent requests to the appropriate static markdown files. It is also best practice to add a link in your HTML to signal that a markdown variant is available, like this: `<link rel="alternate" type="text/markdown" href="{path}.md" title="Docs" />`
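
A minimal `llmtext.json` might look like the following (a sketch using the field names from the config typedefs in `mod.js`; all values are placeholders):

```json
{
  "title": "My Project Docs",
  "description": "Static markdown bundle extracted from the public site",
  "outDir": "docs",
  "sources": [
    {
      "title": "Main site",
      "origin": "https://example.com",
      "outDir": "docs/main",
      "forceExtract": false
    }
  ]
}
```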
9
9
 
10
+ ## File overview
11
+
12
+ - `mod.js` - the root of the npm package, for using the sitemap extraction programmatically (see the sketch below)
13
+ - `cli.js` - the CLI, usable through `npx extract-from-sitemap`. Adds support for multiple sources and for reading from and writing to the file system
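
As a rough sketch of programmatic use (assuming `processLLMTextConfig` is reachable through the package entry point, i.e. `mod.js`, and that this file runs as an ES module):

```js
// Sketch: load llmtext.json, run the extraction, and write the results yourself.
// processLLMTextConfig(config, apiKey) resolves to { files, sources, stats };
// `files` maps output paths to { content } on success or { error } on failure.
import { processLLMTextConfig } from "extract-from-sitemap";
import fs from "node:fs";
import path from "node:path";

const config = JSON.parse(fs.readFileSync("llmtext.json", "utf8"));
const result = await processLLMTextConfig(config, process.env.PARALLEL_API_KEY);

for (const [filePath, item] of Object.entries(result.files)) {
  if (!item.content) continue; // skip entries that only carry an { error }
  fs.mkdirSync(path.dirname(filePath), { recursive: true });
  fs.writeFileSync(filePath, item.content);
}

console.log(result.stats); // { totalTokens, totalPages, totalErrors }
```

The CLI in `cli.js` does essentially the same thing, plus CI detection and API-key handling.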
14
+
15
+ ## CI/CD Setup
16
+
17
+ ### GitHub Actions
18
+
19
+ 1. Get your Parallel API key from [platform.parallel.ai](https://platform.parallel.ai)
20
+
21
+ 2. Add it as a repository secret:
22
+
23
+ - Go to your repository → Settings → Secrets and variables → Actions
24
+ - Click "New repository secret"
25
+ - Name: `PARALLEL_API_KEY`
26
+ - Value: Your API key from step 1
27
+
28
+ 3. Create `.github/workflows/extract-docs.yml`:
29
+
30
+ ```yaml
31
+ name: Extract Documentation
32
+
33
+ on:
34
+ schedule:
35
+ - cron: "0 0 * * *" # Daily at midnight UTC
36
+ workflow_dispatch: # Allow manual trigger
37
+
38
+ jobs:
39
+ extract:
40
+ runs-on: ubuntu-latest
41
+ steps:
42
+ - uses: actions/checkout@v4
43
+ - uses: actions/setup-node@v4
44
+ with:
45
+ node-version: "20"
46
+
47
+ - name: Extract documentation
48
+ env:
49
+ PARALLEL_API_KEY: ${{ secrets.PARALLEL_API_KEY }}
50
+ run: |
51
+ npm install -g extract-from-sitemap
52
+ npx extract-from-sitemap
53
+
54
+ - name: Commit changes
55
+ run: |
56
+ git config user.email "github-actions[bot]@users.noreply.github.com"
57
+ git config user.name "github-actions[bot]"
58
+ git add .
59
+ git diff --quiet && git diff --staged --quiet || \
60
+ (git commit -m "Update docs [skip ci]" && git push)
61
+ ```
62
+
63
+ ### GitLab CI
64
+
65
+ 1. Add `PARALLEL_API_KEY` as a CI/CD variable:
66
+
67
+ - Go to Settings → CI/CD → Variables
68
+ - Add variable with your API key
69
+ - Make sure "Protect variable" and "Mask variable" are checked
70
+
71
+ 2. Create `.gitlab-ci.yml`:
72
+
73
+ ```yaml
74
+ extract-docs:
75
+ image: node:20
76
+ script:
77
+ - npm install -g extract-from-sitemap
78
+ - npx extract-from-sitemap
79
+ - |
80
+ git config user.email "gitlab-ci@gitlab.com"
81
+ git config user.name "GitLab CI"
82
+ git add docs/
83
+ git diff --quiet && git diff --staged --quiet || \
84
+ (git commit -m "Update docs [skip ci]" && git push https://oauth2:${CI_JOB_TOKEN}@${CI_SERVER_HOST}/${CI_PROJECT_PATH}.git HEAD:${CI_COMMIT_REF_NAME})
85
+ only:
86
+ - schedules
87
+ - web
88
+ ```
89
+
90
+ ### Other CI Systems
91
+
92
+ The CLI automatically detects CI environments and will require the `PARALLEL_API_KEY` environment variable to be set. It will not attempt OAuth flow in CI environments.
93
+
94
+ Supported CI detection:
95
+
96
+ - GitHub Actions
97
+ - GitLab CI
98
+ - CircleCI
99
+ - Travis CI
100
+ - Jenkins
101
+ - Buildkite
102
+ - Drone
103
+ - Semaphore
104
+ - Any system with `CI=true` or `CONTINUOUS_INTEGRATION=true`
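
For a CI system that is not listed above, the job only needs the API key (and, if the runner does not set it already, a truthy `CI` flag) in the CLI's environment. A generic pseudo-YAML sketch, to be adapted to your runner's actual syntax:

```yaml
# Pseudo-config, not tied to any specific CI system: expose PARALLEL_API_KEY
# from the secret store, then run the CLI.
extract-docs:
  env:
    CI: "true"                                    # most runners set this themselves
    PARALLEL_API_KEY: ${SECRET_PARALLEL_API_KEY}  # placeholder secret reference
  script:
    - npx extract-from-sitemap
```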
105
+
10
106
  ## Known limitations
11
107
 
12
108
  This library is in active development. Known limitations:
13
109
 
14
110
  - Does not work for nested sitemaps
15
111
  - Does not work on sitemaps that are too large
16
- - Example to make it recurring is still missing
112
+ - Some CI systems may require additional git configuration
17
113
 
18
114
  I am working on addressing these issues.
package/cli.js CHANGED
@@ -7,30 +7,30 @@ const crypto = require("crypto");
7
7
  const http = require("http");
8
8
  const { URL, URLSearchParams } = require("url");
9
9
  const os = require("os");
10
- const { extractFromSitemap } = require("./mod.js");
11
-
12
- /**
13
- * @typedef {Object} SourceConfig
14
- * @property {string} title - The title for this source
15
- * @property {string} [origin] - The origin URL to process (optional)
16
- * @property {string} [outDir] - Output directory for this source's extracted files
17
- * @property {boolean} [forceExtract] - Whether to force extraction for this source
18
- * @property {boolean} [keepOriginalUrls] - Whether to keep original URL structure and not save files locally
19
- * @property {Array<{title: string, description: string, filename: string, url: string}>} [customUrls] - Custom URLs to extract for this source
20
- * @property {string} [titleRemovePattern] - Regex pattern to remove from titles (case-insensitive)
21
- */
22
- /**
23
- * @typedef {Object} Config
24
- * @property {string} title - Title of your document
25
- * @property {string} description - Description of the documentation collection
26
- * @property {string} [details] - Optional additional details about the collection
27
- * @property {string} outDir - Top-level output directory for combined llms.txt
28
- * @property {SourceConfig[]} sources - Array of source configurations
29
- */
10
+ const { processLLMTextConfig } = require("./mod.js");
30
11
 
31
12
  const CREDENTIALS_DIR = path.join(os.homedir(), ".llmtext");
32
13
  const API_KEY_FILE = path.join(CREDENTIALS_DIR, "api-key");
33
14
 
15
+ /**
16
+ * Detect if running in a CI environment
17
+ * @returns {boolean}
18
+ */
19
+ function isCI() {
20
+ return !!(
21
+ process.env.CI || // Generic CI flag
22
+ process.env.CONTINUOUS_INTEGRATION ||
23
+ process.env.GITHUB_ACTIONS ||
24
+ process.env.GITLAB_CI ||
25
+ process.env.CIRCLECI ||
26
+ process.env.TRAVIS ||
27
+ process.env.JENKINS_URL ||
28
+ process.env.BUILDKITE ||
29
+ process.env.DRONE ||
30
+ process.env.SEMAPHORE
31
+ );
32
+ }
33
+
34
34
  /**
35
35
  * OAuth handler for Parallel.ai API key authentication
36
36
  */
@@ -201,7 +201,7 @@ class OAuth {
201
201
 
202
202
  /**
203
203
  * Load configuration from llmtext.json
204
- * @returns {Promise<Config>} The configuration object
204
+ * @returns {Promise<any>} The configuration object
205
205
  */
206
206
  async function loadConfig() {
207
207
  const configPath = path.resolve("llmtext.json");
@@ -385,13 +385,13 @@ function loadStoredApiKey() {
385
385
  * @returns {Promise<string>} The API key
386
386
  */
387
387
  async function getApiKey() {
388
- // Check stored API key first
389
- const storedKey = loadStoredApiKey();
390
- if (storedKey) {
391
- return storedKey;
388
+ const inCI = isCI();
389
+
390
+ if (inCI) {
391
+ console.log("🔍 CI environment detected");
392
392
  }
393
393
 
394
- // Check environment variables
394
+ // Check environment variables first (most important for CI)
395
395
  let apiKey = process.env.PARALLEL_API_KEY;
396
396
 
397
397
  if (!apiKey && fs.existsSync(".env")) {
@@ -405,11 +405,33 @@ async function getApiKey() {
405
405
 
406
406
  if (apiKey) {
407
407
  console.log("🔑 Using API key from environment");
408
- storeApiKey(apiKey);
408
+ if (!inCI) {
409
+ storeApiKey(apiKey);
410
+ }
409
411
  return apiKey;
410
412
  }
411
413
 
412
- // No API key found, start OAuth flow
414
+ // In CI environments, we cannot do OAuth - require the env var
415
+ if (inCI) {
416
+ console.error("\n❌ No API key found in CI environment!");
417
+ console.error("\nPlease set the PARALLEL_API_KEY environment variable:");
418
+ console.error(" - For GitHub Actions: Add it as a repository secret");
419
+ console.error(" - For GitLab CI: Add it as a CI/CD variable");
420
+ console.error(
421
+ " - For other CI systems: Add it as an environment variable"
422
+ );
423
+ console.error("\nYou can get your API key from:");
424
+ console.error(" https://platform.parallel.ai");
425
+ process.exit(1);
426
+ }
427
+
428
+ // Check stored API key (only in non-CI environments)
429
+ const storedKey = loadStoredApiKey();
430
+ if (storedKey) {
431
+ return storedKey;
432
+ }
433
+
434
+ // No API key found, start OAuth flow (only in interactive environments)
413
435
  console.log("🔑 No API key found. Starting OAuth flow...");
414
436
  const oauth = new OAuth();
415
437
  const newApiKey = await oauth.getApiKey();
@@ -418,129 +440,6 @@ async function getApiKey() {
418
440
  return newApiKey;
419
441
  }
420
442
 
421
- /**
422
- * Process custom URLs through extraction API
423
- * @param {Array<{title: string, description: string, filename: string, url: string}>} customUrls - Custom URLs to process
424
- * @param {string} apiKey - API key for authentication
425
- * @returns {Promise<Record<string, any>>} Extracted files
426
- */
427
- async function processCustomUrls(customUrls, apiKey) {
428
- const files = {};
429
-
430
- for (const customUrl of customUrls) {
431
- console.log(`📄 Processing custom URL: ${customUrl.url}`);
432
-
433
- try {
434
- const response = await fetch("https://api.parallel.ai/v1beta/extract", {
435
- method: "POST",
436
- headers: {
437
- "Content-Type": "application/json",
438
- "parallel-beta": "search-extract-2025-10-10",
439
- "x-api-key": apiKey,
440
- },
441
- body: JSON.stringify({
442
- urls: [customUrl.url],
443
- full_content: true,
444
- }),
445
- });
446
-
447
- if (response.ok) {
448
- const result = await response.json();
449
- if (result.results && result.results.length > 0) {
450
- const extracted = result.results[0];
451
- const filename = customUrl.filename + ".md";
452
-
453
- files[filename] = {
454
- content: extracted.full_content || "",
455
- title: customUrl.title,
456
- description: customUrl.description,
457
- extracted: true,
458
- publishedDate: extracted.published_date || "",
459
- status: 200,
460
- tokens: Math.round((extracted.full_content || "").length / 5),
461
- originalUrl: customUrl.url,
462
- };
463
- }
464
- } else {
465
- throw new Error(`${response.status} - ${await response.statusText()}`);
466
- }
467
- } catch (error) {
468
- console.error(
469
- `❌ Error processing custom URL ${customUrl.url}:`,
470
- error.message
471
- );
472
- }
473
- }
474
-
475
- return files;
476
- }
477
-
478
- /**
479
- * Get path prefix for links in llms.txt
480
- * @param {string} topLevelOutDir - Top-level output directory
481
- * @param {string} sourceOutDir - Source-specific output directory
482
- * @returns {string} Path prefix for links
483
- */
484
- function getPathPrefix(topLevelOutDir, sourceOutDir) {
485
- const resolvedTopLevel = path.resolve(topLevelOutDir);
486
- const resolvedSource = path.resolve(sourceOutDir);
487
-
488
- if (resolvedSource === resolvedTopLevel) {
489
- return "";
490
- }
491
-
492
- const relativePath = path.relative(resolvedTopLevel, resolvedSource);
493
- return relativePath || "";
494
- }
495
-
496
- /**
497
- * Generate combined llms.txt from all sources
498
- * @param {string} title - Top-level title
499
- * @param {string} description - Top-level description
500
- * @param {string} [details] - Optional top-level details
501
- * @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean, pathPrefix: string}>} allSources - All processed sources
502
- * @returns {string} Combined llms.txt content
503
- */
504
- function generateCombinedLlmsTxt(title, description, details, allSources) {
505
- let combinedTxt = `# ${title}\n\n> ${description}\n\n`;
506
-
507
- if (details) {
508
- combinedTxt += `${details}\n\n`;
509
- }
510
-
511
- for (const source of allSources) {
512
- combinedTxt += `## ${source.title}\n\n`;
513
-
514
- // Sort files by path for consistent ordering
515
- const sortedFiles = Object.entries(source.files).sort(([a], [b]) =>
516
- a.localeCompare(b)
517
- );
518
-
519
- for (const [path, file] of sortedFiles) {
520
- if (file.content || file.title) {
521
- const title = file.title || path.replace(".md", "");
522
- const description = file.description
523
- ? `: ${file.description.replaceAll("\n", " ")}`
524
- : "";
525
-
526
- // Generate link based on keepOriginalUrls and pathPrefix
527
- let link;
528
- if (source.keepOriginalUrls) {
529
- link = file.originalUrl;
530
- } else {
531
- link = source.pathPrefix + (path.startsWith("/") ? path : "/" + path);
532
- }
533
-
534
- combinedTxt += `- [${title}](${link})${description}\n`;
535
- }
536
- }
537
-
538
- combinedTxt += "\n";
539
- }
540
-
541
- return combinedTxt;
542
- }
543
-
544
443
  /**
545
444
  * Clear stored API key credentials
546
445
  */
@@ -557,6 +456,33 @@ async function clearCredentials() {
557
456
  }
558
457
  }
559
458
 
459
+ /**
460
+ * Write file hierarchy to disk
461
+ * @param {Record<string, {content?: string, error?: string}>} fileHierarchy - File hierarchy to write
462
+ */
463
+ function writeFileHierarchy(fileHierarchy) {
464
+ for (const [filePath, item] of Object.entries(fileHierarchy)) {
465
+ try {
466
+ const resolvedPath = path.resolve(filePath);
467
+ const fileDir = path.dirname(resolvedPath);
468
+
469
+ // Create directory if it doesn't exist
470
+ fs.mkdirSync(fileDir, { recursive: true });
471
+
472
+ if (item.content) {
473
+ fs.writeFileSync(resolvedPath, item.content);
474
+ console.log(`📝 Wrote: ${filePath}`);
475
+ } else if (item.error) {
476
+ console.error(`❌ Error for ${filePath}: ${item.error}`);
477
+ }
478
+ } catch (error) {
479
+ console.error(
480
+ `❌ Failed to write ${filePath}: ${error.message || "Unknown error"}`
481
+ );
482
+ }
483
+ }
484
+ }
485
+
560
486
  /**
561
487
  * Main function
562
488
  */
@@ -574,131 +500,22 @@ async function main() {
574
500
  const config = await loadConfig();
575
501
  const apiKey = await getApiKey();
576
502
 
577
- // Ensure top-level output directory exists
578
- fs.mkdirSync(config.outDir, { recursive: true });
579
-
580
- const allSources = [];
581
- let totalTokens = 0;
582
- let totalPages = 0;
583
- let totalErrors = 0;
584
-
585
- // Process each source
586
- for (const [sourceIndex, sourceConfig] of config.sources.entries()) {
587
- const sourceName = `${sourceConfig.title} (source ${sourceIndex + 1})`;
588
-
589
- console.log(
590
- `\n🌐 Processing ${sourceName} (forceExtract: ${sourceConfig.forceExtract}, keepOriginalUrls: ${sourceConfig.keepOriginalUrls})`
591
- );
592
-
593
- // Ensure source output directory exists (if not keeping original URLs)
594
- if (!sourceConfig.keepOriginalUrls) {
595
- fs.mkdirSync(sourceConfig.outDir, { recursive: true });
596
- }
597
-
598
- let sourceFiles = {};
599
-
600
- try {
601
- // Process origin if provided
602
- if (sourceConfig.origin) {
603
- const result = await extractFromSitemap(
604
- sourceConfig.origin,
605
- sourceConfig.forceExtract,
606
- apiKey,
607
- sourceConfig.titleRemovePattern
608
- );
609
-
610
- console.log(
611
- `✅ Extracted ${result.totalPages} pages with ${result.totalTokens} tokens`
612
- );
613
- if (result.errors > 0) {
614
- console.log(`⚠️ ${result.errors} errors occurred`);
615
- }
616
-
617
- sourceFiles = result.files;
618
- totalTokens += result.totalTokens;
619
- totalPages += result.totalPages;
620
- totalErrors += result.errors;
621
- }
622
-
623
- // Process custom URLs for this source
624
- if (sourceConfig.customUrls && sourceConfig.customUrls.length > 0) {
625
- console.log(
626
- `📋 Processing ${sourceConfig.customUrls.length} custom URLs for this source...`
627
- );
628
- const customFiles = await processCustomUrls(
629
- sourceConfig.customUrls,
630
- apiKey
631
- );
632
-
633
- // Merge custom files with sitemap files
634
- sourceFiles = { ...sourceFiles, ...customFiles };
635
-
636
- for (const file of Object.values(customFiles)) {
637
- totalTokens += file.tokens;
638
- totalPages++;
639
- }
640
- }
503
+ console.log("\n🔄 Processing LLMText configuration...");
641
504
 
642
- // Write files to source directory (only if not keeping original URLs)
643
- if (!sourceConfig.keepOriginalUrls) {
644
- for (const [filePath, file] of Object.entries(sourceFiles)) {
645
- let filename = filePath.startsWith("/")
646
- ? filePath.slice(1)
647
- : filePath;
505
+ // Process the entire config using the new function
506
+ const result = await processLLMTextConfig(config, apiKey);
648
507
 
649
- const fullFilePath = path.join(sourceConfig.outDir, filename);
650
- const fileDir = path.dirname(fullFilePath);
651
-
652
- fs.mkdirSync(fileDir, { recursive: true });
653
- fs.writeFileSync(fullFilePath, file.content);
654
-
655
- console.log(
656
- `📝 Wrote: ${path.join(sourceConfig.outDir, filename)} (${
657
- file.tokens
658
- } tokens)`
659
- );
660
- }
661
- } else {
662
- console.log(
663
- `📋 Keeping original URLs - not saving files locally for ${sourceName}`
664
- );
665
- }
666
-
667
- // Calculate path prefix for this source
668
- const pathPrefix = sourceConfig.keepOriginalUrls
669
- ? ""
670
- : getPathPrefix(config.outDir, sourceConfig.outDir);
671
-
672
- // Add to all sources for combined llms.txt
673
- allSources.push({
674
- title: sourceConfig.title,
675
- files: sourceFiles,
676
- keepOriginalUrls: sourceConfig.keepOriginalUrls,
677
- pathPrefix: pathPrefix,
678
- });
679
- } catch (error) {
680
- console.error(`❌ Error processing ${sourceName}:`, error.message);
681
- totalErrors++;
682
- }
683
- }
684
-
685
- // Generate and write combined llms.txt to top-level outDir
686
- if (allSources.length > 0) {
687
- const combinedLlmsTxt = generateCombinedLlmsTxt(
688
- config.title,
689
- config.description,
690
- config.details,
691
- allSources
692
- );
693
- const combinedLlmsTxtPath = path.join(config.outDir, "llms.txt");
694
- fs.writeFileSync(combinedLlmsTxtPath, combinedLlmsTxt);
695
- console.log(`\n📋 Generated combined llms.txt: ${combinedLlmsTxtPath}`);
696
- }
508
+ // Write all files to disk
509
+ console.log("\n📁 Writing files to disk...");
510
+ writeFileHierarchy(result.files);
697
511
 
512
+ // Print summary
698
513
  console.log("\n✨ Extraction completed!");
699
- console.log(`📊 Total: ${totalPages} pages, ${totalTokens} tokens`);
700
- if (totalErrors > 0) {
701
- console.log(`⚠️ Errors: ${totalErrors}`);
514
+ console.log(
515
+ `📊 Total: ${result.stats.totalPages} pages, ${result.stats.totalTokens} tokens`
516
+ );
517
+ if (result.stats.totalErrors > 0) {
518
+ console.log(`⚠️ Errors: ${result.stats.totalErrors}`);
702
519
  }
703
520
  console.log(
704
521
  `📁 Top-level output directory: ${path.resolve(config.outDir)}`
package/mod.js CHANGED
@@ -22,6 +22,40 @@
22
22
  * @property {number} fetchCount - Number of fetch operations performed
23
23
  */
24
24
 
25
+ /**
26
+ * @typedef {Object} SourceConfig
27
+ * @property {string} title - The title for this source
28
+ * @property {string} [origin] - The origin URL to process (optional)
29
+ * @property {string} [outDir] - Output directory for this source's extracted files
30
+ * @property {boolean} [forceExtract] - Whether to force extraction for this source
31
+ * @property {boolean} [keepOriginalUrls] - Whether to keep original URL structure and not save files locally
32
+ * @property {Array<{title: string, description: string, filename: string, url: string}>} [customUrls] - Custom URLs to extract for this source
33
+ * @property {string} [titleRemovePattern] - Regex pattern to remove from titles (case-insensitive)
34
+ */
35
+
36
+ /**
37
+ * @typedef {Object} LLMTextConfig
38
+ * @property {string} title - Title of your document
39
+ * @property {string} description - Description of the documentation collection
40
+ * @property {string} [details] - Optional additional details about the collection
41
+ * @property {string} outDir - Top-level output directory for combined llms.txt
42
+ * @property {SourceConfig[]} sources - Array of source configurations
43
+ */
44
+
45
+ /**
46
+ * @typedef {Object} FileHierarchyItem
47
+ * @property {string} [content] - File content if successful
48
+ * @property {string} [error] - Error message if failed
49
+ */
50
+
51
+ /**
52
+ * @typedef {Object} ProcessedSource
53
+ * @property {string} title - Source title
54
+ * @property {Record<string, FileResult>} files - Extracted files
55
+ * @property {boolean} keepOriginalUrls - Whether to keep original URLs
56
+ * @property {string} pathPrefix - Path prefix for links
57
+ */
58
+
25
59
  /**
26
60
  * Extract content from sitemap URLs with markdown variant detection
27
61
  * @param {string} origin - The origin URL to extract from
@@ -176,6 +210,279 @@ export async function extractFromSitemap(
176
210
  };
177
211
  }
178
212
 
213
+ /**
214
+ * Process custom URLs through extraction API
215
+ * @param {Array<{title: string, description: string, filename: string, url: string}>} customUrls - Custom URLs to process
216
+ * @param {string} apiKey - API key for authentication
217
+ * @returns {Promise<Record<string, FileResult>>} Extracted files
218
+ */
219
+ export async function processCustomUrls(customUrls, apiKey) {
220
+ const files = {};
221
+
222
+ for (const customUrl of customUrls) {
223
+ try {
224
+ const response = await fetch("https://api.parallel.ai/v1beta/extract", {
225
+ method: "POST",
226
+ headers: {
227
+ "Content-Type": "application/json",
228
+ "parallel-beta": "search-extract-2025-10-10",
229
+ "x-api-key": apiKey,
230
+ },
231
+ body: JSON.stringify({
232
+ urls: [customUrl.url],
233
+ full_content: true,
234
+ }),
235
+ });
236
+
237
+ if (response.ok) {
238
+ const result = await response.json();
239
+ if (result.results && result.results.length > 0) {
240
+ const extracted = result.results[0];
241
+ const filename = customUrl.filename + ".md";
242
+
243
+ files[filename] = {
244
+ content: extracted.full_content || "",
245
+ title: customUrl.title,
246
+ description: customUrl.description,
247
+ extracted: true,
248
+ publishedDate: extracted.published_date || "",
249
+ status: 200,
250
+ tokens: Math.round((extracted.full_content || "").length / 5),
251
+ originalUrl: customUrl.url,
252
+ };
253
+ }
254
+ } else {
255
+ throw new Error(`${response.status} - ${response.statusText}`);
256
+ }
257
+ } catch (error) {
258
+ const filename = customUrl.filename + ".md";
259
+ files[filename] = {
260
+ error: error instanceof Error ? error.message : "Unknown error",
261
+ content: "",
262
+ title: customUrl.title,
263
+ description: customUrl.description,
264
+ extracted: false,
265
+ status: 0,
266
+ tokens: 0,
267
+ publishedDate: "",
268
+ originalUrl: customUrl.url,
269
+ };
270
+ }
271
+ }
272
+
273
+ return files;
274
+ }
275
+
276
+ /**
277
+ * Process LLMText config and generate file hierarchy
278
+ * @param {LLMTextConfig} config - The LLMText configuration
279
+ * @param {string} apiKey - Parallel API key
280
+ * @returns {Promise<{files: Record<string, FileHierarchyItem>, sources: ProcessedSource[], stats: {totalTokens: number, totalPages: number, totalErrors: number}}>}
281
+ */
282
+ export async function processLLMTextConfig(config, apiKey) {
283
+ const allSources = [];
284
+ let totalTokens = 0;
285
+ let totalPages = 0;
286
+ let totalErrors = 0;
287
+
288
+ // Process each source
289
+ for (const sourceConfig of config.sources) {
290
+ let sourceFiles = {};
291
+
292
+ try {
293
+ // Process origin if provided
294
+ if (sourceConfig.origin) {
295
+ const result = await extractFromSitemap(
296
+ sourceConfig.origin,
297
+ sourceConfig.forceExtract || false,
298
+ apiKey,
299
+ sourceConfig.titleRemovePattern
300
+ );
301
+
302
+ sourceFiles = result.files;
303
+ totalTokens += result.totalTokens;
304
+ totalPages += result.totalPages;
305
+ totalErrors += result.errors;
306
+ }
307
+
308
+ // Process custom URLs for this source
309
+ if (sourceConfig.customUrls && sourceConfig.customUrls.length > 0) {
310
+ const customFiles = await processCustomUrls(
311
+ sourceConfig.customUrls,
312
+ apiKey
313
+ );
314
+
315
+ // Merge custom files with sitemap files
316
+ sourceFiles = { ...sourceFiles, ...customFiles };
317
+
318
+ for (const file of Object.values(customFiles)) {
319
+ totalTokens += file.tokens;
320
+ totalPages++;
321
+ if (file.error) totalErrors++;
322
+ }
323
+ }
324
+
325
+ // Calculate path prefix for this source
326
+ const pathPrefix = sourceConfig.keepOriginalUrls
327
+ ? ""
328
+ : getPathPrefix(config.outDir, sourceConfig.outDir || config.outDir);
329
+
330
+ // Add to all sources
331
+ allSources.push({
332
+ title: sourceConfig.title,
333
+ files: sourceFiles,
334
+ keepOriginalUrls: sourceConfig.keepOriginalUrls || false,
335
+ pathPrefix: pathPrefix,
336
+ outDir: sourceConfig.outDir || config.outDir,
337
+ });
338
+ } catch (error) {
339
+ totalErrors++;
340
+ // Add empty source with error
341
+ allSources.push({
342
+ title: sourceConfig.title,
343
+ files: {
344
+ error: {
345
+ error: error instanceof Error ? error.message : "Unknown error",
346
+ content: "",
347
+ title: "",
348
+ description: "",
349
+ extracted: false,
350
+ status: 0,
351
+ tokens: 0,
352
+ publishedDate: "",
353
+ originalUrl: "",
354
+ },
355
+ },
356
+ keepOriginalUrls: sourceConfig.keepOriginalUrls || false,
357
+ pathPrefix: "",
358
+ outDir: sourceConfig.outDir || config.outDir,
359
+ });
360
+ }
361
+ }
362
+
363
+ // Generate file hierarchy
364
+ const fileHierarchy = {};
365
+
366
+ // Add source files
367
+ for (const source of allSources) {
368
+ if (!source.keepOriginalUrls) {
369
+ for (const [filePath, file] of Object.entries(source.files)) {
370
+ let filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
371
+ const fullPath = `${source.outDir}/${filename}`;
372
+
373
+ fileHierarchy[fullPath] = file.error
374
+ ? { error: file.error }
375
+ : { content: file.content };
376
+ }
377
+ }
378
+ }
379
+
380
+ // Generate combined llms.txt
381
+ const combinedLlmsTxt = generateCombinedLlmsTxt(
382
+ config.title,
383
+ config.description,
384
+ config.details,
385
+ allSources
386
+ );
387
+
388
+ fileHierarchy[`${config.outDir}/llms.txt`] = {
389
+ content: combinedLlmsTxt,
390
+ };
391
+
392
+ return {
393
+ files: fileHierarchy,
394
+ sources: allSources,
395
+ stats: {
396
+ totalTokens,
397
+ totalPages,
398
+ totalErrors,
399
+ },
400
+ };
401
+ }
402
+
403
+ /**
404
+ * Generate combined llms.txt from all sources
405
+ * @param {string} title - Top-level title
406
+ * @param {string} description - Top-level description
407
+ * @param {string} [details] - Optional top-level details
408
+ * @param {ProcessedSource[]} allSources - All processed sources
409
+ * @returns {string} Combined llms.txt content
410
+ */
411
+ function generateCombinedLlmsTxt(title, description, details, allSources) {
412
+ let combinedTxt = `# ${title}\n\n> ${description}\n\n`;
413
+
414
+ if (details) {
415
+ combinedTxt += `${details}\n\n`;
416
+ }
417
+
418
+ for (const source of allSources) {
419
+ combinedTxt += `## ${source.title}\n\n`;
420
+
421
+ // Sort files by path for consistent ordering
422
+ const sortedFiles = Object.entries(source.files).sort(([a], [b]) =>
423
+ a.localeCompare(b)
424
+ );
425
+
426
+ for (const [path, file] of sortedFiles) {
427
+ if (file.content || file.title) {
428
+ const title = file.title || path.replace(".md", "");
429
+ const description = file.description
430
+ ? `: ${file.description.replaceAll("\n", " ")}`
431
+ : "";
432
+
433
+ // Generate link based on keepOriginalUrls and pathPrefix
434
+ let link;
435
+ if (source.keepOriginalUrls) {
436
+ link = file.originalUrl;
437
+ } else {
438
+ link = source.pathPrefix + (path.startsWith("/") ? path : "/" + path);
439
+ }
440
+
441
+ combinedTxt += `- [${title}](${link})${description}\n`;
442
+ }
443
+ }
444
+
445
+ combinedTxt += "\n";
446
+ }
447
+
448
+ return combinedTxt;
449
+ }
450
+
451
+ /**
452
+ * Get path prefix for links in llms.txt
453
+ * @param {string} topLevelOutDir - Top-level output directory
454
+ * @param {string} sourceOutDir - Source-specific output directory
455
+ * @returns {string} Path prefix for links
456
+ */
457
+ function getPathPrefix(topLevelOutDir, sourceOutDir) {
458
+ // Normalize paths for comparison
459
+ const normalizeSlashes = (p) => p.replace(/\\/g, "/");
460
+ const normalizedTop = normalizeSlashes(topLevelOutDir);
461
+ const normalizedSource = normalizeSlashes(sourceOutDir);
462
+
463
+ if (normalizedSource === normalizedTop) {
464
+ return "";
465
+ }
466
+
467
+ // Calculate relative path
468
+ const topParts = normalizedTop.split("/").filter(Boolean);
469
+ const sourceParts = normalizedSource.split("/").filter(Boolean);
470
+
471
+ // Find common prefix
472
+ let commonLength = 0;
473
+ while (
474
+ commonLength < topParts.length &&
475
+ commonLength < sourceParts.length &&
476
+ topParts[commonLength] === sourceParts[commonLength]
477
+ ) {
478
+ commonLength++;
479
+ }
480
+
481
+ // Build relative path
482
+ const relativeParts = sourceParts.slice(commonLength);
483
+ return relativeParts.length > 0 ? relativeParts.join("/") : "";
484
+ }
485
+
179
486
  /**
180
487
  * Clean title by removing custom pattern if provided
181
488
  * @param {string} title - Original title
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "extract-from-sitemap",
3
3
  "bin": "cli.js",
4
- "version": "0.0.17",
4
+ "version": "0.0.19",
5
5
  "main": "mod.js",
6
6
  "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
7
7
  "files": [