extract-from-sitemap 0.0.17 → 0.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +98 -2
- package/cli.js +89 -272
- package/mod.js +307 -0
- package/package.json +1 -1
package/README.md
CHANGED
@@ -4,15 +4,111 @@ This repo allows you to create a static markdown bundle based on one or multiple

 1. Create a `llmtext.json` file in the root of your project. This is where you define your sources to be extracted from. For an example combining multiple sources, see [this example](https://github.com/janwilmake/parallel-llmtext/blob/main/llmtext.json).
 2. Run `npx extract-from-sitemap` (or add it to your `package.json` scripts, [like this](https://github.com/janwilmake/parallel-llmtext/blob/main/package.json))
-3. Set up CI/CD in your repo to automatically update your extracted static files as often as needed.
+3. Set up CI/CD in your repo to automatically update your extracted static files as often as needed. See [CI/CD Setup](#cicd-setup) below.
 4. Use an agent-rewriter such as [next-agent-rewriter](../next-agent-rewriter) to rewrite agent requests to the appropriate static markdown files. In addition, it's best practice to add a link in your html to show the markdown variant is available, like this: `<link rel="alternate" type="text/markdown" href="{path}.md" title="Docs" />`

+## File overview
+
+- `mod.js` - the root of the npm package, for using the sitemap extraction programmatically
+- `cli.js` - the CLI, usable through `npx extract-from-sitemap`. Adds support for multiple sources and for reading from and writing to the file system
+
+## CI/CD Setup
+
+### GitHub Actions
+
+1. Get your Parallel API key from [platform.parallel.ai](https://platform.parallel.ai)
+
+2. Add it as a repository secret:
+
+   - Go to your repository → Settings → Secrets and variables → Actions
+   - Click "New repository secret"
+   - Name: `PARALLEL_API_KEY`
+   - Value: Your API key from step 1
+
+3. Create `.github/workflows/extract-docs.yml`:
+
+```yaml
+name: Extract Documentation
+
+on:
+  schedule:
+    - cron: "0 0 * * *" # Daily at midnight UTC
+  workflow_dispatch: # Allow manual trigger
+
+jobs:
+  extract:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+
+      - name: Extract documentation
+        env:
+          PARALLEL_API_KEY: ${{ secrets.PARALLEL_API_KEY }}
+        run: |
+          npm install -g extract-from-sitemap
+          npx extract-from-sitemap
+
+      - name: Commit changes
+        run: |
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+          git config user.name "github-actions[bot]"
+          git add .
+          git diff --quiet && git diff --staged --quiet || \
+          (git commit -m "Update docs [skip ci]" && git push)
+```
+
+### GitLab CI
+
+1. Add `PARALLEL_API_KEY` as a CI/CD variable:
+
+   - Go to Settings → CI/CD → Variables
+   - Add variable with your API key
+   - Make sure "Protect variable" and "Mask variable" are checked
+
+2. Create `.gitlab-ci.yml`:
+
+```yaml
+extract-docs:
+  image: node:20
+  script:
+    - npm install -g extract-from-sitemap
+    - npx extract-from-sitemap
+    - |
+      git config user.email "gitlab-ci@gitlab.com"
+      git config user.name "GitLab CI"
+      git add docs/
+      git diff --quiet && git diff --staged --quiet || \
+      (git commit -m "Update docs [skip ci]" && git push https://oauth2:${CI_JOB_TOKEN}@${CI_SERVER_HOST}/${CI_PROJECT_PATH}.git HEAD:${CI_COMMIT_REF_NAME})
+  only:
+    - schedules
+    - web
+```
+
+### Other CI Systems
+
+The CLI automatically detects CI environments and will require the `PARALLEL_API_KEY` environment variable to be set. It will not attempt an OAuth flow in CI environments.
+
+Supported CI detection:
+
+- GitHub Actions
+- GitLab CI
+- CircleCI
+- Travis CI
+- Jenkins
+- Buildkite
+- Drone
+- Semaphore
+- Any system with `CI=true` or `CONTINUOUS_INTEGRATION=true`
+
 ## Known limitations

 This library is in active development. Known limitations:

 - Does not work for nested sitemaps
 - Does not work on sitemaps that are too large
-
+- Some CI systems may require additional git configuration

 I am working on addressing these issues.
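A note on the README changes above: the `llmtext.json` referenced in step 1 is not included in this diff, but its shape can be read off the `LLMTextConfig` and `SourceConfig` typedefs added to `mod.js` below. A minimal sketch, with all values hypothetical:

```json
{
  "title": "My Docs",
  "description": "Combined documentation for my project",
  "outDir": "docs",
  "sources": [
    {
      "title": "Example Site",
      "origin": "https://example.com",
      "outDir": "docs/example",
      "forceExtract": false,
      "titleRemovePattern": " \\| Example Site$"
    }
  ]
}
```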
package/cli.js
CHANGED
@@ -7,30 +7,30 @@ const crypto = require("crypto");
 const http = require("http");
 const { URL, URLSearchParams } = require("url");
 const os = require("os");
-const {
-
-/**
- * @typedef {Object} SourceConfig
- * @property {string} title - The title for this source
- * @property {string} [origin] - The origin URL to process (optional)
- * @property {string} [outDir] - Output directory for this source's extracted files
- * @property {boolean} [forceExtract] - Whether to force extraction for this source
- * @property {boolean} [keepOriginalUrls] - Whether to keep original URL structure and not save files locally
- * @property {Array<{title: string, description: string, filename: string, url: string}>} [customUrls] - Custom URLs to extract for this source
- * @property {string} [titleRemovePattern] - Regex pattern to remove from titles (case-insensitive)
- */
-/**
- * @typedef {Object} Config
- * @property {string} title - Title of your document
- * @property {string} description - Description of the documentation collection
- * @property {string} [details] - Optional additional details about the collection
- * @property {string} outDir - Top-level output directory for combined llms.txt
- * @property {SourceConfig[]} sources - Array of source configurations
- */
+const { processLLMTextConfig } = require("./mod.js");

 const CREDENTIALS_DIR = path.join(os.homedir(), ".llmtext");
 const API_KEY_FILE = path.join(CREDENTIALS_DIR, "api-key");

+/**
+ * Detect if running in a CI environment
+ * @returns {boolean}
+ */
+function isCI() {
+  return !!(
+    process.env.CI || // Generic CI flag
+    process.env.CONTINUOUS_INTEGRATION ||
+    process.env.GITHUB_ACTIONS ||
+    process.env.GITLAB_CI ||
+    process.env.CIRCLECI ||
+    process.env.TRAVIS ||
+    process.env.JENKINS_URL ||
+    process.env.BUILDKITE ||
+    process.env.DRONE ||
+    process.env.SEMAPHORE
+  );
+}
+
 /**
  * OAuth handler for Parallel.ai API key authentication
  */

@@ -201,7 +201,7 @@ class OAuth {

 /**
  * Load configuration from llmtext.json
- * @returns {Promise<
+ * @returns {Promise<any>} The configuration object
  */
 async function loadConfig() {
   const configPath = path.resolve("llmtext.json");

@@ -385,13 +385,13 @@ function loadStoredApiKey() {
  * @returns {Promise<string>} The API key
  */
 async function getApiKey() {
-
-
-  if (
-
+  const inCI = isCI();
+
+  if (inCI) {
+    console.log("🔍 CI environment detected");
   }

-  // Check environment variables
+  // Check environment variables first (most important for CI)
   let apiKey = process.env.PARALLEL_API_KEY;

   if (!apiKey && fs.existsSync(".env")) {

@@ -405,11 +405,33 @@ async function getApiKey() {

   if (apiKey) {
     console.log("🔑 Using API key from environment");
-
+    if (!inCI) {
+      storeApiKey(apiKey);
+    }
     return apiKey;
   }

-  //
+  // In CI environments, we cannot do OAuth - require the env var
+  if (inCI) {
+    console.error("\n❌ No API key found in CI environment!");
+    console.error("\nPlease set the PARALLEL_API_KEY environment variable:");
+    console.error("  - For GitHub Actions: Add it as a repository secret");
+    console.error("  - For GitLab CI: Add it as a CI/CD variable");
+    console.error(
+      "  - For other CI systems: Add it as an environment variable"
+    );
+    console.error("\nYou can get your API key from:");
+    console.error("  https://platform.parallel.ai");
+    process.exit(1);
+  }
+
+  // Check stored API key (only in non-CI environments)
+  const storedKey = loadStoredApiKey();
+  if (storedKey) {
+    return storedKey;
+  }
+
+  // No API key found, start OAuth flow (only in interactive environments)
   console.log("🔑 No API key found. Starting OAuth flow...");
   const oauth = new OAuth();
   const newApiKey = await oauth.getApiKey();

@@ -418,129 +440,6 @@ async function getApiKey() {
   return newApiKey;
 }

-/**
- * Process custom URLs through extraction API
- * @param {Array<{title: string, description: string, filename: string, url: string}>} customUrls - Custom URLs to process
- * @param {string} apiKey - API key for authentication
- * @returns {Promise<Record<string, any>>} Extracted files
- */
-async function processCustomUrls(customUrls, apiKey) {
-  const files = {};
-
-  for (const customUrl of customUrls) {
-    console.log(`📄 Processing custom URL: ${customUrl.url}`);
-
-    try {
-      const response = await fetch("https://api.parallel.ai/v1beta/extract", {
-        method: "POST",
-        headers: {
-          "Content-Type": "application/json",
-          "parallel-beta": "search-extract-2025-10-10",
-          "x-api-key": apiKey,
-        },
-        body: JSON.stringify({
-          urls: [customUrl.url],
-          full_content: true,
-        }),
-      });
-
-      if (response.ok) {
-        const result = await response.json();
-        if (result.results && result.results.length > 0) {
-          const extracted = result.results[0];
-          const filename = customUrl.filename + ".md";
-
-          files[filename] = {
-            content: extracted.full_content || "",
-            title: customUrl.title,
-            description: customUrl.description,
-            extracted: true,
-            publishedDate: extracted.published_date || "",
-            status: 200,
-            tokens: Math.round((extracted.full_content || "").length / 5),
-            originalUrl: customUrl.url,
-          };
-        }
-      } else {
-        throw new Error(`${response.status} - ${response.statusText}`);
-      }
-    } catch (error) {
-      console.error(
-        `❌ Error processing custom URL ${customUrl.url}:`,
-        error.message
-      );
-    }
-  }
-
-  return files;
-}
-
-/**
- * Get path prefix for links in llms.txt
- * @param {string} topLevelOutDir - Top-level output directory
- * @param {string} sourceOutDir - Source-specific output directory
- * @returns {string} Path prefix for links
- */
-function getPathPrefix(topLevelOutDir, sourceOutDir) {
-  const resolvedTopLevel = path.resolve(topLevelOutDir);
-  const resolvedSource = path.resolve(sourceOutDir);
-
-  if (resolvedSource === resolvedTopLevel) {
-    return "";
-  }
-
-  const relativePath = path.relative(resolvedTopLevel, resolvedSource);
-  return relativePath || "";
-}
-
-/**
- * Generate combined llms.txt from all sources
- * @param {string} title - Top-level title
- * @param {string} description - Top-level description
- * @param {string} [details] - Optional top-level details
- * @param {Array<{title: string, files: Record<string, any>, keepOriginalUrls?: boolean, pathPrefix: string}>} allSources - All processed sources
- * @returns {string} Combined llms.txt content
- */
-function generateCombinedLlmsTxt(title, description, details, allSources) {
-  let combinedTxt = `# ${title}\n\n> ${description}\n\n`;
-
-  if (details) {
-    combinedTxt += `${details}\n\n`;
-  }
-
-  for (const source of allSources) {
-    combinedTxt += `## ${source.title}\n\n`;
-
-    // Sort files by path for consistent ordering
-    const sortedFiles = Object.entries(source.files).sort(([a], [b]) =>
-      a.localeCompare(b)
-    );
-
-    for (const [path, file] of sortedFiles) {
-      if (file.content || file.title) {
-        const title = file.title || path.replace(".md", "");
-        const description = file.description
-          ? `: ${file.description.replaceAll("\n", " ")}`
-          : "";
-
-        // Generate link based on keepOriginalUrls and pathPrefix
-        let link;
-        if (source.keepOriginalUrls) {
-          link = file.originalUrl;
-        } else {
-          link = source.pathPrefix + (path.startsWith("/") ? path : "/" + path);
-        }
-
-        combinedTxt += `- [${title}](${link})${description}\n`;
-      }
-    }
-
-    combinedTxt += "\n";
-  }
-
-  return combinedTxt;
-}
-
 /**
  * Clear stored API key credentials
  */

@@ -557,6 +456,33 @@ async function clearCredentials() {
   }
 }

+/**
+ * Write file hierarchy to disk
+ * @param {Record<string, {content?: string, error?: string}>} fileHierarchy - File hierarchy to write
+ */
+function writeFileHierarchy(fileHierarchy) {
+  for (const [filePath, item] of Object.entries(fileHierarchy)) {
+    try {
+      const resolvedPath = path.resolve(filePath);
+      const fileDir = path.dirname(resolvedPath);
+
+      // Create directory if it doesn't exist
+      fs.mkdirSync(fileDir, { recursive: true });
+
+      if (item.content) {
+        fs.writeFileSync(resolvedPath, item.content);
+        console.log(`📝 Wrote: ${filePath}`);
+      } else if (item.error) {
+        console.error(`❌ Error for ${filePath}: ${item.error}`);
+      }
+    } catch (error) {
+      console.error(
+        `❌ Failed to write ${filePath}: ${error.message || "Unknown error"}`
+      );
+    }
+  }
+}
+
 /**
  * Main function
  */

@@ -574,131 +500,22 @@ async function main() {
   const config = await loadConfig();
   const apiKey = await getApiKey();

-
-  fs.mkdirSync(config.outDir, { recursive: true });
-
-  const allSources = [];
-  let totalTokens = 0;
-  let totalPages = 0;
-  let totalErrors = 0;
-
-  // Process each source
-  for (const [sourceIndex, sourceConfig] of config.sources.entries()) {
-    const sourceName = `${sourceConfig.title} (source ${sourceIndex + 1})`;
-
-    console.log(
-      `\n🌐 Processing ${sourceName} (forceExtract: ${sourceConfig.forceExtract}, keepOriginalUrls: ${sourceConfig.keepOriginalUrls})`
-    );
-
-    // Ensure source output directory exists (if not keeping original URLs)
-    if (!sourceConfig.keepOriginalUrls) {
-      fs.mkdirSync(sourceConfig.outDir, { recursive: true });
-    }
-
-    let sourceFiles = {};
-
-    try {
-      // Process origin if provided
-      if (sourceConfig.origin) {
-        const result = await extractFromSitemap(
-          sourceConfig.origin,
-          sourceConfig.forceExtract,
-          apiKey,
-          sourceConfig.titleRemovePattern
-        );
-
-        console.log(
-          `✅ Extracted ${result.totalPages} pages with ${result.totalTokens} tokens`
-        );
-        if (result.errors > 0) {
-          console.log(`⚠️ ${result.errors} errors occurred`);
-        }
-
-        sourceFiles = result.files;
-        totalTokens += result.totalTokens;
-        totalPages += result.totalPages;
-        totalErrors += result.errors;
-      }
-
-      // Process custom URLs for this source
-      if (sourceConfig.customUrls && sourceConfig.customUrls.length > 0) {
-        console.log(
-          `📋 Processing ${sourceConfig.customUrls.length} custom URLs for this source...`
-        );
-        const customFiles = await processCustomUrls(
-          sourceConfig.customUrls,
-          apiKey
-        );
-
-        // Merge custom files with sitemap files
-        sourceFiles = { ...sourceFiles, ...customFiles };
-
-        for (const file of Object.values(customFiles)) {
-          totalTokens += file.tokens;
-          totalPages++;
-        }
-      }
+  console.log("\n🔄 Processing LLMText configuration...");

-
-
-      for (const [filePath, file] of Object.entries(sourceFiles)) {
-        let filename = filePath.startsWith("/")
-          ? filePath.slice(1)
-          : filePath;
+  // Process the entire config using the new function
+  const result = await processLLMTextConfig(config, apiKey);

-
-
-
-        fs.mkdirSync(fileDir, { recursive: true });
-        fs.writeFileSync(fullFilePath, file.content);
-
-        console.log(
-          `📝 Wrote: ${path.join(sourceConfig.outDir, filename)} (${
-            file.tokens
-          } tokens)`
-        );
-      }
-    } else {
-      console.log(
-        `📋 Keeping original URLs - not saving files locally for ${sourceName}`
-      );
-    }
-
-    // Calculate path prefix for this source
-    const pathPrefix = sourceConfig.keepOriginalUrls
-      ? ""
-      : getPathPrefix(config.outDir, sourceConfig.outDir);
-
-    // Add to all sources for combined llms.txt
-    allSources.push({
-      title: sourceConfig.title,
-      files: sourceFiles,
-      keepOriginalUrls: sourceConfig.keepOriginalUrls,
-      pathPrefix: pathPrefix,
-    });
-    } catch (error) {
-      console.error(`❌ Error processing ${sourceName}:`, error.message);
-      totalErrors++;
-    }
-  }
-
-  // Generate and write combined llms.txt to top-level outDir
-  if (allSources.length > 0) {
-    const combinedLlmsTxt = generateCombinedLlmsTxt(
-      config.title,
-      config.description,
-      config.details,
-      allSources
-    );
-    const combinedLlmsTxtPath = path.join(config.outDir, "llms.txt");
-    fs.writeFileSync(combinedLlmsTxtPath, combinedLlmsTxt);
-    console.log(`\n📋 Generated combined llms.txt: ${combinedLlmsTxtPath}`);
-  }
+  // Write all files to disk
+  console.log("\n📁 Writing files to disk...");
+  writeFileHierarchy(result.files);

+  // Print summary
   console.log("\n✨ Extraction completed!");
-  console.log(
-
-
+  console.log(
+    `📊 Total: ${result.stats.totalPages} pages, ${result.stats.totalTokens} tokens`
+  );
+  if (result.stats.totalErrors > 0) {
+    console.log(`⚠️ Errors: ${result.stats.totalErrors}`);
   }
   console.log(
     `📁 Top-level output directory: ${path.resolve(config.outDir)}`
package/mod.js
CHANGED
@@ -22,6 +22,40 @@
  * @property {number} fetchCount - Number of fetch operations performed
  */

+/**
+ * @typedef {Object} SourceConfig
+ * @property {string} title - The title for this source
+ * @property {string} [origin] - The origin URL to process (optional)
+ * @property {string} [outDir] - Output directory for this source's extracted files
+ * @property {boolean} [forceExtract] - Whether to force extraction for this source
+ * @property {boolean} [keepOriginalUrls] - Whether to keep original URL structure and not save files locally
+ * @property {Array<{title: string, description: string, filename: string, url: string}>} [customUrls] - Custom URLs to extract for this source
+ * @property {string} [titleRemovePattern] - Regex pattern to remove from titles (case-insensitive)
+ */
+
+/**
+ * @typedef {Object} LLMTextConfig
+ * @property {string} title - Title of your document
+ * @property {string} description - Description of the documentation collection
+ * @property {string} [details] - Optional additional details about the collection
+ * @property {string} outDir - Top-level output directory for combined llms.txt
+ * @property {SourceConfig[]} sources - Array of source configurations
+ */
+
+/**
+ * @typedef {Object} FileHierarchyItem
+ * @property {string} [content] - File content if successful
+ * @property {string} [error] - Error message if failed
+ */
+
+/**
+ * @typedef {Object} ProcessedSource
+ * @property {string} title - Source title
+ * @property {Record<string, FileResult>} files - Extracted files
+ * @property {boolean} keepOriginalUrls - Whether to keep original URLs
+ * @property {string} pathPrefix - Path prefix for links
+ */
+
 /**
  * Extract content from sitemap URLs with markdown variant detection
  * @param {string} origin - The origin URL to extract from

@@ -176,6 +210,279 @@ export async function extractFromSitemap(
   };
 }

+/**
+ * Process custom URLs through extraction API
+ * @param {Array<{title: string, description: string, filename: string, url: string}>} customUrls - Custom URLs to process
+ * @param {string} apiKey - API key for authentication
+ * @returns {Promise<Record<string, FileResult>>} Extracted files
+ */
+export async function processCustomUrls(customUrls, apiKey) {
+  const files = {};
+
+  for (const customUrl of customUrls) {
+    try {
+      const response = await fetch("https://api.parallel.ai/v1beta/extract", {
+        method: "POST",
+        headers: {
+          "Content-Type": "application/json",
+          "parallel-beta": "search-extract-2025-10-10",
+          "x-api-key": apiKey,
+        },
+        body: JSON.stringify({
+          urls: [customUrl.url],
+          full_content: true,
+        }),
+      });
+
+      if (response.ok) {
+        const result = await response.json();
+        if (result.results && result.results.length > 0) {
+          const extracted = result.results[0];
+          const filename = customUrl.filename + ".md";
+
+          files[filename] = {
+            content: extracted.full_content || "",
+            title: customUrl.title,
+            description: customUrl.description,
+            extracted: true,
+            publishedDate: extracted.published_date || "",
+            status: 200,
+            tokens: Math.round((extracted.full_content || "").length / 5),
+            originalUrl: customUrl.url,
+          };
+        }
+      } else {
+        throw new Error(`${response.status} - ${response.statusText}`);
+      }
+    } catch (error) {
+      const filename = customUrl.filename + ".md";
+      files[filename] = {
+        error: error instanceof Error ? error.message : "Unknown error",
+        content: "",
+        title: customUrl.title,
+        description: customUrl.description,
+        extracted: false,
+        status: 0,
+        tokens: 0,
+        publishedDate: "",
+        originalUrl: customUrl.url,
+      };
+    }
+  }
+
+  return files;
+}
+
+/**
+ * Process LLMText config and generate file hierarchy
+ * @param {LLMTextConfig} config - The LLMText configuration
+ * @param {string} apiKey - Parallel API key
+ * @returns {Promise<{files: Record<string, FileHierarchyItem>, sources: ProcessedSource[], stats: {totalTokens: number, totalPages: number, totalErrors: number}}>}
+ */
+export async function processLLMTextConfig(config, apiKey) {
+  const allSources = [];
+  let totalTokens = 0;
+  let totalPages = 0;
+  let totalErrors = 0;
+
+  // Process each source
+  for (const sourceConfig of config.sources) {
+    let sourceFiles = {};
+
+    try {
+      // Process origin if provided
+      if (sourceConfig.origin) {
+        const result = await extractFromSitemap(
+          sourceConfig.origin,
+          sourceConfig.forceExtract || false,
+          apiKey,
+          sourceConfig.titleRemovePattern
+        );
+
+        sourceFiles = result.files;
+        totalTokens += result.totalTokens;
+        totalPages += result.totalPages;
+        totalErrors += result.errors;
+      }
+
+      // Process custom URLs for this source
+      if (sourceConfig.customUrls && sourceConfig.customUrls.length > 0) {
+        const customFiles = await processCustomUrls(
+          sourceConfig.customUrls,
+          apiKey
+        );
+
+        // Merge custom files with sitemap files
+        sourceFiles = { ...sourceFiles, ...customFiles };
+
+        for (const file of Object.values(customFiles)) {
+          totalTokens += file.tokens;
+          totalPages++;
+          if (file.error) totalErrors++;
+        }
+      }
+
+      // Calculate path prefix for this source
+      const pathPrefix = sourceConfig.keepOriginalUrls
+        ? ""
+        : getPathPrefix(config.outDir, sourceConfig.outDir || config.outDir);
+
+      // Add to all sources
+      allSources.push({
+        title: sourceConfig.title,
+        files: sourceFiles,
+        keepOriginalUrls: sourceConfig.keepOriginalUrls || false,
+        pathPrefix: pathPrefix,
+        outDir: sourceConfig.outDir || config.outDir,
+      });
+    } catch (error) {
+      totalErrors++;
+      // Add empty source with error
+      allSources.push({
+        title: sourceConfig.title,
+        files: {
+          error: {
+            error: error instanceof Error ? error.message : "Unknown error",
+            content: "",
+            title: "",
+            description: "",
+            extracted: false,
+            status: 0,
+            tokens: 0,
+            publishedDate: "",
+            originalUrl: "",
+          },
+        },
+        keepOriginalUrls: sourceConfig.keepOriginalUrls || false,
+        pathPrefix: "",
+        outDir: sourceConfig.outDir || config.outDir,
+      });
+    }
+  }
+
+  // Generate file hierarchy
+  const fileHierarchy = {};
+
+  // Add source files
+  for (const source of allSources) {
+    if (!source.keepOriginalUrls) {
+      for (const [filePath, file] of Object.entries(source.files)) {
+        let filename = filePath.startsWith("/") ? filePath.slice(1) : filePath;
+        const fullPath = `${source.outDir}/${filename}`;
+
+        fileHierarchy[fullPath] = file.error
+          ? { error: file.error }
+          : { content: file.content };
+      }
+    }
+  }
+
+  // Generate combined llms.txt
+  const combinedLlmsTxt = generateCombinedLlmsTxt(
+    config.title,
+    config.description,
+    config.details,
+    allSources
+  );
+
+  fileHierarchy[`${config.outDir}/llms.txt`] = {
+    content: combinedLlmsTxt,
+  };
+
+  return {
+    files: fileHierarchy,
+    sources: allSources,
+    stats: {
+      totalTokens,
+      totalPages,
+      totalErrors,
+    },
+  };
+}
+
+/**
+ * Generate combined llms.txt from all sources
+ * @param {string} title - Top-level title
+ * @param {string} description - Top-level description
+ * @param {string} [details] - Optional top-level details
+ * @param {ProcessedSource[]} allSources - All processed sources
+ * @returns {string} Combined llms.txt content
+ */
+function generateCombinedLlmsTxt(title, description, details, allSources) {
+  let combinedTxt = `# ${title}\n\n> ${description}\n\n`;
+
+  if (details) {
+    combinedTxt += `${details}\n\n`;
+  }
+
+  for (const source of allSources) {
+    combinedTxt += `## ${source.title}\n\n`;
+
+    // Sort files by path for consistent ordering
+    const sortedFiles = Object.entries(source.files).sort(([a], [b]) =>
+      a.localeCompare(b)
+    );
+
+    for (const [path, file] of sortedFiles) {
+      if (file.content || file.title) {
+        const title = file.title || path.replace(".md", "");
+        const description = file.description
+          ? `: ${file.description.replaceAll("\n", " ")}`
+          : "";
+
+        // Generate link based on keepOriginalUrls and pathPrefix
+        let link;
+        if (source.keepOriginalUrls) {
+          link = file.originalUrl;
+        } else {
+          link = source.pathPrefix + (path.startsWith("/") ? path : "/" + path);
+        }
+
+        combinedTxt += `- [${title}](${link})${description}\n`;
+      }
+    }
+
+    combinedTxt += "\n";
+  }
+
+  return combinedTxt;
+}
+
+/**
+ * Get path prefix for links in llms.txt
+ * @param {string} topLevelOutDir - Top-level output directory
+ * @param {string} sourceOutDir - Source-specific output directory
+ * @returns {string} Path prefix for links
+ */
+function getPathPrefix(topLevelOutDir, sourceOutDir) {
+  // Normalize paths for comparison
+  const normalizeSlashes = (p) => p.replace(/\\/g, "/");
+  const normalizedTop = normalizeSlashes(topLevelOutDir);
+  const normalizedSource = normalizeSlashes(sourceOutDir);
+
+  if (normalizedSource === normalizedTop) {
+    return "";
+  }
+
+  // Calculate relative path
+  const topParts = normalizedTop.split("/").filter(Boolean);
+  const sourceParts = normalizedSource.split("/").filter(Boolean);
+
+  // Find common prefix
+  let commonLength = 0;
+  while (
+    commonLength < topParts.length &&
+    commonLength < sourceParts.length &&
+    topParts[commonLength] === sourceParts[commonLength]
+  ) {
+    commonLength++;
+  }
+
+  // Build relative path
+  const relativeParts = sourceParts.slice(commonLength);
+  return relativeParts.length > 0 ? relativeParts.join("/") : "";
+}
+
 /**
  * Clean title by removing custom pattern if provided
  * @param {string} title - Original title
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "extract-from-sitemap",
   "bin": "cli.js",
-  "version": "0.0.17",
+  "version": "0.0.19",
   "main": "mod.js",
   "description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
   "files": [