extract-from-sitemap 0.0.16 → 0.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +102 -2
- package/cli.js +52 -12
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -4,15 +4,115 @@ This repo allows you to create a static markdown bundle based on one or multiple
|
|
|
4
4
|
|
|
5
5
|
1. Create a `llmtext.json` file in the root of your project. This is where you define your sources to be extracted from. For an example combining multiple sources, see [this example](https://github.com/janwilmake/parallel-llmtext/blob/main/llmtext.json).
|
|
6
6
|
2. Run `npx extract-from-sitemap` (or add it to your `package.json` scripts, [like this](https://github.com/janwilmake/parallel-llmtext/blob/main/package.json))
|
|
7
|
-
3. Set up CI/CD in your repo to automatically update your extracted static files as often as needed.
|
|
7
|
+
3. Set up CI/CD in your repo to automatically update your extracted static files as often as needed. See [CI/CD Setup](#cicd-setup) below.
|
|
8
8
|
4. Use an agent-rewriter such as [next-agent-rewriter](../next-agent-rewriter) to rewrite agent requests to the appropriate static Markdown files. In addition, it's best practice to add a link in your HTML to indicate that a Markdown variant is available, like this: `<link rel="alternate" type="text/markdown" href="{path}.md" title="Docs" />`
|
|
9
9
|
|
|
10
|
+
## CI/CD Setup
|
|
11
|
+
|
|
12
|
+
### GitHub Actions
|
|
13
|
+
|
|
14
|
+
1. Get your Parallel API key from [platform.parallel.ai](https://platform.parallel.ai)
|
|
15
|
+
|
|
16
|
+
2. Add it as a repository secret:
|
|
17
|
+
|
|
18
|
+
- Go to your repository → Settings → Secrets and variables → Actions
|
|
19
|
+
- Click "New repository secret"
|
|
20
|
+
- Name: `PARALLEL_API_KEY`
|
|
21
|
+
- Value: Your API key from step 1
|
|
22
|
+
|
|
23
|
+
3. Create `.github/workflows/extract-docs.yml`:
|
|
24
|
+
|
|
25
|
+
```yaml
|
|
26
|
+
name: Extract Documentation
|
|
27
|
+
|
|
28
|
+
on:
|
|
29
|
+
schedule:
|
|
30
|
+
- cron: "0 0 * * *" # Daily at midnight UTC
|
|
31
|
+
workflow_dispatch: # Allow manual trigger
|
|
32
|
+
|
|
33
|
+
jobs:
|
|
34
|
+
extract:
|
|
35
|
+
runs-on: ubuntu-latest
|
|
36
|
+
steps:
|
|
37
|
+
- uses: actions/checkout@v4
|
|
38
|
+
- uses: actions/setup-node@v4
|
|
39
|
+
with:
|
|
40
|
+
node-version: "20"
|
|
41
|
+
|
|
42
|
+
- name: Extract documentation
|
|
43
|
+
env:
|
|
44
|
+
PARALLEL_API_KEY: ${{ secrets.PARALLEL_API_KEY }}
|
|
45
|
+
run: |
|
|
46
|
+
npm install -g extract-from-sitemap
|
|
47
|
+
npx extract-from-sitemap
|
|
48
|
+
|
|
49
|
+
- name: Commit changes
|
|
50
|
+
run: |
|
|
51
|
+
git config user.email "github-actions[bot]@users.noreply.github.com"
|
|
52
|
+
git config user.name "github-actions[bot]"
|
|
53
|
+
git add .
|
|
54
|
+
git diff --quiet && git diff --staged --quiet || \
|
|
55
|
+
(git commit -m "Update docs [skip ci]" && git push)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### GitLab CI
|
|
59
|
+
|
|
60
|
+
1. Add `PARALLEL_API_KEY` as a CI/CD variable:
|
|
61
|
+
|
|
62
|
+
- Go to Settings → CI/CD → Variables
|
|
63
|
+
- Add variable with your API key
|
|
64
|
+
- Make sure "Protect variable" and "Mask variable" are checked
|
|
65
|
+
|
|
66
|
+
2. Create `.gitlab-ci.yml`:
|
|
67
|
+
|
|
68
|
+
```yaml
|
|
69
|
+
extract-docs:
|
|
70
|
+
image: node:20
|
|
71
|
+
script:
|
|
72
|
+
- npm install -g extract-from-sitemap
|
|
73
|
+
- npx extract-from-sitemap
|
|
74
|
+
- |
|
|
75
|
+
git config user.email "gitlab-ci@gitlab.com"
|
|
76
|
+
git config user.name "GitLab CI"
|
|
77
|
+
git add docs/
|
|
78
|
+
git diff --quiet && git diff --staged --quiet || \
|
|
79
|
+
(git commit -m "Update docs [skip ci]" && git push https://oauth2:${CI_JOB_TOKEN}@${CI_SERVER_HOST}/${CI_PROJECT_PATH}.git HEAD:${CI_COMMIT_REF_NAME})
|
|
80
|
+
only:
|
|
81
|
+
- schedules
|
|
82
|
+
- web
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Other CI Systems
|
|
86
|
+
|
|
87
|
+
The CLI automatically detects CI environments and requires the `PARALLEL_API_KEY` environment variable to be set there. It will not attempt the OAuth flow in a CI environment.
|
|
88
|
+
|
|
89
|
+
Supported CI detection:
|
|
90
|
+
|
|
91
|
+
- GitHub Actions
|
|
92
|
+
- GitLab CI
|
|
93
|
+
- CircleCI
|
|
94
|
+
- Travis CI
|
|
95
|
+
- Jenkins
|
|
96
|
+
- Buildkite
|
|
97
|
+
- Drone
|
|
98
|
+
- Semaphore
|
|
99
|
+
- Any system with `CI=true` or `CONTINUOUS_INTEGRATION=true`
|
|
100
|
+
|
|
10
101
|
## Known limitations
|
|
11
102
|
|
|
12
103
|
This library is in active development. Known limitations:
|
|
13
104
|
|
|
14
105
|
- Does not work for nested sitemaps
|
|
15
106
|
- Does not work on sitemaps that are too large
|
|
16
|
-
-
|
|
107
|
+
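- Some CI systems may require additional git configuration (for example, the GitHub Actions workflow above can only push its commit if the job's `GITHUB_TOKEN` has `contents: write` permission)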
|
|
17
108
|
|
|
18
109
|
I am working on addressing these issues.
|
|
110
|
+
|
|
111
|
+
## TODO
|
|
112
|
+
|
|
113
|
+
- ✅ find auto-run and re-deploy github ci/cd rule
|
|
114
|
+
- ✅ ensure `extract-from-sitemap` requires environment variable from github ci (maybe need to run with '--ci' flag or detect somehow)
|
|
115
|
+
- set up `parallel-llmtext` to rerun every 5 minutes as a test; once that works, change the schedule to every 12 hours
|
|
116
|
+
- also set up auto-deploy workflow to occur AFTER this workflow!
|
|
117
|
+
- put files in `public`
|
|
118
|
+
- add a README to `parallel-llmtext` explaining that it is meant as a template and that any deployment method can be used, although Cloudflare is preferred
|
package/cli.js
CHANGED
|
@@ -31,6 +31,25 @@ const { extractFromSitemap } = require("./mod.js");
|
|
|
31
31
|
const CREDENTIALS_DIR = path.join(os.homedir(), ".llmtext");
|
|
32
32
|
const API_KEY_FILE = path.join(CREDENTIALS_DIR, "api-key");
|
|
33
33
|
|
|
34
|
+
/**
|
|
35
|
+
* Detect if running in a CI environment by checking a fixed list of CI environment variables; any non-empty value counts as set (environment values are strings, so even "false" is truthy).
|
|
36
|
+
* @returns {boolean} true when at least one of the checked variables is set to a non-empty value, false otherwise
|
|
37
|
+
*/
|
|
38
|
+
function isCI() {
|
|
39
|
+
return !!( // coerce the first truthy value (or the last falsy one) to a strict boolean
|
|
40
|
+
process.env.CI || // Generic CI flag
|
|
41
|
+
process.env.CONTINUOUS_INTEGRATION || // Alternative generic CI flag
|
|
42
|
+
process.env.GITHUB_ACTIONS || // GitHub Actions
|
|
43
|
+
process.env.GITLAB_CI || // GitLab CI
|
|
44
|
+
process.env.CIRCLECI || // CircleCI
|
|
45
|
+
process.env.TRAVIS || // Travis CI
|
|
46
|
+
process.env.JENKINS_URL || // Jenkins
|
|
47
|
+
process.env.BUILDKITE || // Buildkite
|
|
48
|
+
process.env.DRONE || // Drone
|
|
49
|
+
process.env.SEMAPHORE // Semaphore
|
|
50
|
+
);
|
|
51
|
+
}
|
|
52
|
+
|
|
34
53
|
/**
|
|
35
54
|
* OAuth handler for Parallel.ai API key authentication
|
|
36
55
|
*/
|
|
@@ -385,13 +404,13 @@ function loadStoredApiKey() {
|
|
|
385
404
|
* @returns {Promise<string>} The API key
|
|
386
405
|
*/
|
|
387
406
|
async function getApiKey() {
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
if (
|
|
391
|
-
|
|
407
|
+
const inCI = isCI();
|
|
408
|
+
|
|
409
|
+
if (inCI) {
|
|
410
|
+
console.log("🔍 CI environment detected");
|
|
392
411
|
}
|
|
393
412
|
|
|
394
|
-
// Check environment variables
|
|
413
|
+
// Check environment variables first (most important for CI)
|
|
395
414
|
let apiKey = process.env.PARALLEL_API_KEY;
|
|
396
415
|
|
|
397
416
|
if (!apiKey && fs.existsSync(".env")) {
|
|
@@ -405,11 +424,33 @@ async function getApiKey() {
|
|
|
405
424
|
|
|
406
425
|
if (apiKey) {
|
|
407
426
|
console.log("🔑 Using API key from environment");
|
|
408
|
-
|
|
427
|
+
if (!inCI) {
|
|
428
|
+
storeApiKey(apiKey);
|
|
429
|
+
}
|
|
409
430
|
return apiKey;
|
|
410
431
|
}
|
|
411
432
|
|
|
412
|
-
//
|
|
433
|
+
// In CI environments, we cannot do OAuth - require the env var
|
|
434
|
+
if (inCI) {
|
|
435
|
+
console.error("\n❌ No API key found in CI environment!");
|
|
436
|
+
console.error("\nPlease set the PARALLEL_API_KEY environment variable:");
|
|
437
|
+
console.error(" - For GitHub Actions: Add it as a repository secret");
|
|
438
|
+
console.error(" - For GitLab CI: Add it as a CI/CD variable");
|
|
439
|
+
console.error(
|
|
440
|
+
" - For other CI systems: Add it as an environment variable"
|
|
441
|
+
);
|
|
442
|
+
console.error("\nYou can get your API key from:");
|
|
443
|
+
console.error(" https://platform.parallel.ai");
|
|
444
|
+
process.exit(1);
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
// Check stored API key (only in non-CI environments)
|
|
448
|
+
const storedKey = loadStoredApiKey();
|
|
449
|
+
if (storedKey) {
|
|
450
|
+
return storedKey;
|
|
451
|
+
}
|
|
452
|
+
|
|
453
|
+
// No API key found, start OAuth flow (only in interactive environments)
|
|
413
454
|
console.log("🔑 No API key found. Starting OAuth flow...");
|
|
414
455
|
const oauth = new OAuth();
|
|
415
456
|
const newApiKey = await oauth.getApiKey();
|
|
@@ -519,7 +560,9 @@ function generateCombinedLlmsTxt(title, description, details, allSources) {
|
|
|
519
560
|
for (const [path, file] of sortedFiles) {
|
|
520
561
|
if (file.content || file.title) {
|
|
521
562
|
const title = file.title || path.replace(".md", "");
|
|
522
|
-
const description = file.description
|
|
563
|
+
const description = file.description
|
|
564
|
+
? `: ${file.description.replaceAll("\n", " ")}`
|
|
565
|
+
: "";
|
|
523
566
|
|
|
524
567
|
// Generate link based on keepOriginalUrls and pathPrefix
|
|
525
568
|
let link;
|
|
@@ -529,10 +572,7 @@ function generateCombinedLlmsTxt(title, description, details, allSources) {
|
|
|
529
572
|
link = source.pathPrefix + (path.startsWith("/") ? path : "/" + path);
|
|
530
573
|
}
|
|
531
574
|
|
|
532
|
-
combinedTxt += `- [${title}](${link})
|
|
533
|
-
"\n",
|
|
534
|
-
" "
|
|
535
|
-
)}\n`;
|
|
575
|
+
combinedTxt += `- [${title}](${link})${description}\n`;
|
|
536
576
|
}
|
|
537
577
|
}
|
|
538
578
|
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "extract-from-sitemap",
|
|
3
3
|
"bin": "cli.js",
|
|
4
|
-
"version": "0.0.16",
|
|
4
|
+
"version": "0.0.18",
|
|
5
5
|
"main": "mod.js",
|
|
6
6
|
"description": "A module and CLI that allows extracting all pages from a sitemap into markdown and a llms.txt, using Parallel.ai APIs.",
|
|
7
7
|
"files": [
|