liferay-docs-scraper 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- liferay_docs_scraper-0.1.0/.gitignore +7 -0
- liferay_docs_scraper-0.1.0/PKG-INFO +128 -0
- liferay_docs_scraper-0.1.0/README.md +120 -0
- liferay_docs_scraper-0.1.0/docs/adr/0001-crawl4ai-based-corpus-pipeline.md +312 -0
- liferay_docs_scraper-0.1.0/pyproject.toml +20 -0
- liferay_docs_scraper-0.1.0/reports/low_value_candidates.md +369 -0
- liferay_docs_scraper-0.1.0/skills/liferay-expert/SKILL.md +85 -0
- liferay_docs_scraper-0.1.0/src/liferay_docs_scraper/__init__.py +0 -0
- liferay_docs_scraper-0.1.0/src/liferay_docs_scraper/check_regressions.py +105 -0
- liferay_docs_scraper-0.1.0/src/liferay_docs_scraper/classify_pages.py +41 -0
- liferay_docs_scraper-0.1.0/src/liferay_docs_scraper/filter_urls.py +151 -0
- liferay_docs_scraper-0.1.0/src/liferay_docs_scraper/pipeline.py +451 -0
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: liferay-docs-scraper
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Scrape learn.liferay.com/w/dxp into a local Markdown corpus (raw/{capability}/*.md) for the liferay-expert Claude Code skill.
|
|
5
|
+
Requires-Python: <3.14,>=3.10
|
|
6
|
+
Requires-Dist: crawl4ai>=0.9.0
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
|
|
9
|
+
# liferay-docs-scraper
|
|
10
|
+
|
|
11
|
+
Scrapes `learn.liferay.com/w/dxp/*` into a local, clean Markdown corpus
|
|
12
|
+
(`raw/{capability}/*.md`) and ships a Claude Code skill (`liferay-expert`)
|
|
13
|
+
that answers Liferay DXP questions by searching and citing that corpus.
|
|
14
|
+
|
|
15
|
+
**This repo does not ship Liferay's documentation.** It ships the code that
|
|
16
|
+
scrapes it, and a skill that reads whatever you scrape locally. Each user
|
|
17
|
+
builds and refreshes their own copy directly from learn.liferay.com.
|
|
18
|
+
|
|
19
|
+
## Quickstart
|
|
20
|
+
|
|
21
|
+
The recommended order for a first-time setup: scrape, then install the
|
|
22
|
+
skill, then ask questions.
|
|
23
|
+
|
|
24
|
+
**1. Build the corpus (one-time, ~30-40 min):**
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
uvx --from crawl4ai crawl4ai-setup # one-time, installs Playwright browsers
|
|
28
|
+
uvx --python 3.13 --from "git+https://github.com/mordonez/liferay-docs-scraper" liferay-docs-scraper
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Run this from anywhere -- it does not write into your current directory;
|
|
32
|
+
see "Reference: the scraper in detail" below for exactly where it goes.
|
|
33
|
+
|
|
34
|
+
**2. Install the skill into whatever project you're working in:**
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
npx skills add mordonez/liferay-docs-scraper --skill liferay-expert -a claude-code
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
You'll see:
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
◇ Installed 1 skill ───────────────────╮
|
|
44
|
+
│ │
|
|
45
|
+
│ ✓ liferay-expert (copied) │
|
|
46
|
+
│ → ./.claude/skills/liferay-expert │
|
|
47
|
+
│ │
|
|
48
|
+
├───────────────────────────────────────╯
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
**3. Ask Claude Code a Liferay question**, e.g. "how do I configure a
|
|
52
|
+
synonym set in Liferay search?" The skill finds the corpus, greps the
|
|
53
|
+
`search` capability, reads `search-administration-and-tuning-synonym-sets.md`,
|
|
54
|
+
and answers grounded in that page -- citing
|
|
55
|
+
`https://learn.liferay.com/w/dxp/search/search-administration-and-tuning/synonym-sets`
|
|
56
|
+
as the source.
|
|
57
|
+
|
|
58
|
+
The corpus is shared across every project where you install the skill (see
|
|
59
|
+
the "Default location" table below), so step 1 is only ever needed once per
|
|
60
|
+
machine -- rerun it later just to refresh, not per-project.
|
|
61
|
+
|
|
62
|
+
**If you install the skill without doing step 1 first** (or its corpus goes
|
|
63
|
+
stale), it notices and tells you what to run rather than guessing or
|
|
64
|
+
answering ungrounded -- it never launches the ~30-40 min scrape on its own
|
|
65
|
+
mid-conversation. See "Step 1/2" in `skills/liferay-expert/SKILL.md` for
|
|
66
|
+
that check.
|
|
67
|
+
|
|
68
|
+
## Reference: the scraper in detail
|
|
69
|
+
|
|
70
|
+
Requires Python 3.10-3.13 (crawl4ai's Playwright dependency doesn't yet
|
|
71
|
+
support 3.14) and [uv](https://docs.astral.sh/uv/).
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
# One-time: installs the Playwright/Chromium browser crawl4ai drives
|
|
75
|
+
uvx --from crawl4ai crawl4ai-setup
|
|
76
|
+
|
|
77
|
+
# From anywhere -- the corpus does NOT go in your current directory.
|
|
78
|
+
# Not on PyPI yet, so install straight from GitHub:
|
|
79
|
+
uvx --python 3.13 --from "git+https://github.com/mordonez/liferay-docs-scraper" liferay-docs-scraper
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
This takes roughly 30-40 minutes (BFS deep crawl of ~1900 pages across 14
|
|
83
|
+
capabilities) and writes to one shared, per-user location (so it's the same
|
|
84
|
+
corpus no matter which project you're in when the skill looks for it):
|
|
85
|
+
|
|
86
|
+
| OS | Default location |
|
|
87
|
+
|---|---|
|
|
88
|
+
| macOS | `~/Library/Application Support/liferay-docs/` |
|
|
89
|
+
| Linux | `~/.local/share/liferay-docs/` (or `$XDG_DATA_HOME/liferay-docs`) |
|
|
90
|
+
| Windows | `%LOCALAPPDATA%\liferay-docs\` |
|
|
91
|
+
|
|
92
|
+
Set `LIFERAY_DOCS_DIR` to override (e.g. to keep a project-local copy instead).
|
|
93
|
+
|
|
94
|
+
Inside that directory:
|
|
95
|
+
|
|
96
|
+
- `raw/{capability}/*.md` — the corpus, one file per page
|
|
97
|
+
- `raw/_navigation/{capability}/*.md` — pure TOC pages, kept but deprioritized
|
|
98
|
+
- `raw/_removed/{capability}/*.md` — pages confirmed gone from the live site
|
|
99
|
+
- `reports/filtered/` — URL manifests, self-hosted prune log, run summary
|
|
100
|
+
|
|
101
|
+
Re-run it anytime (weekly recommended) to refresh: it starts from zero every
|
|
102
|
+
time, so it naturally picks up new pages, updates changed ones, and
|
|
103
|
+
quarantines (never deletes) removed ones. If that directory is (or becomes)
|
|
104
|
+
a git repo -- worth doing once, purely as a local diffing tool, nothing needs
|
|
105
|
+
pushing anywhere -- it also runs `check-regressions` automatically afterward
|
|
106
|
+
and flags any file that shrank by more than half or grew more than 3x versus
|
|
107
|
+
the last commit (signals of a broken fetch); see
|
|
108
|
+
`docs/adr/0001-crawl4ai-based-corpus-pipeline.md` for why that check exists.
|
|
109
|
+
|
|
110
|
+
## Reference: the skill in detail
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
npx skills add mordonez/liferay-docs-scraper --skill liferay-expert
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Or just copy `skills/liferay-expert/SKILL.md` into `.claude/skills/liferay-expert/`
|
|
117
|
+
in any project. Claude Code picks it up automatically; the skill itself
|
|
118
|
+
resolves `$LIFERAY_DOCS_DIR` (or the OS default above) to find the corpus,
|
|
119
|
+
so it works the same regardless of which project you installed it into.
|
|
120
|
+
|
|
121
|
+
## Why no bundled docs, no embeddings, no vector DB
|
|
122
|
+
|
|
123
|
+
See `docs/adr/` for the full reasoning. Short version: the corpus is
|
|
124
|
+
Liferay's copyrighted documentation text -- distributing the *tool* that
|
|
125
|
+
scrapes public pages is a different, much lower-risk thing than a third
|
|
126
|
+
party redistributing that text at scale. Plain grep + Read over ~1800
|
|
127
|
+
well-organized Markdown files is fast enough that no search index is needed;
|
|
128
|
+
add one later if that stops being true.
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# liferay-docs-scraper
|
|
2
|
+
|
|
3
|
+
Scrapes `learn.liferay.com/w/dxp/*` into a local, clean Markdown corpus
|
|
4
|
+
(`raw/{capability}/*.md`) and ships a Claude Code skill (`liferay-expert`)
|
|
5
|
+
that answers Liferay DXP questions by searching and citing that corpus.
|
|
6
|
+
|
|
7
|
+
**This repo does not ship Liferay's documentation.** It ships the code that
|
|
8
|
+
scrapes it, and a skill that reads whatever you scrape locally. Each user
|
|
9
|
+
builds and refreshes their own copy directly from learn.liferay.com.
|
|
10
|
+
|
|
11
|
+
## Quickstart
|
|
12
|
+
|
|
13
|
+
The recommended order for a first-time setup: scrape, then install the
|
|
14
|
+
skill, then ask questions.
|
|
15
|
+
|
|
16
|
+
**1. Build the corpus (one-time, ~30-40 min):**
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
uvx --from crawl4ai crawl4ai-setup # one-time, installs Playwright browsers
|
|
20
|
+
uvx --python 3.13 --from "git+https://github.com/mordonez/liferay-docs-scraper" liferay-docs-scraper
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
Run this from anywhere -- it does not write into your current directory;
|
|
24
|
+
see "Reference: the scraper in detail" below for exactly where it goes.
|
|
25
|
+
|
|
26
|
+
**2. Install the skill into whatever project you're working in:**
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
npx skills add mordonez/liferay-docs-scraper --skill liferay-expert -a claude-code
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
You'll see:
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
◇ Installed 1 skill ───────────────────╮
|
|
36
|
+
│ │
|
|
37
|
+
│ ✓ liferay-expert (copied) │
|
|
38
|
+
│ → ./.claude/skills/liferay-expert │
|
|
39
|
+
│ │
|
|
40
|
+
├───────────────────────────────────────╯
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
**3. Ask Claude Code a Liferay question**, e.g. "how do I configure a
|
|
44
|
+
synonym set in Liferay search?" The skill finds the corpus, greps the
|
|
45
|
+
`search` capability, reads `search-administration-and-tuning-synonym-sets.md`,
|
|
46
|
+
and answers grounded in that page -- citing
|
|
47
|
+
`https://learn.liferay.com/w/dxp/search/search-administration-and-tuning/synonym-sets`
|
|
48
|
+
as the source.
|
|
49
|
+
|
|
50
|
+
The corpus is shared across every project where you install the skill (see
|
|
51
|
+
the "Default location" table below), so step 1 is only ever needed once per
|
|
52
|
+
machine -- rerun it later just to refresh, not per-project.
|
|
53
|
+
|
|
54
|
+
**If you install the skill without doing step 1 first** (or its corpus goes
|
|
55
|
+
stale), it notices and tells you what to run rather than guessing or
|
|
56
|
+
answering ungrounded -- it never launches the ~30-40 min scrape on its own
|
|
57
|
+
mid-conversation. See "Step 1/2" in `skills/liferay-expert/SKILL.md` for
|
|
58
|
+
that check.
|
|
59
|
+
|
|
60
|
+
## Reference: the scraper in detail
|
|
61
|
+
|
|
62
|
+
Requires Python 3.10-3.13 (crawl4ai's Playwright dependency doesn't yet
|
|
63
|
+
support 3.14) and [uv](https://docs.astral.sh/uv/).
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
# One-time: installs the Playwright/Chromium browser crawl4ai drives
|
|
67
|
+
uvx --from crawl4ai crawl4ai-setup
|
|
68
|
+
|
|
69
|
+
# From anywhere -- the corpus does NOT go in your current directory.
|
|
70
|
+
# Not on PyPI yet, so install straight from GitHub:
|
|
71
|
+
uvx --python 3.13 --from "git+https://github.com/mordonez/liferay-docs-scraper" liferay-docs-scraper
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
This takes roughly 30-40 minutes (BFS deep crawl of ~1900 pages across 14
|
|
75
|
+
capabilities) and writes to one shared, per-user location (so it's the same
|
|
76
|
+
corpus no matter which project you're in when the skill looks for it):
|
|
77
|
+
|
|
78
|
+
| OS | Default location |
|
|
79
|
+
|---|---|
|
|
80
|
+
| macOS | `~/Library/Application Support/liferay-docs/` |
|
|
81
|
+
| Linux | `~/.local/share/liferay-docs/` (or `$XDG_DATA_HOME/liferay-docs`) |
|
|
82
|
+
| Windows | `%LOCALAPPDATA%\liferay-docs\` |
|
|
83
|
+
|
|
84
|
+
Set `LIFERAY_DOCS_DIR` to override (e.g. to keep a project-local copy instead).
|
|
85
|
+
|
|
86
|
+
Inside that directory:
|
|
87
|
+
|
|
88
|
+
- `raw/{capability}/*.md` — the corpus, one file per page
|
|
89
|
+
- `raw/_navigation/{capability}/*.md` — pure TOC pages, kept but deprioritized
|
|
90
|
+
- `raw/_removed/{capability}/*.md` — pages confirmed gone from the live site
|
|
91
|
+
- `reports/filtered/` — URL manifests, self-hosted prune log, run summary
|
|
92
|
+
|
|
93
|
+
Re-run it anytime (weekly recommended) to refresh: it starts from zero every
|
|
94
|
+
time, so it naturally picks up new pages, updates changed ones, and
|
|
95
|
+
quarantines (never deletes) removed ones. If that directory is (or becomes)
|
|
96
|
+
a git repo -- worth doing once, purely as a local diffing tool, nothing needs
|
|
97
|
+
pushing anywhere -- it also runs `check-regressions` automatically afterward
|
|
98
|
+
and flags any file that shrank by more than half or grew more than 3x versus
|
|
99
|
+
the last commit (signals of a broken fetch); see
|
|
100
|
+
`docs/adr/0001-crawl4ai-based-corpus-pipeline.md` for why that check exists.
|
|
101
|
+
|
|
102
|
+
## Reference: the skill in detail
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
npx skills add mordonez/liferay-docs-scraper --skill liferay-expert
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
Or just copy `skills/liferay-expert/SKILL.md` into `.claude/skills/liferay-expert/`
|
|
109
|
+
in any project. Claude Code picks it up automatically; the skill itself
|
|
110
|
+
resolves `$LIFERAY_DOCS_DIR` (or the OS default above) to find the corpus,
|
|
111
|
+
so it works the same regardless of which project you installed it into.
|
|
112
|
+
|
|
113
|
+
## Why no bundled docs, no embeddings, no vector DB
|
|
114
|
+
|
|
115
|
+
See `docs/adr/` for the full reasoning. Short version: the corpus is
|
|
116
|
+
Liferay's copyrighted documentation text -- distributing the *tool* that
|
|
117
|
+
scrapes public pages is a different, much lower-risk thing than a third
|
|
118
|
+
party redistributing that text at scale. Plain grep + Read over ~1800
|
|
119
|
+
well-organized Markdown files is fast enough that no search index is needed;
|
|
120
|
+
add one later if that stops being true.
|
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
# 0001: Use crawl4ai (self-hosted) for the entire discovery + extraction pipeline
|
|
2
|
+
|
|
3
|
+
- **Status:** Accepted
|
|
4
|
+
- **Date:** 2026-07-01
|
|
5
|
+
- **Scope:** `scripts/crawl4ai_pipeline.py`, `scripts/filter_urls.py`, `scripts/check_regressions.py`, `raw/`, `reports/filtered/`
|
|
6
|
+
|
|
7
|
+
## Context
|
|
8
|
+
|
|
9
|
+
The corpus is a Markdown mirror of `learn.liferay.com/w/dxp/*` (all 14
|
|
10
|
+
capabilities: cloud, search, self-hosted, sites, security, development,
|
|
11
|
+
commerce, personalization, low-code, content-management-system,
|
|
12
|
+
digital-asset-management, integration, ai, getting-started), used as source
|
|
13
|
+
material for a later, separate distillation phase (`docs/distill_system_prompt.txt`,
|
|
14
|
+
`scripts/classify_pages.py`, `scripts/build_nav_index.py`,
|
|
15
|
+
`scripts/save_distilled.py` — untouched by this ADR).
|
|
16
|
+
|
|
17
|
+
The project needs to **re-run the whole pipeline from scratch every week**
|
|
18
|
+
to pick up new pages, detect removed pages, and refresh changed content.
|
|
19
|
+
That recurring, unattended nature is why the tool choice and safety
|
|
20
|
+
behavior below matter more than they would for a one-off scrape.
|
|
21
|
+
|
|
22
|
+
### Starting point: Firecrawl
|
|
23
|
+
|
|
24
|
+
The pipeline originally used Firecrawl (`firecrawl map` for URL discovery,
|
|
25
|
+
then `firecrawl scrape` per URL for content, both via the already-authenticated
|
|
26
|
+
`firecrawl` CLI, no API key management needed). This worked, but:
|
|
27
|
+
|
|
28
|
+
- Firecrawl is a paid API with a monthly credit budget (1,000 credits/cycle
|
|
29
|
+
observed) and a 2-concurrent-job cap on the plan in use. A single full run
|
|
30
|
+
over ~1,000 URLs consumed nearly the entire monthly budget by itself —
|
|
31
|
+
incompatible with running the pipeline **every week**.
|
|
32
|
+
- `only_main_content=true` still included the breadcrumb, sidebar TOC, a
|
|
33
|
+
maintenance banner, and the full global site footer, requiring a
|
|
34
|
+
post-hoc cleanup pass (`scripts/clean_boilerplate.py`) with its own
|
|
35
|
+
regex-based header/footer cutting logic and a "give up, don't touch it"
|
|
36
|
+
fallback for pages whose template didn't match the expected shape.
|
|
37
|
+
|
|
38
|
+
## Decision
|
|
39
|
+
|
|
40
|
+
Replace Firecrawl entirely with **crawl4ai**, a free, self-hosted,
|
|
41
|
+
Playwright-based crawler, run from an isolated Python 3.13 virtualenv
|
|
42
|
+
(`.venv-crawl4ai/`, kept out of git) because crawl4ai's dependency chain
|
|
43
|
+
did not yet support the system's Python 3.14 at the time of writing.
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
python3.13 -m venv .venv-crawl4ai
|
|
47
|
+
source .venv-crawl4ai/bin/activate
|
|
48
|
+
pip install crawl4ai
|
|
49
|
+
crawl4ai-setup # installs the Playwright/Patchright Chromium builds
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
A single `crawl4ai` **BFS deep crawl** (`BFSDeepCrawlStrategy`, seeded at
|
|
53
|
+
`/w/dxp/index`) now does both discovery and extraction in one pass per
|
|
54
|
+
page:
|
|
55
|
+
|
|
56
|
+
- crawl4ai extracts a page's outbound links from the **full page DOM**
|
|
57
|
+
regardless of `css_selector` — confirmed by reading
|
|
58
|
+
`content_scraping_strategy.py`'s `_process_element(url, body, ...)` call,
|
|
59
|
+
which always receives the unfiltered `body`, not the `css_selector`-scoped
|
|
60
|
+
`content_element`. This means the same fetch that gives us clean content
|
|
61
|
+
also gives us the links needed to keep discovering the rest of the site;
|
|
62
|
+
no separate "map" phase is needed.
|
|
63
|
+
- Each visited page is classified against `filter_urls.CAPABILITIES` (14
|
|
64
|
+
capability path prefixes) plus the self-hosted-specific prune rules
|
|
65
|
+
(`SELF_HOSTED_PRUNE_RULES`, for quarterly deprecation/breaking-change
|
|
66
|
+
subpages and a couple of legacy install-doc trees that are out of scope).
|
|
67
|
+
In-scope, unpruned pages are written to `raw/{capability}/{slug}.md`.
|
|
68
|
+
|
|
69
|
+
### Content selector: `.learn-article-content`, not `#main-content`
|
|
70
|
+
|
|
71
|
+
Firecrawl's cleanup problem (breadcrumb/TOC/banner/footer chrome mixed into
|
|
72
|
+
"main content") does not require a Firecrawl-specific fix — it was a
|
|
73
|
+
selector-precision problem. Inspecting the rendered DOM found that
|
|
74
|
+
`learn.liferay.com`'s article template renders:
|
|
75
|
+
|
|
76
|
+
- the maintenance banner and the global site footer as **siblings outside**
|
|
77
|
+
`#main-content`,
|
|
78
|
+
- but the breadcrumb, sidebar TOC, and "Submit Feedback" button **inside**
|
|
79
|
+
`#main-content`, alongside the real article,
|
|
80
|
+
- with the real article (title, body, Resource Type / Feature / Deployment
|
|
81
|
+
Approach tags) isolated in a **nested** element carrying the class
|
|
82
|
+
`.learn-article-content`.
|
|
83
|
+
|
|
84
|
+
Setting `css_selector=".learn-article-content"` instead of `"#main-content"`
|
|
85
|
+
therefore returns already-clean Markdown with no further processing needed.
|
|
86
|
+
This was verified **byte-for-byte identical** to the output of the old
|
|
87
|
+
regex-based header-cut logic on a sample page, and it even fixed the one
|
|
88
|
+
page template (a legacy Knowledge Base article) that the regex approach
|
|
89
|
+
couldn't handle and had to skip.
|
|
90
|
+
|
|
91
|
+
**Result:** all of `scripts/clean_boilerplate.py` (frontmatter-splitting,
|
|
92
|
+
header-cut regex, footer-cut regex, the `NotCleanable` fallback path, and
|
|
93
|
+
its whole CLI) became dead code and was deleted. `crawl4ai_pipeline.py`
|
|
94
|
+
does not need a chrome-stripping step at all.
|
|
95
|
+
|
|
96
|
+
### Link-following exclusions: `ContentTypeFilter`, not a hand-written extension list
|
|
97
|
+
|
|
98
|
+
An early full run picked up several bogus entries in `raw/` — `.zip`,
|
|
99
|
+
`.js`, and `.png` asset links (referenced *from* doc pages, e.g. sample
|
|
100
|
+
downloads and screenshots) that got treated as if they were HTML pages,
|
|
101
|
+
producing empty garbage files. The first fix was a hand-written
|
|
102
|
+
`URLPatternFilter(patterns=["*.zip", "*.js", "*.png", ...], reverse=True)`
|
|
103
|
+
covering 11 extensions.
|
|
104
|
+
|
|
105
|
+
That list was replaced with crawl4ai's built-in
|
|
106
|
+
`ContentTypeFilter(allowed_types=["text/html"])`, which ships an 80+
|
|
107
|
+
extension → MIME-type map and checks the resource type properly instead of
|
|
108
|
+
guessing from a URL suffix — including malformed cases like
|
|
109
|
+
`...icon-actions.pngmd` (a mangled relative-link join our extension list
|
|
110
|
+
would have missed).
|
|
111
|
+
|
|
112
|
+
### Discovery strategy considered and rejected: sitemap-based `AsyncUrlSeeder` / `DomainMapper`
|
|
113
|
+
|
|
114
|
+
crawl4ai also ships `AsyncUrlSeeder` (bulk URL discovery from a sitemap or
|
|
115
|
+
Common Crawl) and `DomainMapper` (8-source domain reconnaissance: sitemap,
|
|
116
|
+
Common Crawl, Wayback Machine, crt.sh, robots.txt, RSS/Atom feeds, homepage
|
|
117
|
+
link extraction, and common-path probing). Both were evaluated as a
|
|
118
|
+
possible replacement for the BFS crawl, since BFS is inherently limited to
|
|
119
|
+
pages reachable by link-following from the seed and did miss some live
|
|
120
|
+
pages (see "known limitation" below).
|
|
121
|
+
|
|
122
|
+
Rejected for this site: `learn.liferay.com/sitemap.xml` is a **sitemap
|
|
123
|
+
index** of 72 per-CMS-layout sub-sitemaps (organized by the page-builder's
|
|
124
|
+
internal layout IDs, not by the KB article tree), and the sub-sitemaps
|
|
125
|
+
sampled were empty. This structure doesn't map usefully onto the `/w/dxp/*`
|
|
126
|
+
KB content this project needs, so sitemap-based seeding would add
|
|
127
|
+
complexity without solving the coverage gap. `DomainMapper`'s extra
|
|
128
|
+
sources (Wayback, crt.sh, Common Crawl) are aimed at discovering unknown
|
|
129
|
+
subdomains/hosts, which isn't relevant to a single, well-known host — using
|
|
130
|
+
it here would just be slower for no benefit. BFS deep crawling remains the
|
|
131
|
+
right tool for this site.
|
|
132
|
+
|
|
133
|
+
### Reliability hardening
|
|
134
|
+
|
|
135
|
+
Two content-integrity bug classes surfaced during the first full 14-capability
|
|
136
|
+
run, neither of which crawl4ai itself flags as a failure (`result.success`
|
|
137
|
+
is `True` in both cases):
|
|
138
|
+
|
|
139
|
+
1. **Client-side error banner instead of real content** ("An unexpected
|
|
140
|
+
error occurred.") — a transient render/server hiccup.
|
|
141
|
+
2. **Wrong or truncated content** — one page briefly returned a *sibling*
|
|
142
|
+
page's content (session/concurrency mixup); another was cut off
|
|
143
|
+
mid-section (incomplete render).
|
|
144
|
+
|
|
145
|
+
Mitigations, in order of where they act:
|
|
146
|
+
|
|
147
|
+
- `wait_for=f"css:{CONTENT_SELECTOR}"` on every fetch: poll for the article
|
|
148
|
+
container to exist in the DOM before extracting, instead of capturing at
|
|
149
|
+
a fixed point in the load sequence. Verified to add no measurable latency.
|
|
150
|
+
This should reduce (not necessarily eliminate) class 2 above; it doesn't
|
|
151
|
+
address class 1, and a session-mixup like the sibling-content case isn't
|
|
152
|
+
a DOM-readiness problem at all.
|
|
153
|
+
- `is_broken_content()`: flags a body under 30 characters or containing
|
|
154
|
+
the known error-banner text, and triggers up to 2 isolated re-fetches
|
|
155
|
+
(`refetch_single_page`, outside the deep crawl, with backoff) before
|
|
156
|
+
giving up. If still broken, the page is **never written** — an existing
|
|
157
|
+
good file is left untouched rather than overwritten with garbage, and the
|
|
158
|
+
URL is reported under "fetch failures" for manual attention.
|
|
159
|
+
- `scripts/check_regressions.py`, run **after** every pipeline run, diffs
|
|
160
|
+
every changed `raw/**/*.md` body (frontmatter excluded) against a given
|
|
161
|
+
git ref (default `HEAD`) and flags:
|
|
162
|
+
- **shrinkage** below 50% of the previous size (default
|
|
163
|
+
`--shrink-threshold`) — this is what actually caught the truncated
|
|
164
|
+
"wrong sibling content" case in review, since that page's body wasn't
|
|
165
|
+
short or error-banner text, just *wrong*, which the inline checks above
|
|
166
|
+
cannot detect (they have no notion of "what this URL is supposed to
|
|
167
|
+
contain" — only git history does).
|
|
168
|
+
- **growth** beyond 3x the previous size (default `--growth-threshold`)
|
|
169
|
+
— added as a safety net for the one failure mode the selector-based
|
|
170
|
+
approach introduces: if `.learn-article-content` ever fails to match on
|
|
171
|
+
some future page, crawl4ai's scraping strategy falls back to the whole
|
|
172
|
+
page body instead of raising, which would show up as an abnormally
|
|
173
|
+
*large* file rather than a short one.
|
|
174
|
+
|
|
175
|
+
**This two-sided check only exists because git is now initialized in
|
|
176
|
+
this repo and every pipeline run is committed.** Before git was
|
|
177
|
+
introduced (partway through the migration, after some content had
|
|
178
|
+
already been overwritten with no way back), there was no way to answer
|
|
179
|
+
"did this run silently corrupt something" other than manual URL-by-URL
|
|
180
|
+
spot checks. Every future run should be followed by
|
|
181
|
+
`check_regressions.py` and a commit before starting the next one.
|
|
182
|
+
|
|
183
|
+
### Removed-content handling: verify-before-quarantine, never hard-delete
|
|
184
|
+
|
|
185
|
+
Because every run starts from zero, a page that existed after the last run
|
|
186
|
+
but wasn't found this run is ambiguous: it could be genuinely removed from
|
|
187
|
+
the site, or it could simply be a page BFS didn't re-discover this time
|
|
188
|
+
(still reachable directly by URL, just no longer linked from wherever the
|
|
189
|
+
crawl reached). The first full run treated "not rediscovered" as "removed"
|
|
190
|
+
and quarantined (moved to `raw/_removed/{capability}/`, logged to
|
|
191
|
+
`reports/filtered/removed_log.jsonl`) 21 pages — **all 21 turned out to
|
|
192
|
+
still be live** (manually verified with direct `curl` requests). This was
|
|
193
|
+
a real near-miss: nothing was permanently lost only because quarantine
|
|
194
|
+
moves files instead of deleting them, but it was a false-positive rate of
|
|
195
|
+
100% on that first attempt.
|
|
196
|
+
|
|
197
|
+
Fixed with `is_confirmed_gone(url)`: before quarantining any orphan
|
|
198
|
+
candidate, do a direct `HEAD` request (via `urllib.request`, no browser)
|
|
199
|
+
straight to that URL. Only a confirmed `404`/`410` gets quarantined.
|
|
200
|
+
**Any other outcome — 200, a different error, a timeout, a network hiccup
|
|
201
|
+
on our end — is treated as "not confirmed," and the file is left in place**
|
|
202
|
+
and reported separately ("still alive, BFS coverage gap") for manual
|
|
203
|
+
review. As of this writing, 21 pages remain in that "known-live-but-currently-unlinked"
|
|
204
|
+
state every run — a real, small, accepted gap in BFS coverage, not a bug to
|
|
205
|
+
chase further given the sitemap-seeding alternative was rejected above.
|
|
206
|
+
|
|
207
|
+
Additionally, if a capability's in-scope page count drops below 50% of its
|
|
208
|
+
previous count (`QUARANTINE_SAFETY_RATIO`), quarantine is skipped
|
|
209
|
+
*entirely* for that capability and flagged for manual review — protecting
|
|
210
|
+
against a partially-failed crawl run being mistaken for mass content
|
|
211
|
+
removal.
|
|
212
|
+
|
|
213
|
+
### Scope: all 14 capabilities, not just 6
|
|
214
|
+
|
|
215
|
+
The pipeline originally targeted 6 of the 14 capabilities listed on
|
|
216
|
+
`/w/dxp/index` (cloud, search, self-hosted, sites, security, development),
|
|
217
|
+
matching a Firecrawl-era credit budget that made processing all ~1,900
|
|
218
|
+
pages impractical. Once crawl4ai removed the cost constraint, scope was
|
|
219
|
+
expanded to all 14 (`filter_urls.CAPABILITIES` now lists all of them;
|
|
220
|
+
`OUT_OF_SCOPE_PREFIXES` is empty). The BFS crawl already visited every page
|
|
221
|
+
under `/w/dxp/*` regardless of capability scope (needed it to know what to
|
|
222
|
+
exclude), so widening scope did not meaningfully increase crawl time — it
|
|
223
|
+
only changed which already-fetched pages get persisted.
|
|
224
|
+
|
|
225
|
+
## Consequences
|
|
226
|
+
|
|
227
|
+
**Positive**
|
|
228
|
+
|
|
229
|
+
- No per-page cost and no concurrency cap tied to a paid plan — the
|
|
230
|
+
pipeline can run in full every week indefinitely.
|
|
231
|
+
- Cleaner content at the source (CSS selector) instead of post-hoc regex
|
|
232
|
+
cleanup — less code, and it fixed a page template that regex cutting
|
|
233
|
+
couldn't handle.
|
|
234
|
+
- Full 14-capability coverage, not a 6-capability subset chosen for cost
|
|
235
|
+
reasons.
|
|
236
|
+
- `scripts/` shrank from 9 files to 6 relevant ones (`crawl4ai_pipeline.py`,
|
|
237
|
+
`filter_urls.py`, `check_regressions.py`, plus the untouched distillation
|
|
238
|
+
scripts): `extract_content.py`, `poc_crawl4ai.py`, and
|
|
239
|
+
`clean_boilerplate.py` were deleted as dead weight, along with the
|
|
240
|
+
one-time `reports/dxp_urls.json` Firecrawl-map dump.
|
|
241
|
+
|
|
242
|
+
**Negative / accepted risks**
|
|
243
|
+
|
|
244
|
+
- Requires a separate Python 3.13 virtualenv and ~350MB of downloaded
|
|
245
|
+
Chromium browser builds, kept outside git (`.venv-crawl4ai/` is
|
|
246
|
+
gitignored) — an operational dependency Firecrawl (a hosted API) didn't
|
|
247
|
+
have.
|
|
248
|
+
- Self-hosted headless-browser crawling surfaced content-integrity bugs
|
|
249
|
+
(error banners reported as success, cross-page content mixups,
|
|
250
|
+
truncated renders) that a hosted, more mature scraping API might handle
|
|
251
|
+
internally. Mitigated as described above, but not eliminated with
|
|
252
|
+
certainty — `check_regressions.py` after every run is a required step,
|
|
253
|
+
not an optional nicety.
|
|
254
|
+
- BFS link-following has a small, accepted coverage gap (~1% of pages,
|
|
255
|
+
currently 21) for pages that are live but unlinked from anywhere our
|
|
256
|
+
crawl reaches. These are never silently dropped (verify-before-quarantine
|
|
257
|
+
keeps them in place) but they also don't get refreshed automatically;
|
|
258
|
+
periodic manual review of the "still alive" report is needed.
|
|
259
|
+
- The BFS crawl visits every page under `/w/dxp/*` (~1,900+) even though
|
|
260
|
+
only pages matching a known capability prefix get persisted — some
|
|
261
|
+
wasted rendering work compared to a hypothetically perfect targeted
|
|
262
|
+
crawl, judged acceptable since it's free and still fast (tens of minutes,
|
|
263
|
+
not hours).
|
|
264
|
+
|
|
265
|
+
## Lessons learned (for future runs / future migrations like this one)
|
|
266
|
+
|
|
267
|
+
1. **Initialize version control *before* running anything that overwrites
|
|
268
|
+
a corpus in place, not after.** This repo had no git history when the
|
|
269
|
+
first full crawl4ai run started; by the time `git init` happened
|
|
270
|
+
(prompted by the user asking "shouldn't we have backed this up first?"),
|
|
271
|
+
a large fraction of `raw/` had already been overwritten with no way to
|
|
272
|
+
recover the pre-migration (Firecrawl) content. Nothing was lost in the
|
|
273
|
+
end (the new content was verified equivalent), but it was luck, not
|
|
274
|
+
process, that made that true.
|
|
275
|
+
2. **"Not rediscovered by a link-following crawl" is not proof of
|
|
276
|
+
removal.** Verify with a direct request to the specific URL before
|
|
277
|
+
taking any destructive/quarantine action based on absence.
|
|
278
|
+
3. **A tool reporting `success: True` is not proof the content is
|
|
279
|
+
correct.** Both content-integrity bugs found here were fetches crawl4ai
|
|
280
|
+
considered successful. Application-level validation (length/marker
|
|
281
|
+
checks, and especially diffing against known-good history) is still
|
|
282
|
+
necessary on top of the library's own success/failure signal.
|
|
283
|
+
4. **Prefer a precise CSS selector over post-hoc cleanup regex.** Spending
|
|
284
|
+
time inspecting the actual rendered DOM (via a throwaway
|
|
285
|
+
`BeautifulSoup` pass over `result.cleaned_html`) to find a tighter,
|
|
286
|
+
purpose-built class (`.learn-article-content`) eliminated an entire
|
|
287
|
+
category of custom cleanup code, rather than making that code more
|
|
288
|
+
robust.
|
|
289
|
+
5. **Prefer a library's built-in filter over a hand-maintained list**
|
|
290
|
+
when one exists and fits (`ContentTypeFilter` vs. a manually maintained
|
|
291
|
+
file-extension list) — it's both less code and more correct.
|
|
292
|
+
6. **Re-evaluate "which tool for URL discovery" per-site, not in the
|
|
293
|
+
abstract.** Sitemap-based seeding is the officially recommended
|
|
294
|
+
"fast path" for bulk discovery in crawl4ai's own docs, but this
|
|
295
|
+
specific site's sitemap structure (CMS-layout-based, not
|
|
296
|
+
content-tree-based) made it a worse fit than BFS deep crawling despite
|
|
297
|
+
BFS's own coverage gap.
|
|
298
|
+
|
|
299
|
+
## Follow-ups (explicitly deferred, not part of this decision)
|
|
300
|
+
|
|
301
|
+
- No cron/scheduled job has been configured yet to actually run this
|
|
302
|
+
weekly and unattended — `crawl4ai_pipeline.py` is ready for that, but
|
|
303
|
+
wiring up the recurring execution was intentionally left as a separate,
|
|
304
|
+
explicit step requiring its own confirmation.
|
|
305
|
+
- The 21 "known-live-but-unlinked" pages are not being actively
|
|
306
|
+
re-fetched; they need either periodic manual attention or a future
|
|
307
|
+
decision on whether to seed them explicitly (e.g., a small hardcoded
|
|
308
|
+
seed list) so they participate in the diffing/refresh cycle like
|
|
309
|
+
everything else.
|
|
310
|
+
- The distillation phase (`classify_pages.py`, `build_nav_index.py`,
|
|
311
|
+
`save_distilled.py`, `docs/distill_system_prompt.txt`) is unrelated to
|
|
312
|
+
and unaffected by this decision.
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "liferay-docs-scraper"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Scrape learn.liferay.com/w/dxp into a local Markdown corpus (raw/{capability}/*.md) for the liferay-expert Claude Code skill."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.10,<3.14"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"crawl4ai>=0.9.0",
|
|
9
|
+
]
|
|
10
|
+
|
|
11
|
+
[project.scripts]
|
|
12
|
+
liferay-docs-scraper = "liferay_docs_scraper.pipeline:main"
|
|
13
|
+
check-regressions = "liferay_docs_scraper.check_regressions:main"
|
|
14
|
+
|
|
15
|
+
[build-system]
|
|
16
|
+
requires = ["hatchling"]
|
|
17
|
+
build-backend = "hatchling.build"
|
|
18
|
+
|
|
19
|
+
[tool.hatch.build.targets.wheel]
|
|
20
|
+
packages = ["src/liferay_docs_scraper"]
|