liferay-docs-scraper 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- liferay_docs_scraper/__init__.py +0 -0
- liferay_docs_scraper/check_regressions.py +105 -0
- liferay_docs_scraper/classify_pages.py +41 -0
- liferay_docs_scraper/filter_urls.py +151 -0
- liferay_docs_scraper/pipeline.py +451 -0
- liferay_docs_scraper-0.1.0.dist-info/METADATA +128 -0
- liferay_docs_scraper-0.1.0.dist-info/RECORD +9 -0
- liferay_docs_scraper-0.1.0.dist-info/WHEEL +4 -0
- liferay_docs_scraper-0.1.0.dist-info/entry_points.txt +3 -0
|
File without changes
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Flag suspicious content changes in raw/ after a crawl4ai refresh, using git.
|
|
3
|
+
|
|
4
|
+
Compares the working tree against a given git ref (default: HEAD, i.e. the
|
|
5
|
+
last commit) for every raw/**/*.md file:
|
|
6
|
+
- Shrank a lot (body text, not counting frontmatter): the signature of a
|
|
7
|
+
broken/partial fetch overwriting a good page.
|
|
8
|
+
- Grew a lot: the signature of CONTENT_SELECTOR failing to match and
|
|
9
|
+
crawl4ai falling back to the whole page (breadcrumb/nav/footer chrome
|
|
10
|
+
and all) instead of raising an error.
|
|
11
|
+
|
|
12
|
+
Operates on filter_urls.resolve_docs_dir() (the same shared corpus location
|
|
13
|
+
the scraper writes to and the skill reads from), not the current directory.
|
|
14
|
+
|
|
15
|
+
Usage:
|
|
16
|
+
uvx --from liferay-docs-scraper check-regressions [--ref HEAD] [--shrink-threshold 0.5] [--growth-threshold 3.0]
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import argparse
|
|
20
|
+
import subprocess
|
|
21
|
+
|
|
22
|
+
from .filter_urls import resolve_docs_dir
|
|
23
|
+
|
|
24
|
+
ROOT = resolve_docs_dir()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def body_len(text: str) -> int:
    """Return the length of ``text`` excluding a leading YAML frontmatter block.

    The frontmatter is stripped only when the text actually opens with a
    ``---`` line and a later ``---`` line closes it. In every other case the
    whole text counts as body, so Markdown horizontal rules further down a
    file without frontmatter are never mistaken for frontmatter delimiters.
    """
    opener = "---\n"
    if not text.startswith(opener):
        return len(text)
    closer = "\n---\n"
    # Start the search on the newline that ends the opening delimiter so an
    # empty frontmatter block ("---\n---\n") is still recognised.
    end = text.find(closer, len(opener) - 1)
    if end == -1:
        return len(text)
    return len(text) - (end + len(closer))
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def git_show(ref: str, path: str) -> str | None:
    """Return the content of ``path`` as recorded at git ``ref``.

    Runs ``git show <ref>:<path>`` with ROOT as the working directory.
    Returns None when git exits with a non-zero status (the caller treats
    that as "nothing to compare against").
    """
    result = subprocess.run(
        ["git", "show", f"{ref}:{path}"],
        cwd=ROOT,
        capture_output=True,
        text=True,
        # Decode explicitly as UTF-8: the working-tree copy is read as UTF-8,
        # and with the locale's default encoding the two lengths would not
        # be comparable on a non-UTF-8 locale.
        encoding="utf-8",
    )
    if result.returncode != 0:
        return None
    return result.stdout
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def changed_raw_files(ref: str) -> list[str]:
    """List the ``.md`` paths under raw/ that ``git diff --name-only`` reports against ``ref``.

    git runs with ROOT as its working directory; because of ``check=True`` a
    non-zero git exit raises ``subprocess.CalledProcessError``.
    """
    command = ["git", "diff", "--name-only", ref, "--", "raw/"]
    completed = subprocess.run(command, cwd=ROOT, capture_output=True, text=True, check=True)
    markdown_paths = []
    for name in completed.stdout.splitlines():
        if name.endswith(".md"):
            markdown_paths.append(name)
    return markdown_paths
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def run_check(ref: str = "HEAD", shrink_threshold: float = 0.5, growth_threshold: float = 3.0) -> bool:
    """Print the regression report; return True iff something looked suspicious.

    For every changed raw/ Markdown file that exists both at ``ref`` and in
    the working tree, the body length (frontmatter excluded) is compared:

    - new/old ratio below ``shrink_threshold`` -> reported as shrunk
    - new/old ratio above ``growth_threshold`` -> reported as grown

    Files that are new, missing from the working tree, or whose old body was
    empty are skipped.
    """
    changed = changed_raw_files(ref)
    print(f"Archivos .md cambiados en raw/ vs {ref}: {len(changed)}")

    shrunk, grew = [], []
    for rel_path in changed:
        full_path = ROOT / rel_path
        old_text = git_show(ref, rel_path)
        if old_text is None:
            continue  # new file, nothing to compare
        if not full_path.exists():
            continue  # deleted/moved (e.g. quarantined), handled separately

        new_text = full_path.read_text(encoding="utf-8")
        old_len = body_len(old_text)
        new_len = body_len(new_text)
        if old_len == 0:
            continue  # no baseline to form a ratio against
        ratio = new_len / old_len
        if ratio < shrink_threshold:
            shrunk.append((rel_path, old_len, new_len, ratio))
        elif ratio > growth_threshold:
            grew.append((rel_path, old_len, new_len, ratio))

    if shrunk:
        print(f"\nSOSPECHOSOS ({len(shrunk)}) -- perdieron más del "
              f"{(1 - shrink_threshold) * 100:.0f}% del contenido:")
        # Worst shrinkage (smallest ratio) first.
        for rel_path, old_len, new_len, ratio in sorted(shrunk, key=lambda x: x[3]):
            print(f" {rel_path}: {old_len} -> {new_len} chars ({ratio:.0%})")
    else:
        print("\nNinguno por debajo del umbral de encogimiento -- sin señales de pérdida de contenido.")

    if grew:
        # ":g" instead of ":.0f": a fractional threshold such as 2.5 must be
        # reported as "2.5x", not rounded to "2x"; 3.0 still prints as "3x".
        print(f"\nSOSPECHOSOS ({len(grew)}) -- crecieron más de {growth_threshold:g}x "
              f"(posible fallback a la página completa sin selector):")
        # Largest growth first.
        for rel_path, old_len, new_len, ratio in sorted(grew, key=lambda x: -x[3]):
            print(f" {rel_path}: {old_len} -> {new_len} chars ({ratio:.1f}x)")

    return bool(shrunk or grew)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def main() -> None:
    """Command-line entry point: parse the options and print the regression report.

    The boolean returned by run_check() is discarded here, so the process
    exit status does not reflect whether anything suspicious was found.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--ref", default="HEAD")
    cli.add_argument(
        "--shrink-threshold",
        type=float,
        default=0.5,
        help="Flag files whose body shrank below this fraction of the original.",
    )
    cli.add_argument(
        "--growth-threshold",
        type=float,
        default=3.0,
        help="Flag files whose body grew beyond this multiple of the original.",
    )
    options = cli.parse_args()
    run_check(options.ref, options.shrink_threshold, options.growth_threshold)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
if __name__ == "__main__":
|
|
105
|
+
main()
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Navigation-vs-content heuristic, shared with crawl4ai_pipeline.py.
|
|
3
|
+
|
|
4
|
+
An "index"/navigation page is one whose body (frontmatter stripped) is
|
|
5
|
+
short and consists mostly of links to subpages -- no substantial technical
|
|
6
|
+
content of its own. Everything else is "content".
|
|
7
|
+
|
|
8
|
+
Heuristic (no API calls, pure text analysis):
|
|
9
|
+
- Strip Markdown link syntax down to visible text (`[text](url)` -> `text`)
|
|
10
|
+
and strip heading/emphasis markup (`#`, `*`, `_`, backticks).
|
|
11
|
+
- total_words: word count of that visible text.
|
|
12
|
+
- link_ratio: fraction of those words that come from inside a Markdown
|
|
13
|
+
link's link-text span.
|
|
14
|
+
- "index" iff total_words < INDEX_MAX_WORDS and link_ratio >= INDEX_MIN_LINK_RATIO.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import re
|
|
18
|
+
|
|
19
|
+
INDEX_MAX_WORDS = 150
|
|
20
|
+
INDEX_MIN_LINK_RATIO = 0.5
|
|
21
|
+
|
|
22
|
+
# Inline Markdown link: group 1 is the link text, group 2 the target.
LINK_RE = re.compile(r"\[([^\]]*)\]\(([^)]*)\)")
# Heading/emphasis/code/escape characters removed before counting words.
MARKUP_RE = re.compile(r"[#*_`\\]")


def analyze_body(body: str) -> tuple[int, float]:
    """Return ``(total_words, link_ratio)`` for a page body.

    total_words is the word count of the visible text: Markdown links
    reduced to their link text, then markup characters removed.
    link_ratio is the share of those words that come from link text, or
    0.0 when the body has no words at all.
    """
    # Count link words on markup-stripped link text, i.e. on the same basis
    # as total_words. Counting the raw link text would let a link whose text
    # is pure markup (e.g. "[#](url)") contribute words that are absent from
    # the total and push the ratio above 1.0.
    link_word_count = sum(
        len(MARKUP_RE.sub("", match.group(1)).split())
        for match in LINK_RE.finditer(body)
    )

    visible = LINK_RE.sub(lambda m: m.group(1), body)
    visible = MARKUP_RE.sub("", visible)
    total_words = len(visible.split())

    link_ratio = (link_word_count / total_words) if total_words else 0.0
    return total_words, link_ratio
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def classify(total_words: int, link_ratio: float) -> str:
    """Label a page "index" when it is both short and link-heavy, else "content".

    Short means fewer than INDEX_MAX_WORDS words; link-heavy means a
    link_ratio of at least INDEX_MIN_LINK_RATIO.
    """
    is_short = total_words < INDEX_MAX_WORDS
    is_link_heavy = link_ratio >= INDEX_MIN_LINK_RATIO
    return "index" if is_short and is_link_heavy else "content"
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Shared URL/capability utilities for the crawl4ai_pipeline.py corpus.
|
|
3
|
+
|
|
4
|
+
Capability classification (matching learn.liferay.com/w/dxp URLs to one of
|
|
5
|
+
the 14 capabilities listed on /w/dxp/index, plus the self-hosted prune
|
|
6
|
+
rules), the URL->filename/frontmatter helpers used when writing pages to
|
|
7
|
+
raw/{capability}/*.md, and resolve_docs_dir() -- the one place that decides
|
|
8
|
+
where that raw/ corpus actually lives on disk.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import hashlib
|
|
12
|
+
import os
|
|
13
|
+
import sys
|
|
14
|
+
from datetime import datetime, timezone
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from urllib.parse import urlparse
|
|
17
|
+
|
|
18
|
+
CAPABILITIES = {
|
|
19
|
+
"cloud": "/w/dxp/cloud",
|
|
20
|
+
"search": "/w/dxp/search",
|
|
21
|
+
"self-hosted": "/w/dxp/self-hosted-installation-and-upgrades",
|
|
22
|
+
"sites": "/w/dxp/sites",
|
|
23
|
+
"security": "/w/dxp/security-and-administration",
|
|
24
|
+
"development": "/w/dxp/development",
|
|
25
|
+
"commerce": "/w/dxp/commerce",
|
|
26
|
+
"personalization": "/w/dxp/personalization",
|
|
27
|
+
"low-code": "/w/dxp/low-code",
|
|
28
|
+
"content-management-system": "/w/dxp/content-management-system",
|
|
29
|
+
"digital-asset-management": "/w/dxp/digital-asset-management",
|
|
30
|
+
"integration": "/w/dxp/integration",
|
|
31
|
+
"ai": "/w/dxp/ai",
|
|
32
|
+
"getting-started": "/w/dxp/getting-started",
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
# All 14 capabilities listed on https://learn.liferay.com/w/dxp/index are in
|
|
36
|
+
# scope now; nothing under /w/dxp is deliberately excluded anymore.
|
|
37
|
+
OUT_OF_SCOPE_PREFIXES: list[str] = []
|
|
38
|
+
|
|
39
|
+
# (rule label, substring whose presence -- followed by more path -- excludes the URL)
|
|
40
|
+
SELF_HOSTED_PRUNE_RULES = [
|
|
41
|
+
(
|
|
42
|
+
"deprecations-and-breaking-changes-reference subpage",
|
|
43
|
+
"/upgrading-liferay/deprecations-and-breaking-changes-reference/",
|
|
44
|
+
),
|
|
45
|
+
(
|
|
46
|
+
"installing-earlier-liferay-versions-on-application-servers subpage",
|
|
47
|
+
"/installing-earlier-liferay-versions-on-application-servers/",
|
|
48
|
+
),
|
|
49
|
+
(
|
|
50
|
+
"cne-aws-ready subpage",
|
|
51
|
+
"/cloud-native-experience/cne-cloud-provider-ready/cne-aws-ready/",
|
|
52
|
+
),
|
|
53
|
+
(
|
|
54
|
+
"cne-gcp-ready subpage",
|
|
55
|
+
"/cloud-native-experience/cne-cloud-provider-ready/cne-gcp-ready/",
|
|
56
|
+
),
|
|
57
|
+
]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def normalize(url: str) -> str:
    """Drop one trailing slash from the URL path; leave every other component alone.

    A path that is exactly "/" is kept as it is.
    """
    parts = urlparse(url)
    trimmed = parts.path
    if trimmed != "/":
        trimmed = trimmed.removesuffix("/")
    return parts._replace(path=trimmed).geturl()
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def matches_prefix(path: str, prefix: str) -> bool:
    """True when ``path`` equals ``prefix`` or continues it with a "/" segment boundary."""
    if path == prefix:
        return True
    return path.startswith(f"{prefix}/")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def prune_reason(path: str) -> str | None:
    """Return the label of the first SELF_HOSTED_PRUNE_RULES entry whose substring occurs in ``path``.

    Returns None when no rule matches.
    """
    return next(
        (rule_label for rule_label, fragment in SELF_HOSTED_PRUNE_RULES if fragment in path),
        None,
    )
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def classify_url(url: str) -> dict:
    """Classify a single (already-normalized) URL for the capability pipeline.

    The result dict carries three keys:
    - capability: name of the first capability whose prefix matches the URL
      path, or None when no capability matches
    - prune_reason: label of the matching self-hosted prune rule, or None
      (only evaluated for the "self-hosted" capability)
    - known_out_of_scope: True when no capability matched but the path falls
      under one of OUT_OF_SCOPE_PREFIXES, i.e. a known exclusion rather than
      an unrecognized URL worth flagging for manual review
    """
    path = urlparse(url).path
    capability = next(
        (name for name, prefix in CAPABILITIES.items() if matches_prefix(path, prefix)),
        None,
    )

    if capability is None:
        excluded = any(matches_prefix(path, prefix) for prefix in OUT_OF_SCOPE_PREFIXES)
        return {"capability": None, "prune_reason": None, "known_out_of_scope": excluded}

    # Prune rules are consulted for the self-hosted capability only.
    reason = None
    if capability == "self-hosted":
        reason = prune_reason(path)
    return {"capability": capability, "prune_reason": reason, "known_out_of_scope": False}
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def slugify(url: str, prefix: str) -> str:
    """Turn the URL path after ``prefix`` into a flat filename stem.

    The first ``len(prefix)`` characters of the path are dropped, surrounding
    slashes are trimmed, and the remaining slashes become hyphens. An empty
    remainder yields "index".
    """
    tail = urlparse(url).path[len(prefix):].strip("/")
    return "-".join(tail.split("/")) if tail else "index"
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def build_frontmatter(url: str, capability: str, markdown: str) -> str:
    """Render the frontmatter block that precedes a page's Markdown.

    The block holds the page URL, its capability, the current UTC time as
    ``fetched_at`` and the SHA-256 of the UTF-8 encoded ``markdown`` as
    ``content_hash``. It ends with a newline after the closing ``---``.
    """
    digest = hashlib.sha256(markdown.encode("utf-8")).hexdigest()
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    return (
        "---\n"
        f'url: "{url}"\n'
        f"capability: {capability}\n"
        f'fetched_at: "{stamp}"\n'
        f'content_hash: "sha256:{digest}"\n'
        "---\n"
    )
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _default_data_dir() -> Path:
    """Return the per-user ``liferay-docs`` data directory for the current OS.

    - Windows: under %LOCALAPPDATA%, falling back to ~/AppData/Local
    - macOS: under ~/Library/Application Support
    - anything else: under $XDG_DATA_HOME, falling back to ~/.local/share

    An environment variable that is unset or empty triggers the fallback.
    """
    app_dir = "liferay-docs"
    platform = sys.platform
    if platform == "darwin":
        return Path.home() / "Library" / "Application Support" / app_dir
    if platform == "win32":
        root = os.environ.get("LOCALAPPDATA")
        if not root:
            root = str(Path.home() / "AppData" / "Local")
    else:
        # Linux and other Unix-likes: XDG Base Directory spec
        root = os.environ.get("XDG_DATA_HOME")
        if not root:
            root = str(Path.home() / ".local" / "share")
    return Path(root) / app_dir
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def resolve_docs_dir() -> Path:
    """Return the directory holding the local corpus (raw/, reports/filtered/).

    A non-empty $LIFERAY_DOCS_DIR wins (with ``~`` expanded); otherwise the
    OS-appropriate default from _default_data_dir() is used, so the corpus is
    shared across projects unless explicitly overridden.
    """
    configured = os.environ.get("LIFERAY_DOCS_DIR")
    return Path(configured).expanduser() if configured else _default_data_dir()
|
|
@@ -0,0 +1,451 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Weekly from-scratch refresh of the learn.liferay.com/w/dxp corpus, crawl4ai-only.
|
|
3
|
+
|
|
4
|
+
Builds raw/{capability}/*.md under filter_urls.resolve_docs_dir(): the
|
|
5
|
+
$LIFERAY_DOCS_DIR directory if that env var is set, otherwise one shared,
|
|
6
|
+
OS-appropriate per-user data directory (e.g. ~/Library/Application
|
|
7
|
+
Support/liferay-docs on macOS, %LOCALAPPDATA%\\liferay-docs on Windows,
|
|
8
|
+
~/.local/share/liferay-docs on Linux). Deliberately NOT the current working
|
|
9
|
+
directory -- the liferay-expert skill looks in that same shared location
|
|
10
|
+
regardless of which project you're in when you ask a question, so you
|
|
11
|
+
don't end up with a separate copy of the corpus per project.
|
|
12
|
+
|
|
13
|
+
A single crawl4ai deep crawl handles both URL discovery and content
|
|
14
|
+
extraction:
|
|
15
|
+
|
|
16
|
+
- A BFS deep crawl starts at /w/dxp/index and follows every internal link
|
|
17
|
+
under /w/dxp/*. crawl4ai extracts links from the FULL page regardless of
|
|
18
|
+
css_selector, so this single crawl gets us both (a) the complete current
|
|
19
|
+
set of URLs on the site and (b) each page's Markdown scoped to
|
|
20
|
+
CONTENT_SELECTOR, in one visit per page. That selector (see below) is
|
|
21
|
+
precise enough that no further chrome-stripping is needed -- what
|
|
22
|
+
crawl4ai returns is already the final page content.
|
|
23
|
+
- Each page is classified with filter_urls.py's classify_url (capability
|
|
24
|
+
prefixes + self-hosted prune rules) and, if in scope, written to
|
|
25
|
+
raw/{capability}/{slug}.md -- unless classify_pages.py's heuristic
|
|
26
|
+
(reused here, not duplicated) says it's a pure navigation/TOC page with
|
|
27
|
+
no substantial content of its own, in which case it goes to
|
|
28
|
+
raw/_navigation/{capability}/{slug}.md instead. This keeps
|
|
29
|
+
raw/{capability}/ as signal for the liferay-expert skill, while still
|
|
30
|
+
preserving the navigation pages (not deleting them) in case they're
|
|
31
|
+
useful later.
|
|
32
|
+
- Because every run starts from zero, a page that existed last run but
|
|
33
|
+
isn't found this run (removed from the site, or now out of scope/pruned)
|
|
34
|
+
is a *candidate* for quarantine -- but BFS link-following can miss a page
|
|
35
|
+
that's still live (no longer linked from anywhere our crawl reached,
|
|
36
|
+
while still resolving directly), so before quarantining anything we do a
|
|
37
|
+
direct HTTP check on each candidate's own URL. Only a confirmed non-200
|
|
38
|
+
gets quarantined (moved to raw/_removed/{capability}/{slug}.md, logged to
|
|
39
|
+
reports/filtered/removed_log.jsonl); anything that still responds, or
|
|
40
|
+
that we simply couldn't reach to check, is left in place and flagged for
|
|
41
|
+
manual review instead. If a capability's discovered count drops
|
|
42
|
+
implausibly (crawl likely failed partway), quarantine for that capability
|
|
43
|
+
is skipped entirely and flagged for manual review instead of trusting a
|
|
44
|
+
possibly-broken run.
|
|
45
|
+
- reports/filtered/{capability}_urls.txt, self-hosted_pruned.txt and
|
|
46
|
+
summary.json are regenerated from this run's live results, so they always
|
|
47
|
+
reflect the current corpus (same format filter_urls.py produces).
|
|
48
|
+
- Once everything above is written, check_regressions.py's run_check()
|
|
49
|
+
runs automatically against the last git commit, if resolve_docs_dir()
|
|
50
|
+
is itself a git repo (worth `git init`-ing once, purely as a local
|
|
51
|
+
diffing tool -- nothing needs to be pushed anywhere). Skipped otherwise,
|
|
52
|
+
or with --skip-regression-check.
|
|
53
|
+
|
|
54
|
+
Setup and run (see README.md for the full explanation):
|
|
55
|
+
uvx --from crawl4ai crawl4ai-setup # one-time: installs Playwright browsers
|
|
56
|
+
uvx --python 3.13 --from "git+https://github.com/mordonez/liferay-docs-scraper" liferay-docs-scraper
|
|
57
|
+
# (not on PyPI yet -- once published, just `uvx liferay-docs-scraper`)
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
import argparse
|
|
61
|
+
import asyncio
|
|
62
|
+
import json
|
|
63
|
+
import shutil
|
|
64
|
+
import sys
|
|
65
|
+
import urllib.error
|
|
66
|
+
import urllib.request
|
|
67
|
+
from dataclasses import dataclass, field
|
|
68
|
+
from datetime import datetime, timezone
|
|
69
|
+
from pathlib import Path
|
|
70
|
+
|
|
71
|
+
from .check_regressions import run_check as run_regression_check
|
|
72
|
+
from .classify_pages import analyze_body, classify as classify_navigation
|
|
73
|
+
from .filter_urls import (
|
|
74
|
+
CAPABILITIES,
|
|
75
|
+
SELF_HOSTED_PRUNE_RULES,
|
|
76
|
+
build_frontmatter,
|
|
77
|
+
classify_url,
|
|
78
|
+
normalize,
|
|
79
|
+
resolve_docs_dir,
|
|
80
|
+
slugify,
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
|
|
84
|
+
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, ContentTypeFilter, DomainFilter, FilterChain, URLPatternFilter
|
|
85
|
+
|
|
86
|
+
ROOT = resolve_docs_dir()
|
|
87
|
+
RAW_DIR = ROOT / "raw"
|
|
88
|
+
REMOVED_DIR = RAW_DIR / "_removed"
|
|
89
|
+
NAVIGATION_DIR = RAW_DIR / "_navigation"
|
|
90
|
+
FILTERED_DIR = ROOT / "reports" / "filtered"
|
|
91
|
+
REMOVED_LOG = FILTERED_DIR / "removed_log.jsonl"
|
|
92
|
+
|
|
93
|
+
SEED_URL = "https://learn.liferay.com/w/dxp/index"
|
|
94
|
+
ALLOWED_DOMAIN = "learn.liferay.com"
|
|
95
|
+
URL_SCOPE_PATTERN = "*/w/dxp*"
|
|
96
|
+
# learn.liferay.com's article template puts the breadcrumb, sidebar TOC, and
|
|
97
|
+
# the actual article body all inside #main-content, with the maintenance
|
|
98
|
+
# banner and global footer outside it. .learn-article-content is scoped
|
|
99
|
+
# tighter still: just the title, body, and resource-type tags -- no
|
|
100
|
+
# breadcrumb/TOC/"Submit Feedback" chrome to strip afterward.
|
|
101
|
+
CONTENT_SELECTOR = ".learn-article-content"
|
|
102
|
+
|
|
103
|
+
DEFAULT_MAX_DEPTH = 12
|
|
104
|
+
DEFAULT_MAX_PAGES = 3000
|
|
105
|
+
# If a capability's freshly discovered URL count falls below this fraction of
|
|
106
|
+
# its previous count, treat the run as suspect and skip quarantining orphans
|
|
107
|
+
# for that capability rather than mass-deleting good content on a bad crawl.
|
|
108
|
+
QUARANTINE_SAFETY_RATIO = 0.5
|
|
109
|
+
|
|
110
|
+
# Some fetches render a client-side error banner instead of the real page
|
|
111
|
+
# (transient rendering/server hiccup) -- crawl4ai still reports these as a
|
|
112
|
+
# "successful" fetch, so we have to catch it ourselves and retry.
|
|
113
|
+
ERROR_MARKERS = ["An unexpected error occurred."]
|
|
114
|
+
MIN_ACCEPTABLE_BODY_LENGTH = 30
|
|
115
|
+
CONTENT_RETRY_ATTEMPTS = 3
|
|
116
|
+
CONTENT_RETRY_DELAY_SECONDS = 3.0
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
@dataclass
class PageOutcome:
    """Result of writing one in-scope page during a crawl run."""

    url: str  # page URL as recorded by run_crawl (after normalize())
    capability: str  # capability name the URL was classified under
    slug: str  # filename stem the page was written as
    status: str  # "new" | "updated" | "unchanged"
    is_navigation: bool = False  # True when the page was classified as "index"
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
@dataclass
class RunStats:
    """Counters and per-capability results accumulated over one crawl run."""

    discovered_total: int = 0  # successfully fetched results, in scope or not
    fetch_failed: list[str] = field(default_factory=list)  # URLs whose fetch failed or still looked broken after retries
    unmatched: list[str] = field(default_factory=list)  # URLs with no capability that are not known out-of-scope
    pruned: list[tuple] = field(default_factory=list)  # (url, prune rule label) pairs
    # capability name -> list of PageOutcome, pre-seeded with an empty list per capability
    outcomes: dict = field(default_factory=lambda: {name: [] for name in CAPABILITIES})
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def read_existing_hash(path: Path) -> str | None:
|
|
138
|
+
if not path.exists():
|
|
139
|
+
return None
|
|
140
|
+
with path.open(encoding="utf-8") as f:
|
|
141
|
+
for line in f:
|
|
142
|
+
if line.startswith("content_hash:"):
|
|
143
|
+
return line.strip()
|
|
144
|
+
return None
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def is_broken_content(markdown: str) -> bool:
    """Report whether fetched Markdown looks like a failed render.

    True when the stripped text is shorter than MIN_ACCEPTABLE_BODY_LENGTH
    or contains any of the ERROR_MARKERS strings.
    """
    text = markdown.strip()
    too_short = len(text) < MIN_ACCEPTABLE_BODY_LENGTH
    return too_short or any(marker in text for marker in ERROR_MARKERS)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def build_deep_crawl_config(max_depth: int, max_pages: int) -> CrawlerRunConfig:
    """Build the crawl4ai run configuration for the BFS deep crawl.

    Link-following is restricted by a filter chain (ALLOWED_DOMAIN,
    URL_SCOPE_PATTERN, text/html only) and bounded by ``max_depth`` and
    ``max_pages``. Extraction is scoped to CONTENT_SELECTOR, the cache is
    bypassed, and results are streamed.
    """
    url_filters = [
        DomainFilter(allowed_domains=[ALLOWED_DOMAIN]),
        URLPatternFilter(patterns=[URL_SCOPE_PATTERN]),
        ContentTypeFilter(allowed_types=["text/html"]),
    ]
    bfs = BFSDeepCrawlStrategy(
        max_depth=max_depth,
        filter_chain=FilterChain(url_filters),
        max_pages=max_pages,
        include_external=False,
    )
    return CrawlerRunConfig(
        deep_crawl_strategy=bfs,
        css_selector=CONTENT_SELECTOR,
        wait_for=f"css:{CONTENT_SELECTOR}",
        cache_mode=CacheMode.BYPASS,
        stream=True,
        verbose=False,
    )
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
async def refetch_single_page(crawler: AsyncWebCrawler, url: str) -> str | None:
    """Fetch ``url`` again on its own, outside the deep crawl.

    Used when the deep crawl's copy looked broken. Each retry waits
    CONTENT_RETRY_DELAY_SECONDS multiplied by the attempt number before
    fetching. Returns the page's Markdown from the first retry that succeeds
    and does not look broken, or None if none does.
    """
    retry_config = CrawlerRunConfig(
        css_selector=CONTENT_SELECTOR,
        wait_for=f"css:{CONTENT_SELECTOR}",
        cache_mode=CacheMode.BYPASS,
    )
    # NOTE(review): attempts run from 1 up to CONTENT_RETRY_ATTEMPTS - 1, so
    # this performs one re-fetch fewer than the constant's value. Presumably
    # the deep crawl's own fetch counts as the first attempt -- confirm.
    attempt = 1
    while attempt < CONTENT_RETRY_ATTEMPTS:
        await asyncio.sleep(CONTENT_RETRY_DELAY_SECONDS * attempt)
        outcome = await crawler.arun(url=url, config=retry_config)
        if outcome.success:
            text = outcome.markdown.raw_markdown
            if not is_broken_content(text):
                return text
        attempt += 1
    return None
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
async def run_crawl(max_depth: int, max_pages: int) -> RunStats:
    """Deep-crawl from SEED_URL and write every in-scope page to disk.

    Each streamed result is normalized, classified with classify_url and
    then either recorded as failed/unmatched/pruned or written to
    raw/{capability}/{slug}.md (or raw/_navigation/{capability}/{slug}.md
    for pages the navigation heuristic labels "index"). Returns the
    RunStats accumulated along the way.
    """
    stats = RunStats()
    config = build_deep_crawl_config(max_depth, max_pages)

    async with AsyncWebCrawler() as crawler:
        stream = await crawler.arun(url=SEED_URL, config=config)
        async for result in stream:
            url = normalize(result.url)

            if not result.success:
                stats.fetch_failed.append(url)
                continue

            # Counts every successful fetch, including pages that turn out
            # to be unmatched or pruned below.
            stats.discovered_total += 1
            classification = classify_url(url)
            capability = classification["capability"]

            if capability is None:
                # Known exclusions are dropped silently; anything else is
                # kept in stats.unmatched.
                if not classification["known_out_of_scope"]:
                    stats.unmatched.append(url)
                continue

            if classification["prune_reason"] is not None:
                stats.pruned.append((url, classification["prune_reason"]))
                continue

            prefix = CAPABILITIES[capability]
            slug = slugify(url, prefix)

            markdown = result.markdown.raw_markdown
            if is_broken_content(markdown):
                markdown = await refetch_single_page(crawler, url)
                if markdown is None:
                    # Never overwrite a good existing file with a broken
                    # fetch -- leave it as-is and flag for a manual retry.
                    stats.fetch_failed.append(url)
                    continue

            # Pure navigation/TOC pages (per classify_pages.py's heuristic)
            # go to raw/_navigation/ instead of raw/{capability}/, so the
            # corpus a future consultation skill reads stays high-signal.
            total_words, link_ratio = analyze_body(markdown)
            is_navigation = classify_navigation(total_words, link_ratio) == "index"

            content_path = RAW_DIR / capability / f"{slug}.md"
            navigation_path = NAVIGATION_DIR / capability / f"{slug}.md"
            out_path = navigation_path if is_navigation else content_path
            other_path = content_path if is_navigation else navigation_path
            out_path.parent.mkdir(parents=True, exist_ok=True)

            new_content = build_frontmatter(url, capability, markdown) + markdown
            # The previous hash and existence must be captured before the
            # write below replaces the file.
            old_hash_line = read_existing_hash(out_path) or read_existing_hash(other_path)
            existed_before = out_path.exists() or other_path.exists()
            out_path.write_text(new_content, encoding="utf-8")
            if other_path.exists():
                other_path.unlink()  # reclassified since last run -- drop the stale copy
            new_hash_line = read_existing_hash(out_path)

            # Status is decided by comparing the content_hash lines, not the
            # whole file (fetched_at differs on every run).
            if not existed_before:
                status = "new"
            elif old_hash_line == new_hash_line:
                status = "unchanged"
            else:
                status = "updated"
            stats.outcomes[capability].append(PageOutcome(url, capability, slug, status, is_navigation))

    return stats
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def read_url_from_file(path: Path) -> str | None:
|
|
259
|
+
with path.open(encoding="utf-8") as f:
|
|
260
|
+
for line in f:
|
|
261
|
+
if line.startswith("url:"):
|
|
262
|
+
return line.strip().removeprefix("url:").strip().strip('"')
|
|
263
|
+
return None
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def is_confirmed_gone(url: str, timeout: float = 10.0) -> bool:
    """Return True only if a direct HEAD request confirms the URL is gone.

    "Gone" means the server answered 404 or 410. Any other outcome -- a
    successful response, a different HTTP error, a timeout, a network
    failure on our side, or a URL that cannot even be turned into a
    request -- is NOT treated as confirmation, since BFS link-following can
    miss pages that are still live but just unlinked from wherever the
    crawl reached this run.
    """
    try:
        # Build the request inside the try: urllib raises ValueError for a
        # malformed or empty URL, and that must count as "could not check"
        # instead of aborting the caller's whole quarantine pass.
        request = urllib.request.Request(url, method="HEAD", headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(request, timeout=timeout):
            return False  # any successful response means it's still there
    except urllib.error.HTTPError as exc:
        return exc.code in (404, 410)
    except Exception:  # noqa: BLE001 - network errors on our side aren't proof of anything
        return False
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def quarantine_orphans(stats: RunStats) -> dict:
    """Quarantine on-disk pages this run did not touch, once confirmed gone.

    For each capability, the .md files under raw/{capability}/ and
    raw/_navigation/{capability}/ whose slug is absent from this run's
    outcomes are orphan candidates. A candidate is moved to
    raw/_removed/{capability}/ and logged to REMOVED_LOG only after
    is_confirmed_gone() confirms its URL; candidates that are still live,
    could not be checked, or have no ``url:`` line stay in place.

    If a capability's outcome count is below QUARANTINE_SAFETY_RATIO of its
    on-disk file count, the capability is skipped entirely.

    Returns a dict with "quarantined" and "still_alive" (capability -> list
    of slugs) and "skipped_capabilities" (list of capability names).
    """
    quarantined: dict[str, list[str]] = {name: [] for name in CAPABILITIES}
    still_alive: dict[str, list[str]] = {name: [] for name in CAPABILITIES}
    skipped_capabilities: list[str] = []

    for capability in CAPABILITIES:
        content_dir = RAW_DIR / capability
        navigation_dir = NAVIGATION_DIR / capability
        on_disk_paths = {p.stem: p for p in content_dir.glob("*.md")}
        on_disk_paths.update({p.stem: p for p in navigation_dir.glob("*.md")})
        if not on_disk_paths:
            continue

        # Only files untouched by this run's outcomes are orphans.
        current_slugs = {o.slug for o in stats.outcomes[capability]}
        orphans = set(on_disk_paths) - current_slugs

        previous_count = len(on_disk_paths)
        new_count = len(current_slugs)
        if previous_count > 0 and new_count < QUARANTINE_SAFETY_RATIO * previous_count:
            skipped_capabilities.append(capability)
            continue

        if not orphans:
            continue

        removed_dir = REMOVED_DIR / capability
        removed_dir.mkdir(parents=True, exist_ok=True)
        # The log's directory is not guaranteed to exist at this point;
        # create it so appending cannot fail after a file was already moved.
        REMOVED_LOG.parent.mkdir(parents=True, exist_ok=True)
        removed_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        for slug in sorted(orphans):
            src = on_disk_paths[slug]
            url = read_url_from_file(src)
            if url is None or not is_confirmed_gone(url):
                still_alive[capability].append(slug)
                continue

            dst = removed_dir / f"{slug}.md"
            shutil.move(str(src), str(dst))
            quarantined[capability].append(slug)
            with REMOVED_LOG.open("a", encoding="utf-8") as log:
                log.write(json.dumps({"capability": capability, "slug": slug, "url": url, "removed_at": removed_at}) + "\n")

    return {"quarantined": quarantined, "still_alive": still_alive, "skipped_capabilities": skipped_capabilities}
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def write_filtered_reports(stats: RunStats) -> None:
    """Write the URL manifests, prune log and run summary under FILTERED_DIR.

    Files produced (all UTF-8):
      - ``{capability}_urls.txt``: sorted URLs of that capability's outcomes.
      - ``self-hosted_pruned.txt``: one ``url<TAB># reason`` line per pruned URL.
      - ``navigation_urls.txt``: sorted URLs of every outcome flagged
        ``is_navigation``.
      - ``summary.json``: per-capability and global counters for the run.

    Every manifest is written empty (zero bytes) when it has no entries, so a
    reader splitting the file into lines never sees a phantom blank URL.

    Args:
        stats: Results of the crawl. Reads ``outcomes``, ``pruned``,
            ``discovered_total``, ``fetch_failed`` and ``unmatched``.
    """

    def write_lines(filename: str, lines: list[str]) -> None:
        # Newline-terminate only when there is content; an empty list must
        # produce an empty file, not a single blank line.
        text = "\n".join(lines) + ("\n" if lines else "")
        (FILTERED_DIR / filename).write_text(text, encoding="utf-8")

    FILTERED_DIR.mkdir(parents=True, exist_ok=True)

    for capability in CAPABILITIES:
        urls = sorted(o.url for o in stats.outcomes[capability])
        write_lines(f"{capability}_urls.txt", urls)

    pruned_lines = [f"{url}\t# {reason}" for url, reason in sorted(stats.pruned)]
    write_lines("self-hosted_pruned.txt", pruned_lines)

    navigation_urls = sorted(
        o.url for outcomes in stats.outcomes.values() for o in outcomes if o.is_navigation
    )
    write_lines("navigation_urls.txt", navigation_urls)

    # Start from every known rule label so rules that matched nothing still
    # appear with a zero count in the summary.
    prune_counts = {label: 0 for label, _ in SELF_HOSTED_PRUNE_RULES}
    for _, reason in stats.pruned:
        # Tolerate a reason that is not a known rule label instead of raising
        # KeyError and losing the whole report at the end of a long crawl.
        prune_counts[reason] = prune_counts.get(reason, 0) + 1

    summary = {
        "capabilities": {
            name: {
                "unique_urls": len(stats.outcomes[name]),
                "navigation_pages": sum(1 for o in stats.outcomes[name] if o.is_navigation),
            } for name in CAPABILITIES
        },
        "self_hosted_pruned": {"total": len(stats.pruned), "by_rule": prune_counts},
        "total_in_scope": sum(len(stats.outcomes[name]) for name in CAPABILITIES),
        "total_navigation_pages": len(navigation_urls),
        "discovered_total": stats.discovered_total,
        "fetch_failed_count": len(stats.fetch_failed),
        "unmatched_count": len(stats.unmatched),
    }
    (FILTERED_DIR / "summary.json").write_text(json.dumps(summary, indent=2) + "\n", encoding="utf-8")
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
def print_summary(stats: RunStats, quarantine_result: dict) -> None:
    """Print the end-of-run report (in Spanish) to stdout.

    Sections, in order: discovery total and fetch failures, a per-capability
    breakdown by outcome status, the self-hosted prune count, quarantined
    slugs, orphans that are still live, capabilities whose quarantine was
    skipped, and unmatched URLs.

    Args:
        stats: Results of the crawl. Reads ``discovered_total``,
            ``fetch_failed``, ``outcomes``, ``pruned`` and ``unmatched``.
        quarantine_result: Mapping with the keys ``"quarantined"`` and
            ``"still_alive"`` (capability -> list of slugs) and
            ``"skipped_capabilities"`` (list of capability names).
    """

    def show_groups(groups: dict) -> None:
        # One count line per capability that has entries, followed by its slugs.
        for capability, slugs in groups.items():
            if not slugs:
                continue
            print(f" {capability}: {len(slugs)}")
            for slug in slugs:
                print(f" - {slug}")

    print(f"\nTotal descubierto bajo /w/dxp: {stats.discovered_total}")
    if stats.fetch_failed:
        print(f"Fallos de fetch: {len(stats.fetch_failed)}")
        for failed_url in stats.fetch_failed:
            print(f" - {failed_url}")

    print("\nPor capability (nuevas / actualizadas / sin cambios / navegación):")
    in_scope_total = 0
    navigation_total = 0
    for capability, outcomes in stats.outcomes.items():
        statuses = [o.status for o in outcomes]
        new_count = statuses.count("new")
        updated_count = statuses.count("updated")
        unchanged_count = statuses.count("unchanged")
        navigation_count = len([o for o in outcomes if o.is_navigation])
        in_scope_total += len(outcomes)
        navigation_total += navigation_count
        print(
            f" {capability:12s}: {len(outcomes):4d} total "
            f"({new_count} nuevas, {updated_count} actualizadas, "
            f"{unchanged_count} sin cambios, {navigation_count} navegación)"
        )
    print(
        f"\nTotal en scope: {in_scope_total} ({navigation_total} en raw/_navigation/, "
        f"{in_scope_total - navigation_total} en raw/{{capability}}/)"
    )

    print(f"\nSelf-hosted podadas: {len(stats.pruned)}")

    quarantined = quarantine_result["quarantined"]
    quarantined_total = sum(map(len, quarantined.values()))
    print(f"\nEn cuarentena (URL verificada como caída, HTTP 404/410): {quarantined_total}")
    show_groups(quarantined)

    still_alive = quarantine_result["still_alive"]
    still_alive_total = sum(map(len, still_alive.values()))
    if still_alive_total:
        print(f"\nNo redescubiertas por el BFS pero SIGUEN VIVAS (no se tocaron, revisar cobertura del crawl): {still_alive_total}")
        show_groups(still_alive)

    skipped = quarantine_result["skipped_capabilities"]
    if skipped:
        print(
            "\nADVERTENCIA: cuarentena omitida por caída sospechosa de conteo "
            "(posible crawl incompleto), revisar a mano:"
        )
        for capability in skipped:
            print(f" - {capability}")

    if stats.unmatched:
        print(f"\nURLs raras (ni en scope ni en descartadas conocidas), {len(stats.unmatched)}:")
        for odd_url in stats.unmatched:
            print(f" {odd_url}")
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
def main() -> None:
    """Command-line entry point: crawl, quarantine, report, then verify.

    Parses ``--max-depth``, ``--max-pages`` and ``--skip-regression-check``,
    runs the crawl, quarantines orphaned files, writes the filtered reports
    and prints the summary. Unless skipped, the regression check runs last.

    Exits with status 1 when any fetch failed or the regression check
    reported something suspicious; otherwise returns normally.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--max-depth", type=int, default=DEFAULT_MAX_DEPTH)
    parser.add_argument("--max-pages", type=int, default=DEFAULT_MAX_PAGES)
    parser.add_argument(
        "--skip-regression-check",
        action="store_true",
        help="Skip the post-run check_regressions.run_check() call (e.g. no git repo yet).",
    )
    options = parser.parse_args()

    stats = asyncio.run(run_crawl(options.max_depth, options.max_pages))
    quarantine_result = quarantine_orphans(stats)
    write_filtered_reports(stats)
    print_summary(stats, quarantine_result)

    if options.skip_regression_check:
        suspicious = False
    else:
        print("\n--- Verificación de regresiones (contra el último commit) ---")
        suspicious = run_regression_check()

    # A clean run falls through; anything questionable surfaces as exit code 1.
    if not (stats.fetch_failed or suspicious):
        return
    sys.exit(1)
|
|
448
|
+
|
|
449
|
+
|
|
450
|
+
# Run the pipeline when this module is executed directly as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: liferay-docs-scraper
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Scrape learn.liferay.com/w/dxp into a local Markdown corpus (raw/{capability}/*.md) for the liferay-expert Claude Code skill.
|
|
5
|
+
Requires-Python: <3.14,>=3.10
|
|
6
|
+
Requires-Dist: crawl4ai>=0.9.0
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
|
|
9
|
+
# liferay-docs-scraper
|
|
10
|
+
|
|
11
|
+
Scrapes `learn.liferay.com/w/dxp/*` into a local, clean Markdown corpus
|
|
12
|
+
(`raw/{capability}/*.md`) and ships a Claude Code skill (`liferay-expert`)
|
|
13
|
+
that answers Liferay DXP questions by searching and citing that corpus.
|
|
14
|
+
|
|
15
|
+
**This repo does not ship Liferay's documentation.** It ships the code that
|
|
16
|
+
scrapes it, and a skill that reads whatever you scrape locally. Each user
|
|
17
|
+
builds and refreshes their own copy directly from learn.liferay.com.
|
|
18
|
+
|
|
19
|
+
## Quickstart
|
|
20
|
+
|
|
21
|
+
The recommended order for a first-time setup: scrape, then install the
|
|
22
|
+
skill, then ask questions.
|
|
23
|
+
|
|
24
|
+
**1. Build the corpus (one-time, ~30-40 min):**
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
uvx --from crawl4ai crawl4ai-setup # one-time, installs Playwright browsers
|
|
28
|
+
uvx --python 3.13 --from "git+https://github.com/mordonez/liferay-docs-scraper" liferay-docs-scraper
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Run this from anywhere -- it does not write into your current directory,
|
|
32
|
+
see "Reference: the scraper in detail" below for exactly where it goes.
|
|
33
|
+
|
|
34
|
+
**2. Install the skill into whatever project you're working in:**
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
npx skills add mordonez/liferay-docs-scraper --skill liferay-expert -a claude-code
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
You'll see:
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
◇ Installed 1 skill ───────────────────╮
|
|
44
|
+
│ │
|
|
45
|
+
│ ✓ liferay-expert (copied) │
|
|
46
|
+
│ → ./.claude/skills/liferay-expert │
|
|
47
|
+
│ │
|
|
48
|
+
├───────────────────────────────────────╯
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
**3. Ask Claude Code a Liferay question**, e.g. "how do I configure a
|
|
52
|
+
synonym set in Liferay search?" The skill finds the corpus, greps the
|
|
53
|
+
`search` capability, reads `search-administration-and-tuning-synonym-sets.md`,
|
|
54
|
+
and answers grounded in that page -- citing
|
|
55
|
+
`https://learn.liferay.com/w/dxp/search/search-administration-and-tuning/synonym-sets`
|
|
56
|
+
as the source.
|
|
57
|
+
|
|
58
|
+
The corpus is shared across every project where you install the skill (see
|
|
59
|
+
"OS default location" below), so step 1 is only ever needed once per
|
|
60
|
+
machine -- rerun it later just to refresh, not per-project.
|
|
61
|
+
|
|
62
|
+
**If you install the skill without doing step 1 first** (or its corpus goes
|
|
63
|
+
stale), it notices and tells you what to run rather than guessing or
|
|
64
|
+
answering ungrounded -- it never launches the ~30-40 min scrape on its own
|
|
65
|
+
mid-conversation. See "Step 1/2" in `skills/liferay-expert/SKILL.md` for
|
|
66
|
+
that check.
|
|
67
|
+
|
|
68
|
+
## Reference: the scraper in detail
|
|
69
|
+
|
|
70
|
+
Requires Python 3.10-3.13 (crawl4ai's Playwright dependency doesn't yet
|
|
71
|
+
support 3.14) and [uv](https://docs.astral.sh/uv/).
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
# One-time: installs the Playwright/Chromium browser crawl4ai drives
|
|
75
|
+
uvx --from crawl4ai crawl4ai-setup
|
|
76
|
+
|
|
77
|
+
# From anywhere -- the corpus does NOT go in your current directory.
|
|
78
|
+
# Not on PyPI yet, so install straight from GitHub:
|
|
79
|
+
uvx --python 3.13 --from "git+https://github.com/mordonez/liferay-docs-scraper" liferay-docs-scraper
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
This takes roughly 30-40 minutes (BFS deep crawl of ~1900 pages across 14
|
|
83
|
+
capabilities) and writes to one shared, per-user location (so it's the same
|
|
84
|
+
corpus no matter which project you're in when the skill looks for it):
|
|
85
|
+
|
|
86
|
+
| OS | Default location |
|
|
87
|
+
|---|---|
|
|
88
|
+
| macOS | `~/Library/Application Support/liferay-docs/` |
|
|
89
|
+
| Linux | `~/.local/share/liferay-docs/` (or `$XDG_DATA_HOME/liferay-docs`) |
|
|
90
|
+
| Windows | `%LOCALAPPDATA%\liferay-docs\` |
|
|
91
|
+
|
|
92
|
+
Set `LIFERAY_DOCS_DIR` to override (e.g. to keep a project-local copy instead).
|
|
93
|
+
|
|
94
|
+
Inside that directory:
|
|
95
|
+
|
|
96
|
+
- `raw/{capability}/*.md` — the corpus, one file per page
|
|
97
|
+
- `raw/_navigation/{capability}/*.md` — pure TOC pages, kept but deprioritized
|
|
98
|
+
- `raw/_removed/{capability}/*.md` — pages confirmed gone from the live site
|
|
99
|
+
- `reports/filtered/` — URL manifests, self-hosted prune log, run summary
|
|
100
|
+
|
|
101
|
+
Re-run it anytime (weekly recommended) to refresh: it starts from zero every
|
|
102
|
+
time, so it naturally picks up new pages, updates changed ones, and
|
|
103
|
+
quarantines (never deletes) removed ones. If that directory is (or becomes)
|
|
104
|
+
a git repo -- worth doing once, purely as a local diffing tool, nothing needs
|
|
105
|
+
pushing anywhere -- it also runs `check-regressions` automatically afterward
|
|
106
|
+
and flags any file that shrank by more than half or grew more than 3x versus
|
|
107
|
+
the last commit (signals of a broken fetch); see
|
|
108
|
+
`docs/adr/0001-crawl4ai-based-corpus-pipeline.md` for why that check exists.
|
|
109
|
+
|
|
110
|
+
## Reference: the skill in detail
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
npx skills add mordonez/liferay-docs-scraper --skill liferay-expert
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Or just copy `skills/liferay-expert/SKILL.md` into `.claude/skills/liferay-expert/`
|
|
117
|
+
in any project. Claude Code picks it up automatically; the skill itself
|
|
118
|
+
resolves `$LIFERAY_DOCS_DIR` (or the OS default above) to find the corpus,
|
|
119
|
+
so it works the same regardless of which project you installed it into.
|
|
120
|
+
|
|
121
|
+
## Why no bundled docs, no embeddings, no vector DB
|
|
122
|
+
|
|
123
|
+
See `docs/adr/` for the full reasoning. Short version: the corpus is
|
|
124
|
+
Liferay's copyrighted documentation text -- distributing the *tool* that
|
|
125
|
+
scrapes public pages is a different, much lower-risk thing than a third
|
|
126
|
+
party redistributing that text at scale. Plain grep + Read over ~1800
|
|
127
|
+
well-organized Markdown files is fast enough that no search index is needed;
|
|
128
|
+
add one later if that stops being true.
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
liferay_docs_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
liferay_docs_scraper/check_regressions.py,sha256=_7k-5wNuxrhRJTbggUufwhcH8ECXByjUUXm_GELwHQ8,4114
|
|
3
|
+
liferay_docs_scraper/classify_pages.py,sha256=yXo6eKLlsFpoM1CKzcZUNEWBN8RcQ1Nc_2qNgqnkLLs,1448
|
|
4
|
+
liferay_docs_scraper/filter_urls.py,sha256=kHrskK2xyPO9tKCrjJUepLEN5k5xunG9I6iFg2J3fxU,5615
|
|
5
|
+
liferay_docs_scraper/pipeline.py,sha256=pnivtmSXP7oGCjV7Ta52ZEjd1655tJMoveTKU7GcWqQ,19959
|
|
6
|
+
liferay_docs_scraper-0.1.0.dist-info/METADATA,sha256=fSSrsFCFYg_Gbs8GEXcJfxR3kZaWDyqM0AFKUEDpzOc,5522
|
|
7
|
+
liferay_docs_scraper-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
8
|
+
liferay_docs_scraper-0.1.0.dist-info/entry_points.txt,sha256=TFz1Q5Aj4QrlDOMZFWCHCN4qKDTqEG985t0JhoLYJVU,140
|
|
9
|
+
liferay_docs_scraper-0.1.0.dist-info/RECORD,,
|