liferay-docs-scraper 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,105 @@
1
+ #!/usr/bin/env python3
2
+ """Flag suspicious content changes in raw/ after a crawl4ai refresh, using git.
3
+
4
+ Compares the working tree against a given git ref (default: HEAD, i.e. the
5
+ last commit) for every raw/**/*.md file:
6
+ - Shrank a lot (body text, not counting frontmatter): the signature of a
7
+ broken/partial fetch overwriting a good page.
8
+ - Grew a lot: the signature of CONTENT_SELECTOR failing to match and
9
+ crawl4ai falling back to the whole page (breadcrumb/nav/footer chrome
10
+ and all) instead of raising an error.
11
+
12
+ Operates on filter_urls.resolve_docs_dir() (the same shared corpus location
13
+ the scraper writes to and the skill reads from), not the current directory.
14
+
15
+ Usage:
16
+ uvx --from liferay-docs-scraper check-regressions [--ref HEAD] [--shrink-threshold 0.5] [--growth-threshold 3.0]
17
+ """
18
+
19
+ import argparse
20
+ import subprocess
21
+
22
+ from .filter_urls import resolve_docs_dir
23
+
24
+ ROOT = resolve_docs_dir()
25
+
26
+
27
def body_len(text: str) -> int:
    """Return the length of ``text`` excluding a leading YAML frontmatter block.

    Frontmatter is only recognised when the text *starts* with a ``---`` line
    and a closing ``---`` line follows. Text without such a leading block
    (including Markdown that merely contains horizontal rules) is measured in
    full, as is text whose frontmatter is never closed.
    """
    delimiter = "---\n"
    # Without this guard, two "---" rules in a frontmatter-less page would be
    # mistaken for frontmatter delimiters and most of the body discarded.
    if not text.startswith(delimiter):
        return len(text)
    parts = text.split(delimiter, 2)
    body = parts[2] if len(parts) == 3 else text
    return len(body)
32
+
33
+
34
def git_show(ref: str, path: str) -> str | None:
    """Return the text of ``path`` as stored at ``ref`` (``git show ref:path``).

    The command runs with ROOT as working directory. Returns None whenever
    git exits with a non-zero status.
    """
    command = ["git", "show", f"{ref}:{path}"]
    completed = subprocess.run(command, cwd=ROOT, capture_output=True, text=True)
    return completed.stdout if completed.returncode == 0 else None
41
+
42
+
43
def changed_raw_files(ref: str) -> list[str]:
    """List the ``.md`` paths under ``raw/`` that ``git diff --name-only`` reports against ``ref``.

    Runs in ROOT; raises subprocess.CalledProcessError if git exits non-zero.
    """
    command = ["git", "diff", "--name-only", ref, "--", "raw/"]
    completed = subprocess.run(command, cwd=ROOT, capture_output=True, text=True, check=True)
    markdown_paths = []
    for name in completed.stdout.splitlines():
        if name.endswith(".md"):
            markdown_paths.append(name)
    return markdown_paths
49
+
50
+
51
def run_check(ref: str = "HEAD", shrink_threshold: float = 0.5, growth_threshold: float = 3.0) -> bool:
    """Print the regression report; return True iff something looked suspicious.

    For every changed raw/**/*.md file that exists both at ``ref`` and in the
    working tree, compares body length (frontmatter excluded) and flags files
    whose new/old ratio is below ``shrink_threshold`` or above
    ``growth_threshold``.
    """
    changed = changed_raw_files(ref)
    print(f"Archivos .md cambiados en raw/ vs {ref}: {len(changed)}")

    # Each entry is (rel_path, old_len, new_len, ratio).
    shrunk, grew = [], []
    for rel_path in changed:
        full_path = ROOT / rel_path
        old_text = git_show(ref, rel_path)
        if old_text is None:
            continue  # new file, nothing to compare
        if not full_path.exists():
            continue  # deleted/moved (e.g. quarantined), handled separately

        new_text = full_path.read_text(encoding="utf-8")
        old_len = body_len(old_text)
        new_len = body_len(new_text)
        if old_len == 0:
            continue  # no baseline to compute a ratio against
        ratio = new_len / old_len
        if ratio < shrink_threshold:
            shrunk.append((rel_path, old_len, new_len, ratio))
        elif ratio > growth_threshold:
            grew.append((rel_path, old_len, new_len, ratio))

    if shrunk:
        print(f"\nSOSPECHOSOS ({len(shrunk)}) -- perdieron más del "
              f"{(1 - shrink_threshold) * 100:.0f}% del contenido:")
        # Worst shrinkage (smallest ratio) first.
        for rel_path, old_len, new_len, ratio in sorted(shrunk, key=lambda x: x[3]):
            print(f" {rel_path}: {old_len} -> {new_len} chars ({ratio:.0%})")
    else:
        print("\nNinguno por debajo del umbral de encogimiento -- sin señales de pérdida de contenido.")

    if grew:
        print(f"\nSOSPECHOSOS ({len(grew)}) -- crecieron más de {growth_threshold:.0f}x "
              f"(posible fallback a la página completa sin selector):")
        # Largest growth first.
        for rel_path, old_len, new_len, ratio in sorted(grew, key=lambda x: -x[3]):
            print(f" {rel_path}: {old_len} -> {new_len} chars ({ratio:.1f}x)")

    return bool(shrunk or grew)
91
+
92
+
93
def main() -> None:
    """CLI entry point: parse --ref and the two thresholds, then print the report.

    The boolean returned by run_check() is not used here.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--ref", default="HEAD")
    cli.add_argument(
        "--shrink-threshold",
        type=float,
        default=0.5,
        help="Flag files whose body shrank below this fraction of the original.",
    )
    cli.add_argument(
        "--growth-threshold",
        type=float,
        default=3.0,
        help="Flag files whose body grew beyond this multiple of the original.",
    )
    options = cli.parse_args()
    run_check(options.ref, options.shrink_threshold, options.growth_threshold)
102
+
103
+
104
+ if __name__ == "__main__":
105
+ main()
@@ -0,0 +1,41 @@
1
+ #!/usr/bin/env python3
2
+ """Navigation-vs-content heuristic, shared with crawl4ai_pipeline.py.
3
+
4
+ An "index"/navigation page is one whose body (frontmatter stripped) is
5
+ short and consists mostly of links to subpages -- no substantial technical
6
+ content of its own. Everything else is "content".
7
+
8
+ Heuristic (no API calls, pure text analysis):
9
+ - Strip Markdown link syntax down to visible text (`[text](url)` -> `text`)
10
+ and strip heading/emphasis markup (`#`, `*`, `_`, backticks).
11
+ - total_words: word count of that visible text.
12
+ - link_ratio: fraction of those words that come from inside a Markdown
13
+ link's link-text span.
14
+ - "index" iff total_words < INDEX_MAX_WORDS and link_ratio >= INDEX_MIN_LINK_RATIO.
15
+ """
16
+
17
+ import re
18
+
19
+ INDEX_MAX_WORDS = 150
20
+ INDEX_MIN_LINK_RATIO = 0.5
21
+
22
+ LINK_RE = re.compile(r"\[([^\]]*)\]\(([^)]*)\)")
23
+ MARKUP_RE = re.compile(r"[#*_`\\]")
24
+
25
+
26
def analyze_body(body: str) -> tuple[int, float]:
    """Return ``(total_words, link_ratio)`` for a Markdown body.

    ``total_words`` counts whitespace-separated words after links are reduced
    to their text and MARKUP_RE characters are removed; ``link_ratio`` is the
    number of words found inside link text divided by ``total_words``
    (0.0 when there are no words).
    """
    link_words = 0
    for match in LINK_RE.finditer(body):
        link_words += len(match.group(1).split())

    def _link_text(match):
        return match.group(1)

    plain = MARKUP_RE.sub("", LINK_RE.sub(_link_text, body))
    total_words = len(plain.split())

    if not total_words:
        return total_words, 0.0
    return total_words, link_words / total_words
36
+
37
+
38
def classify(total_words: int, link_ratio: float) -> str:
    """Return "index" for a short, link-dominated page, otherwise "content"."""
    is_short = total_words < INDEX_MAX_WORDS
    return "index" if is_short and link_ratio >= INDEX_MIN_LINK_RATIO else "content"
@@ -0,0 +1,151 @@
1
+ #!/usr/bin/env python3
2
+ """Shared URL/capability utilities for the crawl4ai_pipeline.py corpus.
3
+
4
+ Capability classification (matching learn.liferay.com/w/dxp URLs to one of
5
+ the 14 capabilities listed on /w/dxp/index, plus the self-hosted prune
6
+ rules), the URL->filename/frontmatter helpers used when writing pages to
7
+ raw/{capability}/*.md, and resolve_docs_dir() -- the one place that decides
8
+ where that raw/ corpus actually lives on disk.
9
+ """
10
+
11
+ import hashlib
12
+ import os
13
+ import sys
14
+ from datetime import datetime, timezone
15
+ from pathlib import Path
16
+ from urllib.parse import urlparse
17
+
18
+ CAPABILITIES = {
19
+ "cloud": "/w/dxp/cloud",
20
+ "search": "/w/dxp/search",
21
+ "self-hosted": "/w/dxp/self-hosted-installation-and-upgrades",
22
+ "sites": "/w/dxp/sites",
23
+ "security": "/w/dxp/security-and-administration",
24
+ "development": "/w/dxp/development",
25
+ "commerce": "/w/dxp/commerce",
26
+ "personalization": "/w/dxp/personalization",
27
+ "low-code": "/w/dxp/low-code",
28
+ "content-management-system": "/w/dxp/content-management-system",
29
+ "digital-asset-management": "/w/dxp/digital-asset-management",
30
+ "integration": "/w/dxp/integration",
31
+ "ai": "/w/dxp/ai",
32
+ "getting-started": "/w/dxp/getting-started",
33
+ }
34
+
35
+ # All 14 capabilities listed on https://learn.liferay.com/w/dxp/index are in
36
+ # scope now; nothing under /w/dxp is deliberately excluded anymore.
37
+ OUT_OF_SCOPE_PREFIXES: list[str] = []
38
+
39
+ # (rule label, substring whose presence -- followed by more path -- excludes the URL)
40
+ SELF_HOSTED_PRUNE_RULES = [
41
+ (
42
+ "deprecations-and-breaking-changes-reference subpage",
43
+ "/upgrading-liferay/deprecations-and-breaking-changes-reference/",
44
+ ),
45
+ (
46
+ "installing-earlier-liferay-versions-on-application-servers subpage",
47
+ "/installing-earlier-liferay-versions-on-application-servers/",
48
+ ),
49
+ (
50
+ "cne-aws-ready subpage",
51
+ "/cloud-native-experience/cne-cloud-provider-ready/cne-aws-ready/",
52
+ ),
53
+ (
54
+ "cne-gcp-ready subpage",
55
+ "/cloud-native-experience/cne-cloud-provider-ready/cne-gcp-ready/",
56
+ ),
57
+ ]
58
+
59
+
60
def normalize(url: str) -> str:
    """Drop one trailing slash from the URL's path (the bare root "/" is kept); nothing else changes."""
    parts = urlparse(url)
    has_trailing_slash = parts.path.endswith("/") and parts.path != "/"
    trimmed = parts.path[:-1] if has_trailing_slash else parts.path
    return parts._replace(path=trimmed).geturl()
67
+
68
+
69
def matches_prefix(path: str, prefix: str) -> bool:
    """True when ``path`` is exactly ``prefix`` or lies beneath it (``prefix`` followed by "/")."""
    if path == prefix:
        return True
    return path.startswith(f"{prefix}/")
71
+
72
+
73
def prune_reason(path: str) -> str | None:
    """Return the label of the first SELF_HOSTED_PRUNE_RULES entry whose substring occurs in ``path``, else None."""
    return next(
        (label for label, needle in SELF_HOSTED_PRUNE_RULES if needle in path),
        None,
    )
78
+
79
+
80
def classify_url(url: str) -> dict:
    """Classify one (already-normalized) URL for the capability pipeline.

    The returned dict has three keys:
      - capability: name of the first CAPABILITIES entry whose prefix matches
        the URL path, or None when nothing matches
      - prune_reason: label of the matching self-hosted prune rule; only
        looked up for the "self-hosted" capability, otherwise None
      - known_out_of_scope: True only for an unmatched URL whose path falls
        under one of OUT_OF_SCOPE_PREFIXES (as opposed to an unrecognized URL
        worth flagging for manual review)
    """
    path = urlparse(url).path
    capability = next(
        (name for name, prefix in CAPABILITIES.items() if matches_prefix(path, prefix)),
        None,
    )

    if capability is None:
        excluded = any(matches_prefix(path, prefix) for prefix in OUT_OF_SCOPE_PREFIXES)
        return {"capability": None, "prune_reason": None, "known_out_of_scope": excluded}

    reason = None
    if capability == "self-hosted":
        reason = prune_reason(path)
    return {"capability": capability, "prune_reason": reason, "known_out_of_scope": False}
103
+
104
+
105
def slugify(url: str, prefix: str) -> str:
    """Turn the URL path after the first ``len(prefix)`` characters into a flat filename stem.

    Slashes inside the remainder become "-"; an empty remainder yields "index".
    """
    tail = urlparse(url).path[len(prefix):].strip("/")
    return tail.replace("/", "-") if tail else "index"
112
+
113
+
114
def build_frontmatter(url: str, capability: str, markdown: str) -> str:
    """Build the YAML frontmatter block (ending in a newline) that precedes a page's Markdown.

    Records the URL, the capability, the current UTC time and the SHA-256 of
    the UTF-8 encoded ``markdown``.
    """
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    digest = hashlib.sha256(markdown.encode("utf-8")).hexdigest()
    return (
        "---\n"
        f'url: "{url}"\n'
        f"capability: {capability}\n"
        f'fetched_at: "{timestamp}"\n'
        f'content_hash: "sha256:{digest}"\n'
        "---\n"
    )
127
+
128
+
129
+ def _default_data_dir() -> Path:
130
+ """Per-user app-data directory, one convention per OS, so the corpus
131
+ lives in the same predictable place regardless of which project you
132
+ happen to be running the scraper or the skill from."""
133
+ if sys.platform == "win32":
134
+ base = os.environ.get("LOCALAPPDATA") or str(Path.home() / "AppData" / "Local")
135
+ return Path(base) / "liferay-docs"
136
+ if sys.platform == "darwin":
137
+ return Path.home() / "Library" / "Application Support" / "liferay-docs"
138
+ # Linux and other Unix-likes: XDG Base Directory spec
139
+ base = os.environ.get("XDG_DATA_HOME") or str(Path.home() / ".local" / "share")
140
+ return Path(base) / "liferay-docs"
141
+
142
+
143
def resolve_docs_dir() -> Path:
    """Location of the local corpus (raw/, reports/filtered/).

    A non-empty ``$LIFERAY_DOCS_DIR`` wins (with ``~`` expanded); otherwise
    the per-OS default from _default_data_dir() is used.
    """
    configured = os.environ.get("LIFERAY_DOCS_DIR")
    return Path(configured).expanduser() if configured else _default_data_dir()
@@ -0,0 +1,451 @@
1
+ #!/usr/bin/env python3
2
+ """Weekly from-scratch refresh of the learn.liferay.com/w/dxp corpus, crawl4ai-only.
3
+
4
+ Builds raw/{capability}/*.md under filter_urls.resolve_docs_dir(): the
5
+ $LIFERAY_DOCS_DIR directory if that env var is set, otherwise one shared,
6
+ OS-appropriate per-user data directory (e.g. ~/Library/Application
7
+ Support/liferay-docs on macOS, %LOCALAPPDATA%\\liferay-docs on Windows,
8
+ ~/.local/share/liferay-docs on Linux). Deliberately NOT the current working
9
+ directory -- the liferay-expert skill looks in that same shared location
10
+ regardless of which project you're in when you ask a question, so you
11
+ don't end up with a separate copy of the corpus per project.
12
+
13
+ A single crawl4ai deep crawl handles both URL discovery and content
14
+ extraction:
15
+
16
+ - A BFS deep crawl starts at /w/dxp/index and follows every internal link
17
+ under /w/dxp/*. crawl4ai extracts links from the FULL page regardless of
18
+ css_selector, so this single crawl gets us both (a) the complete current
19
+ set of URLs on the site and (b) each page's Markdown scoped to
20
+ CONTENT_SELECTOR, in one visit per page. That selector (see below) is
21
+ precise enough that no further chrome-stripping is needed -- what
22
+ crawl4ai returns is already the final page content.
23
+ - Each page is classified with filter_urls.py's classify_url (capability
24
+ prefixes + self-hosted prune rules) and, if in scope, written to
25
+ raw/{capability}/{slug}.md -- unless classify_pages.py's heuristic
26
+ (reused here, not duplicated) says it's a pure navigation/TOC page with
27
+ no substantial content of its own, in which case it goes to
28
+ raw/_navigation/{capability}/{slug}.md instead. This keeps
29
+ raw/{capability}/ as signal for the liferay-expert skill, while still
30
+ preserving the navigation pages (not deleting them) in case they're
31
+ useful later.
32
+ - Because every run starts from zero, a page that existed last run but
33
+ isn't found this run (removed from the site, or now out of scope/pruned)
34
+ is a *candidate* for quarantine -- but BFS link-following can miss a page
35
+ that's still live (no longer linked from anywhere our crawl reached,
36
+ while still resolving directly), so before quarantining anything we do a
37
+ direct HTTP check on each candidate's own URL. Only a confirmed non-200
38
+ gets quarantined (moved to raw/_removed/{capability}/{slug}.md, logged to
39
+ reports/filtered/removed_log.jsonl); anything that still responds, or
40
+ that we simply couldn't reach to check, is left in place and flagged for
41
+ manual review instead. If a capability's discovered count drops
42
+ implausibly (crawl likely failed partway), quarantine for that capability
43
+ is skipped entirely and flagged for manual review instead of trusting a
44
+ possibly-broken run.
45
+ - reports/filtered/{capability}_urls.txt, self-hosted_pruned.txt and
46
+ summary.json are regenerated from this run's live results, so they always
47
+ reflect the current corpus (same format filter_urls.py produces).
48
+ - Once everything above is written, check_regressions.py's run_check()
49
+ runs automatically against the last git commit, if resolve_docs_dir()
50
+ is itself a git repo (worth `git init`-ing once, purely as a local
51
+ diffing tool -- nothing needs to be pushed anywhere). Skipped otherwise,
52
+ or with --skip-regression-check.
53
+
54
+ Setup and run (see README.md for the full explanation):
55
+ uvx --from crawl4ai crawl4ai-setup # one-time: installs Playwright browsers
56
+ uvx --python 3.13 --from "git+https://github.com/mordonez/liferay-docs-scraper" liferay-docs-scraper
57
+ # (not on PyPI yet -- once published, just `uvx liferay-docs-scraper`)
58
+ """
59
+
60
+ import argparse
61
+ import asyncio
62
+ import json
63
+ import shutil
64
+ import sys
65
+ import urllib.error
66
+ import urllib.request
67
+ from dataclasses import dataclass, field
68
+ from datetime import datetime, timezone
69
+ from pathlib import Path
70
+
71
+ from .check_regressions import run_check as run_regression_check
72
+ from .classify_pages import analyze_body, classify as classify_navigation
73
+ from .filter_urls import (
74
+ CAPABILITIES,
75
+ SELF_HOSTED_PRUNE_RULES,
76
+ build_frontmatter,
77
+ classify_url,
78
+ normalize,
79
+ resolve_docs_dir,
80
+ slugify,
81
+ )
82
+
83
+ from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
84
+ from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, ContentTypeFilter, DomainFilter, FilterChain, URLPatternFilter
85
+
86
+ ROOT = resolve_docs_dir()
87
+ RAW_DIR = ROOT / "raw"
88
+ REMOVED_DIR = RAW_DIR / "_removed"
89
+ NAVIGATION_DIR = RAW_DIR / "_navigation"
90
+ FILTERED_DIR = ROOT / "reports" / "filtered"
91
+ REMOVED_LOG = FILTERED_DIR / "removed_log.jsonl"
92
+
93
+ SEED_URL = "https://learn.liferay.com/w/dxp/index"
94
+ ALLOWED_DOMAIN = "learn.liferay.com"
95
+ URL_SCOPE_PATTERN = "*/w/dxp*"
96
+ # learn.liferay.com's article template puts the breadcrumb, sidebar TOC, and
97
+ # the actual article body all inside #main-content, with the maintenance
98
+ # banner and global footer outside it. .learn-article-content is scoped
99
+ # tighter still: just the title, body, and resource-type tags -- no
100
+ # breadcrumb/TOC/"Submit Feedback" chrome to strip afterward.
101
+ CONTENT_SELECTOR = ".learn-article-content"
102
+
103
+ DEFAULT_MAX_DEPTH = 12
104
+ DEFAULT_MAX_PAGES = 3000
105
+ # If a capability's freshly discovered URL count falls below this fraction of
106
+ # its previous count, treat the run as suspect and skip quarantining orphans
107
+ # for that capability rather than mass-deleting good content on a bad crawl.
108
+ QUARANTINE_SAFETY_RATIO = 0.5
109
+
110
+ # Some fetches render a client-side error banner instead of the real page
111
+ # (transient rendering/server hiccup) -- crawl4ai still reports these as a
112
+ # "successful" fetch, so we have to catch it ourselves and retry.
113
+ ERROR_MARKERS = ["An unexpected error occurred."]
114
+ MIN_ACCEPTABLE_BODY_LENGTH = 30
115
+ CONTENT_RETRY_ATTEMPTS = 3
116
+ CONTENT_RETRY_DELAY_SECONDS = 3.0
117
+
118
+
119
@dataclass
class PageOutcome:
    """What happened to one in-scope page during a crawl run."""

    url: str  # normalized page URL
    capability: str  # key of CAPABILITIES the URL was matched to
    slug: str  # filename stem produced by slugify()
    status: str  # "new" | "updated" | "unchanged"
    is_navigation: bool = False  # True when the page was written under raw/_navigation/
126
+
127
+
128
@dataclass
class RunStats:
    """Counters and per-URL results accumulated over one crawl run."""

    discovered_total: int = 0  # successfully fetched results, in scope or not
    fetch_failed: list[str] = field(default_factory=list)  # URLs that failed or stayed broken after retries
    unmatched: list[str] = field(default_factory=list)  # URLs matching no capability and no known exclusion
    pruned: list[tuple] = field(default_factory=list)  # (url, prune rule label) pairs
    # capability name -> list of PageOutcome; every capability starts with an empty list
    outcomes: dict = field(default_factory=lambda: {name: [] for name in CAPABILITIES})
135
+
136
+
137
+ def read_existing_hash(path: Path) -> str | None:
138
+ if not path.exists():
139
+ return None
140
+ with path.open(encoding="utf-8") as f:
141
+ for line in f:
142
+ if line.startswith("content_hash:"):
143
+ return line.strip()
144
+ return None
145
+
146
+
147
def is_broken_content(markdown: str) -> bool:
    """True when the stripped Markdown is shorter than MIN_ACCEPTABLE_BODY_LENGTH or contains any of ERROR_MARKERS."""
    text = markdown.strip()
    if len(text) >= MIN_ACCEPTABLE_BODY_LENGTH:
        for marker in ERROR_MARKERS:
            if marker in text:
                return True
        return False
    return True
153
+
154
+
155
def build_deep_crawl_config(max_depth: int, max_pages: int) -> CrawlerRunConfig:
    """Build the run config for the BFS deep crawl.

    URLs are filtered by ALLOWED_DOMAIN, URL_SCOPE_PATTERN and the text/html
    content type; page content is scoped to CONTENT_SELECTOR, the cache is
    bypassed and results are streamed.
    """
    scope_filters = [
        DomainFilter(allowed_domains=[ALLOWED_DOMAIN]),
        URLPatternFilter(patterns=[URL_SCOPE_PATTERN]),
        ContentTypeFilter(allowed_types=["text/html"]),
    ]
    bfs = BFSDeepCrawlStrategy(
        max_depth=max_depth,
        filter_chain=FilterChain(scope_filters),
        max_pages=max_pages,
        include_external=False,
    )
    return CrawlerRunConfig(
        deep_crawl_strategy=bfs,
        css_selector=CONTENT_SELECTOR,
        wait_for=f"css:{CONTENT_SELECTOR}",
        cache_mode=CacheMode.BYPASS,
        stream=True,
        verbose=False,
    )
172
+
173
+
174
async def refetch_single_page(crawler: AsyncWebCrawler, url: str) -> str | None:
    """Fetch one URL again, outside the deep crawl, after its first copy looked broken.

    Makes ``CONTENT_RETRY_ATTEMPTS - 1`` attempts, sleeping
    ``CONTENT_RETRY_DELAY_SECONDS * attempt`` before each one. Returns the
    first Markdown that is not broken, or None if every attempt is.
    """
    retry_config = CrawlerRunConfig(
        css_selector=CONTENT_SELECTOR,
        wait_for=f"css:{CONTENT_SELECTOR}",
        cache_mode=CacheMode.BYPASS,
    )
    attempt = 1
    while attempt < CONTENT_RETRY_ATTEMPTS:
        await asyncio.sleep(CONTENT_RETRY_DELAY_SECONDS * attempt)
        outcome = await crawler.arun(url=url, config=retry_config)
        if outcome.success:
            text = outcome.markdown.raw_markdown
            if not is_broken_content(text):
                return text
        attempt += 1
    return None
187
+
188
+
189
async def run_crawl(max_depth: int, max_pages: int) -> RunStats:
    """Run the deep crawl from SEED_URL and write every in-scope page to disk.

    Each streamed result is normalized, classified, and then either recorded
    (failed / unmatched / pruned) or written to raw/{capability}/ or
    raw/_navigation/{capability}/. Returns the accumulated RunStats.
    """
    stats = RunStats()
    config = build_deep_crawl_config(max_depth, max_pages)

    async with AsyncWebCrawler() as crawler:
        stream = await crawler.arun(url=SEED_URL, config=config)
        async for result in stream:
            url = normalize(result.url)

            if not result.success:
                stats.fetch_failed.append(url)
                continue

            stats.discovered_total += 1
            classification = classify_url(url)
            capability = classification["capability"]

            if capability is None:
                # Only URLs that are not a known exclusion are worth flagging.
                if not classification["known_out_of_scope"]:
                    stats.unmatched.append(url)
                continue

            if classification["prune_reason"] is not None:
                stats.pruned.append((url, classification["prune_reason"]))
                continue

            prefix = CAPABILITIES[capability]
            slug = slugify(url, prefix)

            markdown = result.markdown.raw_markdown
            if is_broken_content(markdown):
                markdown = await refetch_single_page(crawler, url)
                if markdown is None:
                    # Never overwrite a good existing file with a broken
                    # fetch -- leave it as-is and flag for a manual retry.
                    stats.fetch_failed.append(url)
                    continue

            # Pure navigation/TOC pages (per classify_pages.py's heuristic)
            # go to raw/_navigation/ instead of raw/{capability}/, so the
            # corpus a future consultation skill reads stays high-signal.
            total_words, link_ratio = analyze_body(markdown)
            is_navigation = classify_navigation(total_words, link_ratio) == "index"

            content_path = RAW_DIR / capability / f"{slug}.md"
            navigation_path = NAVIGATION_DIR / capability / f"{slug}.md"
            out_path = navigation_path if is_navigation else content_path
            other_path = content_path if is_navigation else navigation_path
            out_path.parent.mkdir(parents=True, exist_ok=True)

            new_content = build_frontmatter(url, capability, markdown) + markdown
            # Previous hash and existence must be captured before the write below.
            old_hash_line = read_existing_hash(out_path) or read_existing_hash(other_path)
            existed_before = out_path.exists() or other_path.exists()
            out_path.write_text(new_content, encoding="utf-8")
            if other_path.exists():
                other_path.unlink()  # reclassified since last run -- drop the stale copy
            new_hash_line = read_existing_hash(out_path)

            if not existed_before:
                status = "new"
            elif old_hash_line == new_hash_line:
                status = "unchanged"
            else:
                status = "updated"
            stats.outcomes[capability].append(PageOutcome(url, capability, slug, status, is_navigation))

    return stats
256
+
257
+
258
+ def read_url_from_file(path: Path) -> str | None:
259
+ with path.open(encoding="utf-8") as f:
260
+ for line in f:
261
+ if line.startswith("url:"):
262
+ return line.strip().removeprefix("url:").strip().strip('"')
263
+ return None
264
+
265
+
266
def is_confirmed_gone(url: str, timeout: float = 10.0) -> bool:
    """Return True only if a direct HEAD request to ``url`` answers 404 or 410.

    Any other outcome -- a successful response, a different HTTP error, a
    timeout, a network failure, or a URL that cannot even be turned into a
    request -- is NOT treated as confirmation, since BFS link-following can
    miss pages that are still live but just unlinked from wherever the crawl
    reached this run.

    Args:
        url: Absolute URL to probe.
        timeout: Seconds to wait for the response.
    """
    try:
        # Built inside the try: Request() raises ValueError for a URL without
        # a scheme (e.g. a damaged frontmatter line), which must count as
        # "not confirmed" instead of aborting the whole quarantine pass.
        request = urllib.request.Request(url, method="HEAD", headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(request, timeout=timeout):
            return False  # any successful response means it's still there
    except urllib.error.HTTPError as exc:
        return exc.code in (404, 410)
    except Exception:  # noqa: BLE001 - network errors on our side aren't proof of anything
        return False
280
+
281
+
282
def quarantine_orphans(stats: RunStats) -> dict:
    """Move files this run didn't touch to raw/_removed/{capability}/ once their URL is confirmed gone.

    Looks at raw/{capability}/*.md and raw/_navigation/{capability}/*.md.
    A file whose slug is absent from this run's outcomes is an orphan; it is
    only moved after is_confirmed_gone() confirms its URL. Orphans that are
    still live, that could not be checked, or that have no ``url:`` line are
    left in place and reported so a human can look into the crawl's coverage
    gap. A capability whose outcome count is below QUARANTINE_SAFETY_RATIO of
    its on-disk file count is skipped entirely.

    Returns:
        A dict with ``quarantined`` and ``still_alive`` (capability -> list of
        slugs) and ``skipped_capabilities`` (list of capability names).
    """
    quarantined: dict[str, list[str]] = {name: [] for name in CAPABILITIES}
    still_alive: dict[str, list[str]] = {name: [] for name in CAPABILITIES}
    skipped_capabilities: list[str] = []

    for capability in CAPABILITIES:
        content_dir = RAW_DIR / capability
        navigation_dir = NAVIGATION_DIR / capability
        on_disk_paths = {p.stem: p for p in content_dir.glob("*.md")}
        on_disk_paths.update({p.stem: p for p in navigation_dir.glob("*.md")})
        if not on_disk_paths:
            continue

        # Only files untouched by this run's outcomes are orphans.
        current_slugs = {o.slug for o in stats.outcomes[capability]}
        orphans = set(on_disk_paths) - current_slugs

        previous_count = len(on_disk_paths)
        new_count = len(current_slugs)
        if previous_count > 0 and new_count < QUARANTINE_SAFETY_RATIO * previous_count:
            skipped_capabilities.append(capability)
            continue

        if not orphans:
            continue

        removed_dir = REMOVED_DIR / capability
        removed_dir.mkdir(parents=True, exist_ok=True)
        # The log lives in reports/filtered/, which nothing is guaranteed to
        # have created yet when this function runs; without this, the first
        # quarantine would fail with FileNotFoundError after the file move.
        REMOVED_LOG.parent.mkdir(parents=True, exist_ok=True)
        removed_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        for slug in sorted(orphans):
            src = on_disk_paths[slug]
            url = read_url_from_file(src)
            if url is None or not is_confirmed_gone(url):
                still_alive[capability].append(slug)
                continue

            dst = removed_dir / f"{slug}.md"
            shutil.move(str(src), str(dst))
            quarantined[capability].append(slug)
            with REMOVED_LOG.open("a", encoding="utf-8") as log:
                log.write(json.dumps({"capability": capability, "slug": slug, "url": url, "removed_at": removed_at}) + "\n")

    return {"quarantined": quarantined, "still_alive": still_alive, "skipped_capabilities": skipped_capabilities}
331
+
332
+
333
def write_filtered_reports(stats: RunStats) -> None:
    """Regenerate the files under reports/filtered/ from this run's results.

    Writes one {capability}_urls.txt per capability, self-hosted_pruned.txt,
    navigation_urls.txt and summary.json, creating the directory if needed.
    """
    FILTERED_DIR.mkdir(parents=True, exist_ok=True)

    # One sorted URL list per capability (written even when the list is empty).
    for capability in CAPABILITIES:
        urls = sorted(o.url for o in stats.outcomes[capability])
        (FILTERED_DIR / f"{capability}_urls.txt").write_text("\n".join(urls) + "\n", encoding="utf-8")

    # Pruned URLs, each followed by a tab and the rule label as a comment.
    pruned_lines = [f"{url}\t# {reason}" for url, reason in sorted(stats.pruned)]
    (FILTERED_DIR / "self-hosted_pruned.txt").write_text(
        "\n".join(pruned_lines) + ("\n" if pruned_lines else ""), encoding="utf-8",
    )

    # URLs of pages classified as navigation, across all capabilities.
    navigation_urls = sorted(
        o.url for outcomes in stats.outcomes.values() for o in outcomes if o.is_navigation
    )
    (FILTERED_DIR / "navigation_urls.txt").write_text(
        "\n".join(navigation_urls) + ("\n" if navigation_urls else ""), encoding="utf-8",
    )

    # Every rule label appears in the summary, including those with zero hits.
    prune_counts = {label: 0 for label, _ in SELF_HOSTED_PRUNE_RULES}
    for _, reason in stats.pruned:
        prune_counts[reason] += 1

    summary = {
        "capabilities": {
            name: {
                "unique_urls": len(stats.outcomes[name]),
                "navigation_pages": sum(1 for o in stats.outcomes[name] if o.is_navigation),
            } for name in CAPABILITIES
        },
        "self_hosted_pruned": {"total": len(stats.pruned), "by_rule": prune_counts},
        "total_in_scope": sum(len(stats.outcomes[name]) for name in CAPABILITIES),
        "total_navigation_pages": len(navigation_urls),
        "discovered_total": stats.discovered_total,
        "fetch_failed_count": len(stats.fetch_failed),
        "unmatched_count": len(stats.unmatched),
    }
    (FILTERED_DIR / "summary.json").write_text(json.dumps(summary, indent=2) + "\n", encoding="utf-8")
371
+
372
+
373
def print_summary(stats: RunStats, quarantine_result: dict) -> None:
    """Print the human-readable run report to stdout.

    Covers discovery totals, fetch failures, per-capability status counts,
    pruned pages, the three parts of ``quarantine_result`` (quarantined,
    still_alive, skipped_capabilities) and unmatched URLs.
    """
    print(f"\nTotal descubierto bajo /w/dxp: {stats.discovered_total}")
    if stats.fetch_failed:
        print(f"Fallos de fetch: {len(stats.fetch_failed)}")
        for url in stats.fetch_failed:
            print(f" - {url}")

    # Per-capability breakdown by status, plus running totals.
    print("\nPor capability (nuevas / actualizadas / sin cambios / navegación):")
    total_in_scope = 0
    total_navigation = 0
    for capability, outcomes in stats.outcomes.items():
        new = sum(1 for o in outcomes if o.status == "new")
        updated = sum(1 for o in outcomes if o.status == "updated")
        unchanged = sum(1 for o in outcomes if o.status == "unchanged")
        navigation = sum(1 for o in outcomes if o.is_navigation)
        total_in_scope += len(outcomes)
        total_navigation += navigation
        print(f" {capability:12s}: {len(outcomes):4d} total "
              f"({new} nuevas, {updated} actualizadas, {unchanged} sin cambios, {navigation} navegación)")
    print(f"\nTotal en scope: {total_in_scope} ({total_navigation} en raw/_navigation/, "
          f"{total_in_scope - total_navigation} en raw/{{capability}}/)")

    print(f"\nSelf-hosted podadas: {len(stats.pruned)}")

    # Files moved to raw/_removed/ during this run.
    quarantined = quarantine_result["quarantined"]
    total_quarantined = sum(len(v) for v in quarantined.values())
    print(f"\nEn cuarentena (URL verificada como caída, HTTP 404/410): {total_quarantined}")
    for capability, slugs in quarantined.items():
        if slugs:
            print(f" {capability}: {len(slugs)}")
            for slug in slugs:
                print(f" - {slug}")

    # Orphans left in place because their URL was not confirmed gone.
    still_alive = quarantine_result["still_alive"]
    total_still_alive = sum(len(v) for v in still_alive.values())
    if total_still_alive:
        print(f"\nNo redescubiertas por el BFS pero SIGUEN VIVAS (no se tocaron, revisar cobertura del crawl): {total_still_alive}")
        for capability, slugs in still_alive.items():
            if slugs:
                print(f" {capability}: {len(slugs)}")
                for slug in slugs:
                    print(f" - {slug}")

    if quarantine_result["skipped_capabilities"]:
        print("\nADVERTENCIA: cuarentena omitida por caída sospechosa de conteo "
              "(posible crawl incompleto), revisar a mano:")
        for capability in quarantine_result["skipped_capabilities"]:
            print(f" - {capability}")

    if stats.unmatched:
        print(f"\nURLs raras (ni en scope ni en descartadas conocidas), {len(stats.unmatched)}:")
        for url in stats.unmatched:
            print(f" {url}")
426
+
427
+
428
def main() -> None:
    """CLI entry point: crawl, quarantine orphans, write reports, print the summary.

    Unless --skip-regression-check is given, the regression check then runs
    against the last commit -- but only when the docs directory is itself a
    git repository. If it is not, or git cannot be run or fails (e.g. a
    repository with no commits yet), the check is skipped with a message
    instead of crashing after the crawl has already finished.

    Exits with status 1 when any fetch failed or the regression check flagged
    something.
    """
    # Local import: only needed to recognise a failed git invocation below.
    import subprocess

    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--max-depth", type=int, default=DEFAULT_MAX_DEPTH)
    parser.add_argument("--max-pages", type=int, default=DEFAULT_MAX_PAGES)
    parser.add_argument("--skip-regression-check", action="store_true",
                        help="Skip the post-run check_regressions.run_check() call (e.g. no git repo yet).")
    args = parser.parse_args()

    stats = asyncio.run(run_crawl(args.max_depth, args.max_pages))
    quarantine_result = quarantine_orphans(stats)
    write_filtered_reports(stats)
    print_summary(stats, quarantine_result)

    suspicious = False
    if not args.skip_regression_check:
        print("\n--- Verificación de regresiones (contra el último commit) ---")
        if not (ROOT / ".git").exists():
            print(f"Omitida: {ROOT} no es un repositorio git.")
        else:
            try:
                suspicious = run_regression_check()
            except (subprocess.CalledProcessError, OSError) as exc:
                # OSError covers a missing git executable (FileNotFoundError).
                print(f"Omitida: no se pudo ejecutar git ({exc}).")

    if stats.fetch_failed or suspicious:
        sys.exit(1)
448
+
449
+
450
+ if __name__ == "__main__":
451
+ main()
@@ -0,0 +1,128 @@
1
+ Metadata-Version: 2.4
2
+ Name: liferay-docs-scraper
3
+ Version: 0.1.0
4
+ Summary: Scrape learn.liferay.com/w/dxp into a local Markdown corpus (raw/{capability}/*.md) for the liferay-expert Claude Code skill.
5
+ Requires-Python: <3.14,>=3.10
6
+ Requires-Dist: crawl4ai>=0.9.0
7
+ Description-Content-Type: text/markdown
8
+
9
+ # liferay-docs-scraper
10
+
11
+ Scrapes `learn.liferay.com/w/dxp/*` into a local, clean Markdown corpus
12
+ (`raw/{capability}/*.md`) and ships a Claude Code skill (`liferay-expert`)
13
+ that answers Liferay DXP questions by searching and citing that corpus.
14
+
15
+ **This repo does not ship Liferay's documentation.** It ships the code that
16
+ scrapes it, and a skill that reads whatever you scrape locally. Each user
17
+ builds and refreshes their own copy directly from learn.liferay.com.
18
+
19
+ ## Quickstart
20
+
21
+ The recommended order for a first-time setup: scrape, then install the
22
+ skill, then ask questions.
23
+
24
+ **1. Build the corpus (one-time, ~30-40 min):**
25
+
26
+ ```bash
27
+ uvx --from crawl4ai crawl4ai-setup # one-time, installs Playwright browsers
28
+ uvx --python 3.13 --from "git+https://github.com/mordonez/liferay-docs-scraper" liferay-docs-scraper
29
+ ```
30
+
31
+ Run this from anywhere -- it does not write into your current directory;
32
+ see "Reference: the scraper in detail" below for exactly where it goes.
33
+
34
+ **2. Install the skill into whatever project you're working in:**
35
+
36
+ ```bash
37
+ npx skills add mordonez/liferay-docs-scraper --skill liferay-expert -a claude-code
38
+ ```
39
+
40
+ You'll see:
41
+
42
+ ```
43
+ ◇ Installed 1 skill ───────────────────╮
44
+ │ │
45
+ │ ✓ liferay-expert (copied) │
46
+ │ → ./.claude/skills/liferay-expert │
47
+ │ │
48
+ ├───────────────────────────────────────╯
49
+ ```
50
+
51
+ **3. Ask Claude Code a Liferay question**, e.g. "how do I configure a
52
+ synonym set in Liferay search?" The skill finds the corpus, greps the
53
+ `search` capability, reads `search-administration-and-tuning-synonym-sets.md`,
54
+ and answers grounded in that page -- citing
55
+ `https://learn.liferay.com/w/dxp/search/search-administration-and-tuning/synonym-sets`
56
+ as the source.
57
+
58
+ The corpus is shared across every project where you install the skill (see
59
+ the default-location table below), so step 1 is only ever needed once per
60
+ machine -- rerun it later just to refresh, not per-project.
61
+
62
+ **If you install the skill without doing step 1 first** (or its corpus goes
63
+ stale), it notices and tells you what to run rather than guessing or
64
+ answering ungrounded -- it never launches the ~30-40 min scrape on its own
65
+ mid-conversation. See "Step 1/2" in `skills/liferay-expert/SKILL.md` for
66
+ that check.
67
+
68
+ ## Reference: the scraper in detail
69
+
70
+ Requires Python 3.10-3.13 (crawl4ai's Playwright dependency doesn't yet
71
+ support 3.14) and [uv](https://docs.astral.sh/uv/).
72
+
73
+ ```bash
74
+ # One-time: installs the Playwright/Chromium browser crawl4ai drives
75
+ uvx --from crawl4ai crawl4ai-setup
76
+
77
+ # From anywhere -- the corpus does NOT go in your current directory.
78
+ # Not on PyPI yet, so install straight from GitHub:
79
+ uvx --python 3.13 --from "git+https://github.com/mordonez/liferay-docs-scraper" liferay-docs-scraper
80
+ ```
81
+
82
+ This takes roughly 30-40 minutes (BFS deep crawl of ~1900 pages across 14
83
+ capabilities) and writes to one shared, per-user location (so it's the same
84
+ corpus no matter which project you're in when the skill looks for it):
85
+
86
+ | OS | Default location |
87
+ |---|---|
88
+ | macOS | `~/Library/Application Support/liferay-docs/` |
89
+ | Linux | `~/.local/share/liferay-docs/` (or `$XDG_DATA_HOME/liferay-docs`) |
90
+ | Windows | `%LOCALAPPDATA%\liferay-docs\` |
91
+
92
+ Set `LIFERAY_DOCS_DIR` to override (e.g. to keep a project-local copy instead).
93
+
94
+ Inside that directory:
95
+
96
+ - `raw/{capability}/*.md` — the corpus, one file per page
97
+ - `raw/_navigation/{capability}/*.md` — pure TOC pages, kept but deprioritized
98
+ - `raw/_removed/{capability}/*.md` — pages confirmed gone from the live site
99
+ - `reports/filtered/` — URL manifests, self-hosted prune log, run summary
100
+
101
+ Re-run it anytime (weekly recommended) to refresh: it starts from zero every
102
+ time, so it naturally picks up new pages, updates changed ones, and
103
+ quarantines (never deletes) removed ones. If that directory is (or becomes)
104
+ a git repo -- worth doing once, purely as a local diffing tool, nothing needs
105
+ pushing anywhere -- it also runs `check-regressions` automatically afterward
106
+ and flags any file that shrank by more than half or grew more than 3x versus
107
+ the last commit (signals of a broken or partial fetch, or of the content selector falling back to the whole page); see
108
+ `docs/adr/0001-crawl4ai-based-corpus-pipeline.md` for why that check exists.
109
+
110
+ ## Reference: the skill in detail
111
+
112
+ ```bash
113
+ npx skills add mordonez/liferay-docs-scraper --skill liferay-expert
114
+ ```
115
+
116
+ Or just copy `skills/liferay-expert/SKILL.md` into `.claude/skills/liferay-expert/`
117
+ in any project. Claude Code picks it up automatically; the skill itself
118
+ resolves `$LIFERAY_DOCS_DIR` (or the OS default above) to find the corpus,
119
+ so it works the same regardless of which project you installed it into.
120
+
121
+ ## Why no bundled docs, no embeddings, no vector DB
122
+
123
+ See `docs/adr/` for the full reasoning. Short version: the corpus is
124
+ Liferay's copyrighted documentation text -- distributing the *tool* that
125
+ scrapes public pages is a different, much lower-risk thing than a third
126
+ party redistributing that text at scale. Plain grep + Read over ~1800
127
+ well-organized Markdown files is fast enough that no search index is needed;
128
+ add one later if that stops being true.
@@ -0,0 +1,9 @@
1
+ liferay_docs_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ liferay_docs_scraper/check_regressions.py,sha256=_7k-5wNuxrhRJTbggUufwhcH8ECXByjUUXm_GELwHQ8,4114
3
+ liferay_docs_scraper/classify_pages.py,sha256=yXo6eKLlsFpoM1CKzcZUNEWBN8RcQ1Nc_2qNgqnkLLs,1448
4
+ liferay_docs_scraper/filter_urls.py,sha256=kHrskK2xyPO9tKCrjJUepLEN5k5xunG9I6iFg2J3fxU,5615
5
+ liferay_docs_scraper/pipeline.py,sha256=pnivtmSXP7oGCjV7Ta52ZEjd1655tJMoveTKU7GcWqQ,19959
6
+ liferay_docs_scraper-0.1.0.dist-info/METADATA,sha256=fSSrsFCFYg_Gbs8GEXcJfxR3kZaWDyqM0AFKUEDpzOc,5522
7
+ liferay_docs_scraper-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
8
+ liferay_docs_scraper-0.1.0.dist-info/entry_points.txt,sha256=TFz1Q5Aj4QrlDOMZFWCHCN4qKDTqEG985t0JhoLYJVU,140
9
+ liferay_docs_scraper-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ check-regressions = liferay_docs_scraper.check_regressions:main
3
+ liferay-docs-scraper = liferay_docs_scraper.pipeline:main