doculayer 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doculayer/__init__.py +142 -0
- doculayer/cli.py +132 -0
- doculayer/config.py +31 -0
- doculayer/core/__init__.py +1 -0
- doculayer/core/cache.py +73 -0
- doculayer/core/citation.py +48 -0
- doculayer/core/fetcher.py +102 -0
- doculayer/core/parser.py +156 -0
- doculayer/core/search.py +58 -0
- doculayer/mcp_server.py +262 -0
- doculayer/setup_wizard.py +293 -0
- doculayer/sources/__init__.py +1 -0
- doculayer/sources/base.py +33 -0
- doculayer/sources/generic.py +144 -0
- doculayer/sources/llms_txt.py +109 -0
- doculayer/sources/registry.py +131 -0
- doculayer-0.1.0.dist-info/METADATA +19 -0
- doculayer-0.1.0.dist-info/RECORD +20 -0
- doculayer-0.1.0.dist-info/WHEEL +4 -0
- doculayer-0.1.0.dist-info/entry_points.txt +3 -0
doculayer/__init__.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""DocuLayer: live documentation access layer for AI agents.
|
|
2
|
+
|
|
3
|
+
Zero hallucination — content is fetched verbatim from source, never generated.
|
|
4
|
+
No disk storage — ephemeral in-memory TTL cache only.
|
|
5
|
+
Quick access — llms.txt-first with BM25 section search.
|
|
6
|
+
|
|
7
|
+
Quick start
|
|
8
|
+
-----------
|
|
9
|
+
import asyncio
|
|
10
|
+
from doculayer import search, fetch
|
|
11
|
+
|
|
12
|
+
results = asyncio.run(search("dependency injection", "fastapi"))
|
|
13
|
+
for r in results:
|
|
14
|
+
print(r.section.source_url, r.score)
|
|
15
|
+
print(r.section.trimmed(200))
|
|
16
|
+
|
|
17
|
+
content = asyncio.run(fetch("httpx", section="AsyncClient"))
|
|
18
|
+
print(content)
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from .config import DocuLayerConfig, get_config
|
|
24
|
+
from .core.citation import attr_header
|
|
25
|
+
from .core.fetcher import DocFetcher, FetchResult
|
|
26
|
+
from .core.parser import DocParser, DocSection
|
|
27
|
+
from .core.search import DocSearcher, SearchResult
|
|
28
|
+
from .sources.generic import GenericDocSource
|
|
29
|
+
from .sources.registry import resolve_identifier
|
|
30
|
+
|
|
31
|
+
__version__ = "0.1.0"
# Names exported by ``from doculayer import *``: the configuration helpers,
# the core fetch/parse/search classes, the generic source, the identifier
# resolver, and the two high-level coroutines defined in this module.
__all__ = [
    "DocuLayerConfig",
    "get_config",
    "DocFetcher",
    "FetchResult",
    "DocParser",
    "DocSection",
    "DocSearcher",
    "SearchResult",
    "GenericDocSource",
    "resolve_identifier",
    "search",
    "fetch",
]
|
|
46
|
+
|
|
47
|
+
# ── module-level singleton fetcher ────────────────────────────────────────────
|
|
48
|
+
|
|
49
|
+
_fetcher: DocFetcher | None = None


def _get_fetcher() -> DocFetcher:
    """Return the module-wide :class:`DocFetcher`, creating it on first use.

    The instance is built from the values returned by ``get_config()`` and
    then stored in the module global ``_fetcher`` so later calls reuse it.
    """
    global _fetcher
    if _fetcher is not None:
        return _fetcher

    settings = get_config()
    _fetcher = DocFetcher(
        ttl=settings.cache_ttl,
        max_size=settings.max_cache_size,
        timeout=settings.fetch_timeout,
        max_bytes=settings.max_fetch_bytes,
    )
    return _fetcher
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# ── high-level async API ──────────────────────────────────────────────────────
|
|
66
|
+
|
|
67
|
+
async def search(
    query: str,
    identifier: str,
    top_k: int = 5,
) -> list[SearchResult]:
    """Search the live documentation of *identifier* for *query*.

    Args:
        query: What to look for, e.g. ``"dependency injection"``.
        identifier: Package name or URL, e.g. ``"fastapi"``,
            ``"pypi:httpx"``, ``"npm:react"``, ``"https://docs.example.com"``.
        top_k: Maximum number of sections to return.

    Returns:
        A list of :class:`SearchResult` objects; ``section.source_url`` on
        each result identifies where the text came from.

    Raises:
        ValueError: if *identifier* cannot be resolved to a URL.
    """
    resolved = await resolve_identifier(identifier)
    if resolved:
        source = GenericDocSource(resolved, _get_fetcher(), label=identifier)
        return await source.search(query, top_k)
    raise ValueError(f"Cannot resolve documentation source: {identifier!r}")
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
async def fetch(
    identifier: str,
    section: str | None = None,
) -> str:
    """Fetch documentation for *identifier*.

    Args:
        identifier: Package name or URL.
        section: Optional heading to extract, e.g. ``"Installation"``.

    Returns:
        Markdown string that starts with a source attribution header.
        With *section* given, the matching section (or a "not found"
        message) is returned.  Otherwise whole sections of the page are
        concatenated until the word budget
        (``max_section_words * 4``) would be exceeded; a truncation note is
        appended when sections were left out.

    Raises:
        ValueError: if *identifier* cannot be resolved.
    """
    url = await resolve_identifier(identifier)
    if not url:
        raise ValueError(f"Cannot resolve documentation source: {identifier!r}")

    src = GenericDocSource(url, _get_fetcher(), label=identifier)

    if section:
        found = await src.fetch_section(section)
        if found:
            return found.cited_content
        return f"Section '{section}' not found in {identifier} docs."

    sections = await src.fetch_page()
    if not sections:
        return f"No content found at {url}"

    first = sections[0]
    header = attr_header(first.source_url, first.fetched_at, first.from_cache) + "\n\n"
    # Configuration is only needed on the whole-page path.
    limit = get_config().max_section_words * 4
    parts: list[str] = []
    total = 0
    for sec in sections:
        wc = len(sec.content.split())
        if total + wc > limit:
            if not parts:
                # The very first section is already over budget.  Emit a
                # trimmed version instead of returning a header with no
                # documentation text at all.
                # NOTE(review): assumes page sections are DocSection objects
                # (they expose content/source_url/fetched_at/from_cache) --
                # confirm against GenericDocSource.fetch_page.
                parts.append(sec.trimmed(limit))
            parts.append("*(truncated — use `section=` for targeted access)*")
            break
        parts.append(sec.content)
        total += wc

    return header + "\n\n".join(parts)
|
doculayer/cli.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""DocuLayer CLI.
|
|
2
|
+
|
|
3
|
+
Usage
|
|
4
|
+
-----
|
|
5
|
+
doculayer setup # auto-detect IDE and install MCP server
|
|
6
|
+
doculayer search "dependency injection" --source fastapi
|
|
7
|
+
doculayer fetch httpx --section AsyncClient
|
|
8
|
+
doculayer resolve pydantic
|
|
9
|
+
doculayer sources
|
|
10
|
+
doculayer mcp # start MCP server on stdio
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import asyncio
|
|
16
|
+
import sys
|
|
17
|
+
|
|
18
|
+
import click
|
|
19
|
+
|
|
20
|
+
from . import fetch as _fetch
|
|
21
|
+
from . import search as _search
|
|
22
|
+
from .setup_wizard import setup as _setup_cmd
|
|
23
|
+
from .sources.registry import resolve_identifier
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Root command group.  The docstring below is what click shows as the
# top-level help text; subcommands are attached with ``@main.command()``
# further down and with ``main.add_command``.
@click.group()
@click.version_option(package_name="doculayer")
def main() -> None:
    """DocuLayer: live documentation access for AI agents."""


# Attach the command object imported from ``setup_wizard`` to the group.
main.add_command(_setup_cmd)
|
|
34
|
+
|
|
35
|
+
# ── search ─────────────────────────────────────────────────────────────────────
|
|
36
|
+
|
|
37
|
+
@main.command()
@click.argument("query")
@click.option("--source", "-s", required=True, help="Package name or URL to search")
@click.option("--results", "-n", default=3, show_default=True, help="Max results")
def search(query: str, source: str, results: int) -> None:
    """Search QUERY in live documentation for SOURCE."""
    # The docstring above is the command's help text, so it is kept short.

    async def _run() -> None:
        # Resolution failures surface as ValueError from the library call.
        try:
            hits = await _search(query, source, top_k=results)
        except ValueError as exc:
            click.echo(str(exc), err=True)
            sys.exit(1)

        if not hits:
            click.echo(f"No results for '{query}' in {source}.", err=True)
            sys.exit(1)

        # One framed entry per hit: rank, title, score, URL, then an excerpt.
        for rank, hit in enumerate(hits, 1):
            click.echo(f"\n{'─' * 60}")
            click.echo(f" {rank}. {hit.section.title} (score {hit.score:.2f})")
            click.echo(f" {hit.section.source_url}")
            click.echo(f"{'─' * 60}")
            click.echo(hit.section.trimmed(200))

    asyncio.run(_run())
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# ── fetch ──────────────────────────────────────────────────────────────────────
|
|
66
|
+
|
|
67
|
+
@main.command()
@click.argument("identifier")
@click.option("--section", "-S", default=None, help="Section heading to extract")
def fetch(identifier: str, section: str | None) -> None:
    """Fetch documentation for IDENTIFIER."""
    # The docstring above is the command's help text.

    async def _run() -> None:
        try:
            text = await _fetch(identifier, section=section)
        except ValueError as exc:
            # Unresolvable identifier: report on stderr and exit non-zero.
            click.echo(str(exc), err=True)
            sys.exit(1)
        else:
            click.echo(text)

    asyncio.run(_run())
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# ── resolve ────────────────────────────────────────────────────────────────────
|
|
85
|
+
|
|
86
|
+
@main.command()
@click.argument("identifier")
def resolve(identifier: str) -> None:
    """Print the resolved documentation URL for IDENTIFIER."""
    # The docstring above is the command's help text.

    async def _run() -> None:
        target = await resolve_identifier(identifier)
        if not target:
            # Nothing resolved: report on stderr and exit non-zero.
            click.echo(f"Cannot resolve: {identifier}", err=True)
            sys.exit(1)
        click.echo(target)

    asyncio.run(_run())
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# ── sources ────────────────────────────────────────────────────────────────────
|
|
103
|
+
|
|
104
|
+
@main.command()
def sources() -> None:
    """List packages with built-in llms.txt support."""
    # Imported inside the command, as in the rest of this module's lazy imports.
    from .sources.registry import KNOWN_LLMS_TXT, KNOWN_URLS

    emit = click.echo

    emit("Packages with llms.txt support (fast targeted access):")
    for pkg in sorted(KNOWN_LLMS_TXT):
        emit(f" {pkg}")

    # The URL-shortcut table is only printed when it has entries.
    if KNOWN_URLS:
        emit()
        emit("Packages with direct URL shortcuts:")
        for pkg, link in sorted(KNOWN_URLS.items()):
            emit(f" {pkg:<16} {link}")

    emit()
    emit("Identifier formats: bare-name · pypi: · npm: · gh: · https://")
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
# ── mcp ────────────────────────────────────────────────────────────────────────
|
|
122
|
+
|
|
123
|
+
@main.command("mcp")
def mcp_serve() -> None:
    """Start DocuLayer as an MCP server (stdio transport)."""
    # Import is deferred to call time so the other subcommands do not load
    # the server module.
    from .mcp_server import main as run_server

    run_server()
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# Invoke the click group when this module is executed as a script.
if __name__ == "__main__":
    main()
|
doculayer/config.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""DocuLayer configuration.
|
|
2
|
+
|
|
3
|
+
All settings read from environment variables with sane defaults.
|
|
4
|
+
No config files — no disk reads.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
import os
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class DocuLayerConfig:
    """Runtime settings for DocuLayer.

    Each field's default is computed from an environment variable when this
    class body is executed (i.e. at import time), falling back to the
    literal shown when the variable is unset.
    """

    # Cache (in-memory only; no disk persistence)
    # Entry lifetime in seconds.
    cache_ttl: int = int(os.environ.get("DOCULAYER_CACHE_TTL", "3600"))
    # Maximum number of cached entries.
    max_cache_size: int = int(os.environ.get("DOCULAYER_MAX_CACHE", "256"))

    # HTTP fetch
    fetch_timeout: float = float(os.environ.get("DOCULAYER_FETCH_TIMEOUT", "12.0"))
    max_fetch_bytes: int = int(os.environ.get("DOCULAYER_MAX_BYTES", str(512 * 1024)))

    # Output shaping
    max_section_words: int = int(os.environ.get("DOCULAYER_MAX_WORDS", "400"))


# Single shared instance handed out by get_config().
_default = DocuLayerConfig()


def get_config() -> DocuLayerConfig:
    """Return the shared module-level :class:`DocuLayerConfig` instance."""
    return _default
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# doculayer core
|
doculayer/core/cache.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""Process-local TTL cache.
|
|
2
|
+
|
|
3
|
+
Zero disk I/O by design. Cache entries evaporate when the process exits.
|
|
4
|
+
This is the storage guarantee DocuLayer makes: no docs are persisted.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import time
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from typing import Generic, Optional, TypeVar
|
|
12
|
+
|
|
13
|
+
T = TypeVar("T")


@dataclass
class _Entry(Generic[T]):
    """A cached value together with its expiry deadline."""

    value: T
    expires_at: float  # deadline on the time.monotonic() clock

    @property
    def alive(self) -> bool:
        """Whether the monotonic clock has not yet reached ``expires_at``."""
        return time.monotonic() < self.expires_at


class TTLCache(Generic[T]):
    """In-memory TTL cache.

    Eviction policy: when a new key would exceed ``max_size``, expired
    entries are dropped first; if the store is still full, the entry with
    the earliest expiry is removed.

    A ``max_size`` of zero or less disables caching: ``set`` becomes a no-op
    and ``get`` always returns ``None``.

    No locking is performed; the cache is meant for single-threaded
    (asyncio) use.
    """

    def __init__(self, ttl: int, max_size: int = 256) -> None:
        """Create a cache whose entries live *ttl* seconds, holding at most *max_size* keys."""
        self._ttl = ttl
        self._max = max_size
        self._store: dict[str, _Entry[T]] = {}

    # ── public ────────────────────────────────────────────────────────────────

    def get(self, key: str) -> Optional[T]:
        """Return the live value for *key*, or ``None`` if absent or expired.

        An expired entry is removed from the store as a side effect.
        """
        entry = self._store.get(key)
        if entry is None:
            return None
        if not entry.alive:
            del self._store[key]
            return None
        return entry.value

    def set(self, key: str, value: T) -> None:
        """Store *value* under *key* with a fresh TTL, evicting if the store is full."""
        if self._max <= 0:
            # Capacity of zero (or a negative value) means "do not cache".
            # Previously this path reached min() on an empty dict and raised.
            return
        if key not in self._store and len(self._store) >= self._max:
            self._evict()
        self._store[key] = _Entry(value=value, expires_at=time.monotonic() + self._ttl)

    def invalidate(self, key: str) -> None:
        """Remove *key* if present; missing keys are ignored."""
        self._store.pop(key, None)

    def clear(self) -> None:
        """Remove every entry."""
        self._store.clear()

    @property
    def stats(self) -> dict[str, int]:
        """Counts of live entries (``alive``) and stored entries (``total``)."""
        live = sum(1 for v in self._store.values() if v.alive)
        return {"alive": live, "total": len(self._store)}

    # ── private ───────────────────────────────────────────────────────────────

    def _evict(self) -> None:
        """Free space: drop expired entries, then the earliest-expiring one if still full."""
        dead = [k for k, v in self._store.items() if not v.alive]
        for k in dead:
            del self._store[k]
        # Guard against an empty store so min() never sees an empty iterable.
        if self._store and len(self._store) >= self._max:
            victim = min(self._store, key=lambda k: self._store[k].expires_at)
            del self._store[victim]
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Shared citation formatting for DocuLayer.
|
|
2
|
+
|
|
3
|
+
Every piece of documentation returned to a caller MUST include provenance.
|
|
4
|
+
This module provides a single source of truth for that formatting so no
|
|
5
|
+
output path can silently omit a source attribution.
|
|
6
|
+
|
|
7
|
+
Design guarantee
|
|
8
|
+
----------------
|
|
9
|
+
``attr_header`` and ``cited_block`` are the only functions that produce
|
|
10
|
+
attribution strings. Import them here — never duplicate the format.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import time
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def attr_header(url: str, fetched_at: float, from_cache: bool) -> str:
    """Return a two-line Markdown blockquote attribution.

    Example output::

        > **Source**: https://fastapi.tiangolo.com/tutorial/
        > **Fetched**: 42s ago *(cached)*

    The age is shown in whole seconds below one minute, whole minutes below
    one hour, and whole hours otherwise.

    Args:
        url: The canonical URL the content was fetched from.
        fetched_at: Unix timestamp of the fetch (``time.time()``).
        from_cache: Whether the result was served from the in-memory cache.
    """
    # Clamp at zero: a timestamp slightly in the future (e.g. after a clock
    # adjustment) would otherwise render as a negative age like "-3s ago".
    age = max(0.0, time.time() - fetched_at)
    if age < 60:
        age_s = f"{int(age)}s"
    elif age < 3600:
        age_s = f"{int(age / 60)}m"
    else:
        age_s = f"{int(age / 3600)}h"
    note = " *(cached)*" if from_cache else ""
    return f"> **Source**: {url} \n> **Fetched**: {age_s} ago{note}"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def cited_block(content: str, url: str, fetched_at: float, from_cache: bool) -> str:
    """Prefix *content* with the attribution header for its source.

    The header produced by ``attr_header`` comes first, followed by a blank
    line and then *content*, so provenance is seen before the text.
    """
    citation = attr_header(url, fetched_at, from_cache)
    return f"{citation}\n\n{content}"
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Async HTTP fetcher backed by TTLCache.
|
|
2
|
+
|
|
3
|
+
Guarantees:
|
|
4
|
+
- No file-system writes.
|
|
5
|
+
- Stale content is never served past the TTL.
|
|
6
|
+
- Hard byte cap prevents oversized responses from swamping memory.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import hashlib
|
|
12
|
+
import time
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
|
|
15
|
+
import httpx
|
|
16
|
+
|
|
17
|
+
from .cache import TTLCache
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class FetchResult:
    """Outcome of fetching one URL."""

    url: str  # final URL after redirects
    content: str
    content_type: str
    fetched_at: float  # Unix timestamp
    from_cache: bool

    def age_label(self) -> str:
        """Return the time since ``fetched_at`` as a short label.

        Whole seconds (``"42s"``) below one minute, whole minutes (``"5m"``)
        below one hour, whole hours (``"2h"``) otherwise.
        """
        # Clamp at zero so a timestamp slightly in the future never yields a
        # negative label such as "-3s".
        secs = max(0.0, time.time() - self.fetched_at)
        if secs < 60:
            return f"{int(secs)}s"
        if secs < 3600:
            return f"{int(secs / 60)}m"
        return f"{int(secs / 3600)}h"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class DocFetcher:
    """Fetch documentation pages with TTL in-memory caching."""

    _HEADERS = {
        "User-Agent": "DocuLayer/1.0 (live-doc-access-layer; no-hallucination)",
        "Accept": "text/html,application/xhtml+xml,text/plain;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }

    def __init__(
        self,
        ttl: int = 3600,
        max_size: int = 256,
        timeout: float = 12.0,
        max_bytes: int = 524_288,
    ) -> None:
        """Configure the fetcher.

        Args:
            ttl: Lifetime of a cached result, in seconds.
            max_size: Maximum number of cached results.
            timeout: Timeout passed to the HTTP client.
            max_bytes: Upper bound on the number of body bytes kept per fetch.
        """
        self._cache: TTLCache[FetchResult] = TTLCache(ttl, max_size)
        self._ttl = ttl
        self._timeout = timeout
        self._max_bytes = max_bytes

    @staticmethod
    def _key(url: str) -> str:
        """Return the cache key for *url*: the first 20 hex chars of its SHA-256."""
        return hashlib.sha256(url.encode()).hexdigest()[:20]

    @staticmethod
    def _decode(data: bytes, encoding: str | None) -> str:
        """Decode *data*, tolerating a multi-byte character cut off at the end.

        An incremental decoder with ``final=False`` leaves an incomplete
        trailing byte sequence undecoded instead of emitting a replacement
        character, which matters because the body may have been cut at the
        byte cap.  Unknown or missing encodings fall back to UTF-8; invalid
        bytes elsewhere are replaced.
        """
        import codecs  # local import: only needed on the network path

        try:
            decoder_cls = codecs.getincrementaldecoder(encoding or "utf-8")
        except LookupError:
            decoder_cls = codecs.getincrementaldecoder("utf-8")
        return decoder_cls(errors="replace").decode(data, final=False)

    async def fetch(self, url: str, force: bool = False) -> FetchResult:
        """Fetch *url*, returning a cached result when still live.

        Args:
            url: Address to fetch.
            force: When true, skip the cache lookup and always hit the network.

        Returns:
            A :class:`FetchResult`; ``from_cache`` tells whether it came from
            the cache.  At most ``max_bytes`` bytes of the body are kept.

        Raises:
            httpx.HTTPStatusError: for 4xx/5xx responses (``raise_for_status``).
        """
        key = self._key(url)

        if not force:
            hit = self._cache.get(key)
            if hit is not None:
                # Hand back a copy flagged as cached; the stored entry keeps
                # its original from_cache value.
                return FetchResult(
                    url=hit.url,
                    content=hit.content,
                    content_type=hit.content_type,
                    fetched_at=hit.fetched_at,
                    from_cache=True,
                )

        # Stream the body so reading stops once the byte cap is reached,
        # instead of downloading the whole response and slicing afterwards.
        body = bytearray()
        async with httpx.AsyncClient(
            follow_redirects=True, timeout=self._timeout
        ) as client:
            async with client.stream("GET", url, headers=self._HEADERS) as resp:
                resp.raise_for_status()
                async for chunk in resp.aiter_bytes():
                    body.extend(chunk)
                    if len(body) >= self._max_bytes:
                        break
                final_url = str(resp.url)
                ct = resp.headers.get("content-type", "text/html")
                charset = resp.charset_encoding

        # The last chunk may overshoot the cap, so trim before decoding.
        content = self._decode(bytes(body[: max(self._max_bytes, 0)]), charset)

        result = FetchResult(
            url=final_url,
            content=content,
            content_type=ct,
            fetched_at=time.time(),
            from_cache=False,
        )
        self._cache.set(key, result)
        return result

    @property
    def cache_stats(self) -> dict[str, int]:
        """Statistics reported by the underlying cache."""
        return self._cache.stats

    @property
    def ttl(self) -> int:
        """The TTL, in seconds, this fetcher was configured with."""
        return self._ttl
|
doculayer/core/parser.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""Parse HTML or Markdown documentation into a flat list of DocSection objects.
|
|
2
|
+
|
|
3
|
+
Sections map 1-to-1 with headings. The parser never generates text —
|
|
4
|
+
every word in a DocSection came verbatim from the fetched source.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
import time
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
import markdownify
|
|
15
|
+
from bs4 import BeautifulSoup
|
|
16
|
+
|
|
17
|
+
from .citation import attr_header
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class DocSection:
    """One heading-delimited section of a documentation page."""

    title: str
    content: str  # Markdown, verbatim from source
    level: int  # heading depth: 0 = pre-heading preamble, 1-6 = h1-h6
    anchor: Optional[str]
    source_url: str
    fetched_at: float = field(default_factory=time.time)
    from_cache: bool = False

    def trimmed(self, max_words: int) -> str:
        """Return content cut after *max_words* whitespace-separated words.

        The kept prefix is sliced from the original text, so line breaks,
        indentation and code fences inside it are preserved.  When anything
        was cut, a ``*(content truncated)*`` note is appended.  A negative
        *max_words* is treated as zero.
        """
        limit = max(max_words, 0)
        end = 0
        for count, word in enumerate(re.finditer(r"\S+", self.content), 1):
            if count > limit:
                return self.content[:end] + "\n\n*(content truncated)*"
            end = word.end()
        # Loop finished without exceeding the limit: nothing to cut.
        return self.content

    @property
    def cited_content(self) -> str:
        """Content with a Markdown blockquote citation prepended.

        Use this property whenever returning documentation to a caller so
        provenance travels with the text.  Example::

            > **Source**: https://fastapi.tiangolo.com/tutorial/
            > **Fetched**: 12s ago

            ## Dependency Injection
            ...
        """
        return f"{attr_header(self.source_url, self.fetched_at, self.from_cache)}\n\n{self.content}"


# ── HTML noise selectors ──────────────────────────────────────────────────────

# CSS selectors for page chrome that is removed before conversion.
_NOISE_SEL = [
    "nav", "header", "footer", "aside",
    ".sidebar", ".nav", ".navbar", ".header", ".footer",
    ".toc", ".table-of-contents", "#sidebar", ".breadcrumb",
    ".pagination", ".cookie-banner", ".advertisement",
    "[role=navigation]", "[role=banner]", "[role=complementary]",
    "script", "style", "noscript", "iframe",
]

# Heading pattern: lines starting with 1-6 # characters.  Only spaces/tabs
# may separate the marker from the title, so an empty "#" line cannot
# swallow the following line as its title.
_HEADING_RE = re.compile(r"^(#{1,6})[ \t]+(.+)$", re.MULTILINE)
# Pandoc-style heading anchors: {#my-anchor}
_ANCHOR_RE = re.compile(r"\{#([^}]+)\}")
# Opening/closing line of a fenced code block (``` or ~~~, up to 3 spaces indent).
_FENCE_RE = re.compile(r"^ {0,3}(`{3,}|~{3,})", re.MULTILINE)


class DocParser:
    """Stateless parser for HTML and Markdown documentation pages."""

    def parse(
        self,
        content: str,
        url: str,
        content_type: str = "text/html",
        fetched_at: float = 0.0,
        from_cache: bool = False,
    ) -> list[DocSection]:
        """Parse *content* into sections.

        Content whose *content_type* contains ``html`` is stripped of page
        chrome and converted to Markdown first; anything else is treated as
        Markdown already.  The Markdown is then split at heading boundaries.

        Args:
            content: Raw page body.
            url: Source URL recorded on every section.
            content_type: MIME type used to pick the HTML or Markdown path.
            fetched_at: Unix timestamp of the fetch; a falsy value means "now".
            from_cache: Recorded on every section.

        Returns:
            Sections in document order; empty when the page has no text.
        """
        ts = fetched_at or time.time()
        if "html" in content_type.lower():
            md = self._html_to_md(content)
        else:
            md = content
        return self._split_md(md, url, ts, from_cache)

    # ── private ───────────────────────────────────────────────────────────────

    @staticmethod
    def _html_to_md(html: str) -> str:
        """Convert an HTML page to Markdown, keeping only the main content."""
        soup = BeautifulSoup(html, "lxml")

        # Strip navigation / chrome
        for sel in _NOISE_SEL:
            for el in soup.select(sel):
                el.decompose()

        # Find main content region: first match wins, falling back to <body>.
        main = (
            soup.find("main")
            or soup.find(id=re.compile(r"^(content|main|body)$", re.I))
            or soup.find(
                class_=re.compile(r"\b(content|main|body|article|post|docs)\b", re.I)
            )
            or soup.find("article")
            or soup.body
        )

        return markdownify.markdownify(
            str(main or soup), heading_style="ATX", strip=["img"]
        )

    @staticmethod
    def _fenced_spans(md: str) -> list[tuple[int, int]]:
        """Return ``(start, end)`` offsets of fenced code blocks in *md*.

        A fence is closed by a later fence line using the same character and
        at least as many of them; an unterminated fence extends to the end
        of the text.
        """
        spans: list[tuple[int, int]] = []
        open_marker: Optional[str] = None
        open_pos = 0
        for m in _FENCE_RE.finditer(md):
            marker = m.group(1)
            if open_marker is None:
                open_marker, open_pos = marker, m.start()
            elif marker[0] == open_marker[0] and len(marker) >= len(open_marker):
                spans.append((open_pos, m.end()))
                open_marker = None
        if open_marker is not None:
            spans.append((open_pos, len(md)))
        return spans

    @staticmethod
    def _split_md(
        md: str, url: str, fetched_at: float, from_cache: bool
    ) -> list[DocSection]:
        """Split Markdown into one section per heading.

        Text before the first heading becomes an ``"Overview"`` section at
        level 0; a page without headings becomes a single ``"Documentation"``
        section.  Lines starting with ``#`` inside fenced code blocks (for
        example code comments) are not treated as headings.
        """
        md = re.sub(r"\n{3,}", "\n\n", md.strip())
        fences = DocParser._fenced_spans(md)
        matches = [
            m
            for m in _HEADING_RE.finditer(md)
            if not any(start <= m.start() < end for start, end in fences)
        ]
        sections: list[DocSection] = []

        if not matches:
            if md.strip():
                sections.append(
                    DocSection("Documentation", md.strip(), 0, None, url, fetched_at, from_cache)
                )
            return sections

        # Content that appears before the first heading
        preamble = md[: matches[0].start()].strip()
        if preamble:
            sections.append(DocSection("Overview", preamble, 0, None, url, fetched_at, from_cache))

        for i, m in enumerate(matches):
            level = len(m.group(1))
            raw_title = m.group(2).strip()

            # Pull a {#anchor} out of the heading text, if present.
            anchor_m = _ANCHOR_RE.search(raw_title)
            anchor = anchor_m.group(1) if anchor_m else None
            title = _ANCHOR_RE.sub("", raw_title).strip()

            # The body runs up to the next heading, or to the end of the text.
            body_start = m.end()
            body_end = matches[i + 1].start() if i + 1 < len(matches) else len(md)
            body = md[body_start:body_end].strip()

            full = f"{'#' * level} {title}\n\n{body}" if body else f"{'#' * level} {title}"
            sections.append(
                DocSection(title, full, level, anchor, url, fetched_at, from_cache)
            )

        return sections
|