docsgraph 0.1.0a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cairn/__init__.py +5 -0
- cairn/bench/__init__.py +37 -0
- cairn/bench/baseline.py +236 -0
- cairn/bench/dataset.py +109 -0
- cairn/bench/judge.py +126 -0
- cairn/bench/metrics.py +32 -0
- cairn/bench/report.py +143 -0
- cairn/bench/runner.py +219 -0
- cairn/cli/__init__.py +5 -0
- cairn/cli/app.py +776 -0
- cairn/cli/config.py +105 -0
- cairn/core/__init__.py +41 -0
- cairn/core/errors.py +68 -0
- cairn/core/types.py +147 -0
- cairn/embed/__init__.py +17 -0
- cairn/embed/base.py +31 -0
- cairn/embed/doubao.py +167 -0
- cairn/embed/fake.py +36 -0
- cairn/embed/openai_compatible.py +155 -0
- cairn/engine/__init__.py +18 -0
- cairn/engine/indexer.py +298 -0
- cairn/engine/manifest.py +83 -0
- cairn/entity/__init__.py +21 -0
- cairn/entity/base.py +52 -0
- cairn/entity/fake.py +34 -0
- cairn/entity/heuristic.py +148 -0
- cairn/index/__init__.py +39 -0
- cairn/index/entities.py +244 -0
- cairn/index/summaries.py +269 -0
- cairn/index/tree.py +274 -0
- cairn/index/vectors.py +287 -0
- cairn/index/xrefs.py +195 -0
- cairn/ingest/__init__.py +36 -0
- cairn/ingest/base.py +46 -0
- cairn/ingest/markdown.py +244 -0
- cairn/ingest/markitdown.py +145 -0
- cairn/ingest/pdf.py +357 -0
- cairn/inspection.py +971 -0
- cairn/mcp/__init__.py +12 -0
- cairn/mcp/schemas.py +547 -0
- cairn/mcp/server.py +363 -0
- cairn/providers.py +50 -0
- cairn/py.typed +0 -0
- cairn/repo.py +1486 -0
- cairn/repo_search.py +1505 -0
- cairn/summarize/__init__.py +18 -0
- cairn/summarize/base.py +56 -0
- cairn/summarize/cache.py +66 -0
- cairn/summarize/fake.py +43 -0
- cairn/summarize/openai_compatible.py +148 -0
- cairn/summarize/prompts.py +73 -0
- cairn/tools/__init__.py +31 -0
- cairn/tools/base.py +126 -0
- cairn/tools/find_mentions.py +93 -0
- cairn/tools/get_related.py +140 -0
- cairn/tools/get_section.py +130 -0
- cairn/tools/outline.py +75 -0
- cairn/tools/read_range.py +94 -0
- cairn/tools/search_keyword.py +94 -0
- cairn/tools/search_semantic.py +181 -0
- cairn/xref/__init__.py +24 -0
- cairn/xref/base.py +50 -0
- cairn/xref/fake.py +40 -0
- cairn/xref/heuristic.py +217 -0
- docsgraph-0.1.0a2.dist-info/METADATA +688 -0
- docsgraph-0.1.0a2.dist-info/RECORD +69 -0
- docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
- docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
- docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
"""``search_semantic`` retrieval tool.
|
|
2
|
+
|
|
3
|
+
Spec: ``docs/specs/mcp-tools.md`` §4.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
from collections.abc import Sequence
|
|
10
|
+
from typing import Any, Literal
|
|
11
|
+
|
|
12
|
+
from cairn.core.errors import ToolError
|
|
13
|
+
from cairn.embed.base import Embedder
|
|
14
|
+
from cairn.tools.base import DocumentIndex, ToolResponse, estimate_tokens_of_payload
|
|
15
|
+
|
|
16
|
+
# Names of the optional per-hit fields a caller may request via ``include``.
IncludeField = Literal["synopsis", "head", "evidence"]

# Maximum number of leading ``raw_text`` characters returned as a hit's "head".
_HEAD_CHARS: int = 200
# Width, in characters, of the lexical evidence window cut from a section.
_EVIDENCE_CHARS: int = 360
# Upper bound on the number of query terms used to score evidence windows.
_MAX_EVIDENCE_TERMS: int = 40
# Runtime counterpart of ``IncludeField`` used to validate ``include``.
_VALID_INCLUDE: frozenset[str] = frozenset({"synopsis", "head", "evidence"})
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
async def search_semantic(
    index: DocumentIndex,
    *,
    embedder: Embedder,
    query: str,
    scope: str | None = None,
    k: int = 8,
    include: Sequence[IncludeField] = ("synopsis", "head", "evidence"),
) -> ToolResponse:
    """Run a dense-vector search over the indexed document.

    The query is embedded with ``embedder`` and matched against
    ``index.vectors``; every hit that still has a node in ``index.tree`` is
    decorated with the optional fields named in ``include``. The embedder is
    passed in as a dependency so that tests can substitute a fake.

    Args:
        index: Document index providing the vector store, section tree,
            summaries and anchor lookup.
        embedder: Turns the query text into an embedding vector.
        query: Free-text query; must contain non-whitespace characters.
        scope: Optional value forwarded to the vector search as
            ``scope_prefix``.
        k: Number of hits requested from the vector search, 1 to 32.
        include: Optional per-hit fields to attach.

    Returns:
        A ``ToolResponse`` whose payload holds the query, the scope, the hit
        list and a ``cursor`` entry that is always ``None``.

    Raises:
        ToolError: For an out-of-range ``k``, a blank query, unknown
            ``include`` values, an empty embedder result, or a query
            embedding whose length differs from ``index.vectors.dim``.
    """
    # --- argument validation (order matters: k, then query, then include) ---
    if k < 1 or k > 32:
        msg = f"k must be in [1, 32]; got {k}"
        raise ToolError(msg, details={"k": k})
    if not query.strip():
        msg = "query must not be empty"
        raise ToolError(msg)
    bad = [field for field in include if field not in _VALID_INCLUDE]
    if bad:
        msg = f"invalid include values: {bad}"
        raise ToolError(msg, details={"invalid": bad})

    # --- embed the query and make sure it fits the index ---
    embedded = await embedder.embed([query])
    if not embedded:
        msg = "embedder returned no vector for query"
        raise ToolError(msg)
    query_vec = embedded[0]
    expected_dim = index.vectors.dim
    if len(query_vec) != expected_dim:
        msg = (
            f"query embedding dim {len(query_vec)} != "
            f"index dim {index.vectors.dim}"
        )
        raise ToolError(msg)

    matches = await index.vectors.search(query_vec, k=k, scope_prefix=scope)

    requested = set(include)
    want_synopsis = "synopsis" in requested
    want_head = "head" in requested
    want_evidence = "evidence" in requested

    entries: list[dict[str, Any]] = []
    for match in matches:
        node = index.tree.get(match.id)
        if node is None:
            # The vector store references a section the tree no longer has;
            # drop that hit instead of failing the whole request.
            continue
        summary = index.summaries.get(match.id)
        entry: dict[str, Any] = {
            "id": match.id,
            "title": node.title,
            "score": match.score,
            "anchor": index.anchor(match.id),
        }
        if want_synopsis and summary is not None and summary.synopsis:
            entry["synopsis"] = summary.synopsis
        if want_head:
            entry["head"] = node.raw_text[:_HEAD_CHARS]
        if want_evidence:
            entry["evidence"] = _evidence_snippet(node.raw_text, query)
        entries.append(entry)

    payload: dict[str, Any] = {
        "query": query,
        "scope": scope,
        "hits": entries,
        "cursor": None,
    }
    return ToolResponse(
        data=payload,
        tokens_returned=estimate_tokens_of_payload(payload),
    )
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _evidence_snippet(text: str, query: str) -> dict[str, Any]:
    """Return the strongest lexical evidence window for a semantic hit.

    Semantic rank is vector-based, but a short lexical window gives humans and
    agents a cheap explanation of what in the section may have caused the hit.
    When there is no term overlap, fall back to the start of the section.

    Args:
        text: Raw section text. Leading and trailing whitespace is stripped
            before any offsets are computed.
        query: The user's query; its terms come from ``_query_terms``.

    Returns:
        A dict with three keys: ``text`` (the window, with ``...`` added on
        each side that was cut), ``matched_terms`` (the query terms found
        inside the window) and ``span`` (``start``/``end`` offsets of the
        window). The offsets index the *stripped* text, not ``text`` itself.
    """
    clean_text = text.strip()
    if not clean_text:
        return {"text": "", "matched_terms": [], "span": {"start": 0, "end": 0}}

    terms = _query_terms(query)
    lowered = clean_text.lower()
    text_len = len(clean_text)

    best_start = 0
    best_score = 0
    if terms:
        # Start each candidate window a third of a window before a term
        # occurrence so the match is shown with some leading context.
        lead = _EVIDENCE_CHARS // 3
        candidate_starts: set[int] = {0}
        for term in terms:
            for match in re.finditer(re.escape(term), lowered):
                candidate_starts.add(max(0, match.start() - lead))

        # Visit candidates in ascending order: combined with the strict ``>``
        # below, equal-scoring windows resolve to the earliest one instead of
        # depending on set iteration order.
        for start in sorted(candidate_starts):
            end = min(text_len, start + _EVIDENCE_CHARS)
            window = lowered[start:end]
            score = sum(window.count(term) for term in terms)
            if score > best_score:
                best_score = score
                best_start = start

    start = best_start
    end = min(text_len, start + _EVIDENCE_CHARS)
    snippet = clean_text[start:end]
    if start > 0:
        snippet = "..." + snippet.lstrip()
    if end < text_len:
        snippet = snippet.rstrip() + "..."

    window = lowered[start:end]
    matched = [term for term in terms if term in window]
    return {
        "text": snippet,
        "matched_terms": matched,
        "span": {"start": start, "end": end},
    }
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _query_terms(query: str) -> list[str]:
    """Split ``query`` into the lowercase terms used for evidence scoring.

    Two kinds of terms are collected, in this order:

    1. ASCII word tokens (letters, digits, ``_`` and inner ``-``) that are at
       least three characters long and not in a small stop-word list.
    2. For every run of two or more CJK ideographs: the run itself, followed
       by its n-grams from (at most) six characters down to two.

    Duplicates are dropped, first occurrence wins, and collection stops
    growing once ``_MAX_EVIDENCE_TERMS`` terms have been gathered.
    """
    lowered = query.lower()
    stop_words = frozenset(
        (
            "a",
            "an",
            "and",
            "are",
            "for",
            "how",
            "is",
            "of",
            "or",
            "the",
            "to",
            "what",
            "where",
        )
    )

    # A dict keeps insertion order and gives O(1) duplicate detection.
    collected: dict[str, None] = {}

    def remember(term: str) -> None:
        if len(collected) < _MAX_EVIDENCE_TERMS:
            collected.setdefault(term, None)

    for token in re.findall(r"[A-Za-z0-9_][A-Za-z0-9_-]*", lowered):
        if len(token) >= 3 and token not in stop_words:
            remember(token)

    for run in re.findall(r"[\u3400-\u9fff]+", lowered):
        run_len = len(run)
        if run_len < 2:
            continue
        remember(run)
        # CJK queries often have no whitespace; bounded n-grams give the
        # evidence window useful overlap without changing vector ranking.
        for width in range(min(6, run_len), 1, -1):
            for offset in range(run_len - width + 1):
                remember(run[offset : offset + width])

    return list(collected)
|
cairn/xref/__init__.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Cross-reference extraction — directed edges between sections.
|
|
2
|
+
|
|
3
|
+
Three kinds of edges (per ARCHITECTURE.md §2.4):
|
|
4
|
+
|
|
5
|
+
- ``link``: explicit Markdown anchor links (``[text](#anchor)``)
|
|
6
|
+
- ``textual``: numeric section references (``"§ 2.5"``, ``"Section 3"``)
|
|
7
|
+
- ``entity``: sections that share a high-signal defined entity
|
|
8
|
+
|
|
9
|
+
The :class:`XRefExtractor` Protocol is the seam. The default
|
|
10
|
+
:class:`HeuristicXRefExtractor` produces all three kinds without any model
|
|
11
|
+
dependency, and accepts an optional Entities reader for entity-mediated
|
|
12
|
+
edges. LLM-verified textual references are a v0.2.3+ refinement.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from cairn.xref.base import ExtractionEdge, XRefExtractor
|
|
16
|
+
from cairn.xref.fake import FakeXRefExtractor
|
|
17
|
+
from cairn.xref.heuristic import HeuristicXRefExtractor
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"ExtractionEdge",
|
|
21
|
+
"FakeXRefExtractor",
|
|
22
|
+
"HeuristicXRefExtractor",
|
|
23
|
+
"XRefExtractor",
|
|
24
|
+
]
|
cairn/xref/base.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""XRefExtractor Protocol + intermediate ExtractionEdge type."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Awaitable, Iterable
|
|
6
|
+
from typing import Protocol, runtime_checkable
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
9
|
+
|
|
10
|
+
from cairn.core.types import Document, Span, XRefKind
|
|
11
|
+
from cairn.index.entities import Entities
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ExtractionEdge(BaseModel):
    """One observed cross-reference, before deduplication.

    Spans use the same convention as :class:`cairn.entity.base.ExtractionHit`:
    offsets within the *source section's* ``raw_text``. Self-loops (``src ==
    dst``) are dropped by the :class:`cairn.index.xrefs.XRefBuilder`; do not
    emit them.
    """

    # Instances are immutable and reject fields that are not declared below.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Id of the section in which the reference was observed.
    src: str
    # Id of the section the reference points at.
    dst: str
    # Category of the edge; see ``XRefKind``.
    kind: XRefKind
    # Extractor's confidence in the edge, validated to lie in [0.0, 1.0].
    confidence: float = Field(ge=0.0, le=1.0)
    # Location of the reference inside the source section's ``raw_text``.
    span: Span
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@runtime_checkable
class XRefExtractor(Protocol):
    """Pluggable cross-reference extractor.

    The default :class:`cairn.xref.heuristic.HeuristicXRefExtractor` accepts
    an optional ``Entities`` reader as a constructor argument; the Protocol
    itself only mandates the ``extract`` shape.

    The class is ``@runtime_checkable``, so ``isinstance`` checks against it
    only confirm that the members exist, not that their signatures match.
    """

    # Identifier of the extractor implementation.
    name: str

    def extract(
        self,
        document: Document,
        *,
        entities: Entities | None = None,
    ) -> Awaitable[Iterable[ExtractionEdge]]:
        """Return an iterable of cross-reference edges across ``document``.

        The result is an awaitable, so ``async def`` implementations satisfy
        this signature. ``entities`` is optional.
        """
        ...
|
cairn/xref/fake.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Deterministic cross-reference extractor for tests.
|
|
2
|
+
|
|
3
|
+
Emits one edge between each consecutive pair of sections (in document
|
|
4
|
+
order). Kind is always ``link``. Useful for tests that exercise builder /
|
|
5
|
+
reader / tool behavior without coupling to extraction heuristics.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections.abc import Iterable
|
|
11
|
+
|
|
12
|
+
from cairn.core.types import Document, Span
|
|
13
|
+
from cairn.index.entities import Entities
|
|
14
|
+
from cairn.xref.base import ExtractionEdge
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class FakeXRefExtractor:
    """Test double that chains every section to the one that follows it."""

    name = "fake:linear"

    async def extract(
        self,
        document: Document,
        *,
        entities: Entities | None = None,
    ) -> Iterable[ExtractionEdge]:
        """Return one ``link`` edge per adjacent section pair.

        Every edge has confidence ``1.0`` and an empty span at offset 0.
        ``entities`` is accepted for interface compatibility and ignored.
        A document with fewer than two sections yields an empty list.
        """
        sections = document.sections
        # Pair each section with its successor; zip stops at the shorter
        # slice, so the last section never becomes a source.
        return [
            ExtractionEdge(
                src=current.id,
                dst=following.id,
                kind="link",
                confidence=1.0,
                span=Span(start=0, end=0),
            )
            for current, following in zip(sections, sections[1:])
        ]
|
cairn/xref/heuristic.py
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
"""HeuristicXRefExtractor — regex + entity-graph derivation, no model needed.
|
|
2
|
+
|
|
3
|
+
Combines three sources into one extractor (per ARCHITECTURE.md §2.4):
|
|
4
|
+
|
|
5
|
+
- **link**: explicit anchor links. ``[text](#anchor)`` resolves to a section
|
|
6
|
+
whose ``id`` ends in ``anchor``. Confidence 0.95 when unique, 0.75 when
|
|
7
|
+
multiple candidates exist.
|
|
8
|
+
- **textual**: numeric references like ``§ 2.5`` or ``Section 3.1``. Mapped
|
|
9
|
+
to a section whose ``title`` starts with the same numeric prefix.
|
|
10
|
+
Confidence 0.7.
|
|
11
|
+
- **entity**: pairs of sections that share a high-signal *defined* entity.
|
|
12
|
+
Confidence scales with the number of shared entities, capped at 0.8.
|
|
13
|
+
Computed only when an :class:`~cairn.index.entities.Entities` reader is
|
|
14
|
+
supplied.
|
|
15
|
+
|
|
16
|
+
Self-loops are filtered. Duplicate ``(src, dst, kind)`` triples are
|
|
17
|
+
deduplicated by the :class:`~cairn.index.xrefs.XRefBuilder`; this layer just
|
|
18
|
+
emits ``ExtractionEdge`` records.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import re
|
|
24
|
+
from collections.abc import Iterable, Iterator
|
|
25
|
+
from typing import Final
|
|
26
|
+
|
|
27
|
+
from cairn.core.types import Document, SectionNode, Span
|
|
28
|
+
from cairn.index.entities import Entities
|
|
29
|
+
from cairn.xref.base import ExtractionEdge
|
|
30
|
+
|
|
31
|
+
# Markdown anchor link: [text](#anchor) — anchor uses kebab/slug form.
# Group 1 captures the anchor after ``#`` (anything up to whitespace or ``)``).
_ANCHOR_LINK = re.compile(r"\[[^\]]+\]\(#([^)\s]+)\)")

# Section reference: "§ 2.5", "Section 3.1", "Chapter 4.2". Captures the
# numeric prefix only. Matching is case-insensitive.
_SECTION_REF = re.compile(
    r"(?:§\s*|(?:Section|Chapter|§)\s+)(\d+(?:\.\d+)*)\b",
    re.IGNORECASE,
)


# Confidence scores per ARCHITECTURE.md §2.4.
_LINK_CONF_UNIQUE: Final = 0.95  # anchor maps to exactly one section
_LINK_CONF_AMBIGUOUS: Final = 0.75  # anchor maps to several sections
_TEXTUAL_CONF: Final = 0.7  # numeric "§ N" / "Section N" reference
# Entity edges score BASE + STEP * (number of shared entities), limited to CAP.
_ENTITY_CONF_BASE: Final = 0.3
_ENTITY_CONF_STEP: Final = 0.2
_ENTITY_CONF_CAP: Final = 0.8
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class HeuristicXRefExtractor:
    """Model-free extractor combining link, textual and entity edges."""

    name = "heuristic:xref-v1"

    async def extract(
        self,
        document: Document,
        *,
        entities: Entities | None = None,
    ) -> Iterable[ExtractionEdge]:
        """Collect every cross-reference edge found in ``document``.

        For each section, in document order, anchor-link edges are emitted
        first and numeric-reference edges second. Entity-mediated edges are
        appended at the end, and only when ``entities`` is supplied.

        Returns:
            A list of ``ExtractionEdge`` records; empty when the document has
            no sections.
        """
        sections = document.sections
        if not sections:
            return []

        # Both lookups are built once and shared by the per-section scans.
        anchor_lookup = _build_anchor_index(sections)
        prefix_lookup = _build_prefix_index(sections)

        collected: list[ExtractionEdge] = []
        for section in sections:
            collected += _scan_links(section, anchor_lookup)
            collected += _scan_textual(section, prefix_lookup)

        if entities is None:
            return collected

        collected += _entity_mediated(sections, entities)
        return collected
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# ---------------------------------------------------------------------------
|
|
82
|
+
# Link extraction
|
|
83
|
+
# ---------------------------------------------------------------------------
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _build_anchor_index(sections: tuple[SectionNode, ...]) -> dict[str, list[str]]:
    """Map heading-anchor → list of section_ids whose last slug matches it.

    The key is the final ``/``-separated segment of ``section.id``,
    lower-cased. Lookups lower-case the anchor taken from the link, so the
    keys must be normalized the same way; otherwise a section whose slug
    contains an uppercase character could never be resolved.

    Args:
        sections: Sections to index, in document order.

    Returns:
        A dict from lower-cased slug to the ids of all sections ending in that
        slug, each list in document order. Empty when ``sections`` is empty.
    """
    index: dict[str, list[str]] = {}
    for section in sections:
        slug = section.id.rsplit("/", 1)[-1].lower()
        index.setdefault(slug, []).append(section.id)
    return index
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _scan_links(
    section: SectionNode,
    anchor_to_ids: dict[str, list[str]],
) -> Iterator[ExtractionEdge]:
    """Yield ``link`` edges for the Markdown anchor links in ``section``.

    Each ``[text](#anchor)`` occurrence is lower-cased and looked up in
    ``anchor_to_ids``. Unknown anchors are skipped. A single candidate gets
    the "unique" confidence and several candidates the "ambiguous" one; the
    candidate count is taken before the section itself is excluded as a
    target. The span covers the anchor text only.
    """
    for link in _ANCHOR_LINK.finditer(section.raw_text):
        targets = anchor_to_ids.get(link.group(1).lower(), ())
        if not targets:
            continue
        if len(targets) == 1:
            confidence = _LINK_CONF_UNIQUE
        else:
            confidence = _LINK_CONF_AMBIGUOUS
        for target in targets:
            # A section linking to its own anchor is not a cross-reference.
            if target != section.id:
                yield ExtractionEdge(
                    src=section.id,
                    dst=target,
                    kind="link",
                    confidence=confidence,
                    span=Span(start=link.start(1), end=link.end(1)),
                )
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# ---------------------------------------------------------------------------
|
|
119
|
+
# Textual extraction
|
|
120
|
+
# ---------------------------------------------------------------------------
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _build_prefix_index(sections: tuple[SectionNode, ...]) -> dict[str, str]:
    """Map a title's leading section number (``"2.5"``) to a section id.

    Sections whose title has no numeric prefix are left out. When two
    sections share a prefix, the one that comes first in ``sections`` keeps
    the entry.
    """
    by_prefix: dict[str, str] = {}
    for node in sections:
        number = _numeric_prefix(node.title)
        if number:
            # setdefault leaves an existing entry alone: first-seen wins.
            by_prefix.setdefault(number, node.id)
    return by_prefix
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
_TITLE_PREFIX = re.compile(r"^\s*(\d+(?:\.\d+)*)\b")
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _numeric_prefix(title: str) -> str | None:
|
|
141
|
+
m = _TITLE_PREFIX.match(title)
|
|
142
|
+
return m.group(1) if m else None
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _scan_textual(
    section: SectionNode,
    prefix_to_id: dict[str, str],
) -> Iterator[ExtractionEdge]:
    """Yield ``textual`` edges for numeric references found in ``section``.

    Every match of the section-reference pattern ("§ 2.5", "Section 3.1",
    ...) is looked up by its number in ``prefix_to_id``. References to
    unknown numbers and references that resolve to ``section`` itself are
    skipped. The span covers the number only.
    """
    for reference in _SECTION_REF.finditer(section.raw_text):
        target = prefix_to_id.get(reference.group(1))
        if target is not None and target != section.id:
            yield ExtractionEdge(
                src=section.id,
                dst=target,
                kind="textual",
                confidence=_TEXTUAL_CONF,
                span=Span(start=reference.start(1), end=reference.end(1)),
            )
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
# ---------------------------------------------------------------------------
|
|
164
|
+
# Entity-mediated extraction
|
|
165
|
+
# ---------------------------------------------------------------------------
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _entity_mediated(
    sections: tuple[SectionNode, ...],
    entities: Entities,
) -> Iterator[ExtractionEdge]:
    """Yield ``entity`` edges between sections sharing defined entities.

    Each "defined" entity mentioned in two or more sections adds one to the
    shared count of every pair of those sections, in both directions.
    Confidence grows with that count: 1 → 0.5, 2 → 0.7, 3+ → 0.8 (cap).

    The span attached to an edge is the first mention, inside the edge's
    source section, of the first entity that linked the pair.

    Args:
        sections: Not read by this function.
        entities: Reader queried for the entities of kind ``"defined"``.
    """
    pair_counts: dict[tuple[str, str], int] = {}
    pair_spans: dict[tuple[str, str], Span] = {}

    for entity in entities.by_kind("defined"):
        # First mention span per section, in mention order.
        first_span_in: dict[str, Span] = {}
        for mention in entity.mentions:
            first_span_in.setdefault(mention.section_id, mention.span)
        if len(first_span_in) < 2:
            continue

        section_ids = sorted(first_span_in)
        for position, earlier in enumerate(section_ids):
            for later in section_ids[position + 1 :]:
                # Record the pair in both directions, forward one first.
                for source, target in ((earlier, later), (later, earlier)):
                    key = (source, target)
                    pair_counts[key] = pair_counts.get(key, 0) + 1
                    pair_spans.setdefault(key, first_span_in[source])

    for (src, dst), count in pair_counts.items():
        if src == dst:
            # Defensive only: pairs are built from distinct section ids.
            continue
        confidence = min(
            _ENTITY_CONF_CAP,
            _ENTITY_CONF_BASE + _ENTITY_CONF_STEP * count,
        )
        yield ExtractionEdge(
            src=src,
            dst=dst,
            kind="entity",
            confidence=confidence,
            span=pair_spans.get((src, dst), Span(start=0, end=0)),
        )
|