trovex 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- trovex/__init__.py +3 -0
- trovex/backup.py +58 -0
- trovex/boot.py +51 -0
- trovex/cache.py +75 -0
- trovex/capture.py +105 -0
- trovex/chunking.py +109 -0
- trovex/cli.py +675 -0
- trovex/config.py +99 -0
- trovex/db.py +270 -0
- trovex/embedder.py +105 -0
- trovex/indexer.py +205 -0
- trovex/insights.py +305 -0
- trovex/markdown.py +85 -0
- trovex/mcp_app.py +318 -0
- trovex/measure.py +121 -0
- trovex/rerank.py +147 -0
- trovex/savings.py +123 -0
- trovex/search.py +173 -0
- trovex/server.py +946 -0
- trovex/state.py +45 -0
- trovex/status.py +221 -0
- trovex/store.py +616 -0
- trovex/templates/_base.html +683 -0
- trovex/templates/_docs_table.html +87 -0
- trovex/templates/_partials.html +43 -0
- trovex/templates/_results.html +113 -0
- trovex/templates/doc.html +470 -0
- trovex/templates/docs.html +203 -0
- trovex/templates/home.html +699 -0
- trovex/templates/insights.html +1066 -0
- trovex/templates/install.html +1015 -0
- trovex/templates/savings.html +591 -0
- trovex/templates/search.html +473 -0
- trovex/templates/settings.html +383 -0
- trovex/templates/store.html +344 -0
- trovex/templates/usage.html +519 -0
- trovex/usage.py +113 -0
- trovex-0.11.0.dist-info/METADATA +159 -0
- trovex-0.11.0.dist-info/RECORD +42 -0
- trovex-0.11.0.dist-info/WHEEL +4 -0
- trovex-0.11.0.dist-info/entry_points.txt +2 -0
- trovex-0.11.0.dist-info/licenses/LICENSE +661 -0
trovex/__init__.py
ADDED
trovex/backup.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Online SQLite backups for the trovex store (now the sole corpus).
|
|
2
|
+
|
|
3
|
+
Uses the sqlite backup API for a consistent snapshot even while the server is
|
|
4
|
+
writing, after a WAL checkpoint. Keeps the last N, prunes the rest.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import sqlite3
|
|
10
|
+
import time
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
KEEP = 7 # the db is ~340MB (chunk vectors) — 7 daily = ~2.4GB
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def backup_dir(data_dir: Path) -> Path:
    """Return the ``backups`` sub-directory of *data_dir*.

    Pure path arithmetic: the directory is not created or checked here.
    """
    return data_dir.joinpath("backups")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def make_backup(db_path: Path, data_dir: Path) -> Path:
    """Write a timestamped snapshot of the database and prune old snapshots.

    The WAL is checkpointed first, then the sqlite backup API copies the
    database into ``<data_dir>/backups/trovex-YYYYmmdd-HHMMSS.db``. The copy
    is staged under a ``.partial`` name and only renamed to its final name
    once the backup has completed, so a failed or interrupted run never
    leaves a truncated file that matches the ``trovex-*.db`` glob used by
    ``list_backups`` and ``prune`` (where it could push a good snapshot out
    of the keep window).

    Args:
        db_path: Path of the live SQLite database.
        data_dir: Data directory whose ``backups`` folder receives the copy.

    Returns:
        Path of the finished snapshot.

    Raises:
        sqlite3.Error: If the checkpoint or the backup fails.
        OSError: If the backup directory or file cannot be written.
    """
    bdir = backup_dir(data_dir)
    bdir.mkdir(parents=True, exist_ok=True)

    # Flush WAL into the main db so the snapshot is complete.
    flush = sqlite3.connect(str(db_path))
    try:
        flush.execute("PRAGMA wal_checkpoint(TRUNCATE)")
    finally:
        flush.close()

    dest = bdir / f"trovex-{time.strftime('%Y%m%d-%H%M%S')}.db"
    # The staging name deliberately does not end in ".db".
    partial = dest.with_name(dest.name + ".partial")
    partial.unlink(missing_ok=True)  # leftover from an earlier crashed run

    completed = False
    src = sqlite3.connect(str(db_path))
    try:
        # Opened inside the try so `src` is closed even if this connect fails.
        dst = sqlite3.connect(str(partial))
        try:
            src.backup(dst)  # consistent online copy
            completed = True
        finally:
            dst.close()
    finally:
        src.close()
        if not completed:
            partial.unlink(missing_ok=True)

    partial.replace(dest)
    prune(data_dir)
    return dest
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def list_backups(data_dir: Path) -> list[dict]:
    """Describe the snapshots in the backup directory, newest name first.

    Returns one dict per ``trovex-*.db`` file with its ``name``,
    ``size_bytes`` and ``mtime``; an empty list when the backup directory
    does not exist. Ordering is by file name, descending.
    """
    folder = backup_dir(data_dir)
    if not folder.exists():
        return []

    def describe(snapshot: Path) -> dict:
        info = snapshot.stat()
        return {
            "name": snapshot.name,
            "size_bytes": info.st_size,
            "mtime": info.st_mtime,
        }

    newest_first = sorted(folder.glob("trovex-*.db"), reverse=True)
    return [describe(snapshot) for snapshot in newest_first]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def prune(data_dir: Path, keep: int = KEEP) -> int:
    """Delete all but the *keep* newest snapshots and return how many were removed.

    "Newest" is decided by file name in descending order; the names embed a
    ``YYYYmmdd-HHMMSS`` stamp, so name order follows creation order.
    """
    newest_first = sorted(backup_dir(data_dir).glob("trovex-*.db"), reverse=True)
    stale = newest_first[keep:]
    for old in stale:
        old.unlink()
    return len(stale)
|
trovex/boot.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Active-memory boot recall (RFC 330e7d43, step 2).
|
|
2
|
+
|
|
3
|
+
Serves an agent its OWN recent records as a token-light pointer pack, scoped
|
|
4
|
+
server-side: owner/<agent> + kind=record. Scope first, score second — global
|
|
5
|
+
vector + an absolute threshold cross-injects; owner-scope yields precision≈1
|
|
6
|
+
by construction. The pack is ~80 tokens (titles + ids, not bodies); the agent
|
|
7
|
+
pulls a full record on demand via trovex_read(doc_id).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from .search import Searcher
|
|
13
|
+
|
|
14
|
+
BOOT_QUERY = "current state resume open work in flight next steps gotchas"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def boot_pointers(
    searcher: Searcher,
    agent: str,
    *,
    k: int = 5,
    floor: float = 0.62,
    q: str | None = None,
) -> dict:
    """Build the pointer pack of an agent's own records.

    Searches with ``kind="record"``, ``source_ids=["trovex"]`` and the tag
    ``owner/<agent>``, keeps hits whose score is at least *floor*, and
    renders them as a short markdown list of titles and ids.

    Args:
        searcher: Object exposing ``search(query, limit=, source_ids=, kind=, tags=)``.
        agent: Agent name used for the owner tag and the rendered header.
        k: Maximum number of hits requested from the searcher.
        floor: Minimum score a hit needs to be included.
        q: Optional query; the module's default boot query is used when empty.

    Returns:
        Dict with ``agent``, ``pointers`` (id/title/rounded score),
        ``render`` (markdown text) and ``tokens_est`` (``len(render) // 4``,
        at least 1). When nothing clears the floor, ``pointers`` is empty,
        ``render`` is ``""`` and ``tokens_est`` is 0.
    """
    query = q if q else BOOT_QUERY
    hits = searcher.search(
        query,
        limit=k,
        source_ids=["trovex"],
        kind="record",
        tags=[f"owner/{agent}"],
    )

    pointers = []
    for hit in hits:
        if hit.score >= floor:
            pointers.append(
                {"id": hit.path, "title": hit.title, "score": round(hit.score, 3)}
            )

    if not pointers:
        return {"agent": agent, "pointers": [], "render": "", "tokens_est": 0}

    header = f"## Resume — {agent} (trovex active memory)"
    bullets = [f"- {entry['title']} (trovex:{entry['id']})" for entry in pointers]
    footer = "Pull any with trovex_read(doc_id) for the full record."
    render = "\n".join([header, *bullets, footer])

    return {
        "agent": agent,
        "pointers": pointers,
        "render": render,
        "tokens_est": max(1, len(render) // 4),
    }
|
trovex/cache.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Exact-match query cache for the trovex() tool.
|
|
2
|
+
|
|
3
|
+
A repeat of the same query against an unchanged corpus skips the candidate
|
|
4
|
+
search + the LLM reranker (the cost driver). Keyed on (normalized query,
|
|
5
|
+
summary) + a corpus version derived from the docs table, so any trovex_write /
|
|
6
|
+
delete invalidates stale entries automatically — no write-path hook needed.
|
|
7
|
+
|
|
8
|
+
Exact match only: zero false-hit risk. Measured ~24% hit-rate on real traffic;
|
|
9
|
+
a semantic layer (cosine ≥ τ over cached query embeddings) can sit on top later
|
|
10
|
+
without changing this contract.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
import sqlite3
|
|
17
|
+
import time
|
|
18
|
+
|
|
19
|
+
_WS = re.compile(r"\s+")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _norm(q: str) -> str:
    """Normalize a query: trim, lowercase, collapse whitespace runs to one space."""
    lowered = q.strip().lower()
    return _WS.sub(" ", lowered)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _key(q: str, summary: bool) -> str:
    """Build the cache key: the normalized query, ``|``, then 1 or 0 for *summary*."""
    flag = int(summary)
    return f"{_norm(q)}|{flag}"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _ensure(db: sqlite3.Connection) -> None:
|
|
31
|
+
db.execute(
|
|
32
|
+
"""CREATE TABLE IF NOT EXISTS query_cache (
|
|
33
|
+
key TEXT PRIMARY KEY,
|
|
34
|
+
corpus_version TEXT NOT NULL,
|
|
35
|
+
output TEXT NOT NULL,
|
|
36
|
+
n_results INTEGER NOT NULL,
|
|
37
|
+
whr INTEGER NOT NULL,
|
|
38
|
+
top_tokens INTEGER NOT NULL,
|
|
39
|
+
resp_tokens INTEGER NOT NULL,
|
|
40
|
+
created_at REAL NOT NULL
|
|
41
|
+
)"""
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def corpus_version(db: sqlite3.Connection) -> str:
    """Return a cheap corpus fingerprint of the form ``"<row count>:<max mtime>"``.

    Reads the ``docs`` table; the max mtime falls back to 0 when the table
    is empty. Columns are read by name, so *db* must use a row factory that
    supports that (e.g. ``sqlite3.Row``).
    """
    query = "SELECT COUNT(*) AS c, COALESCE(MAX(mtime), 0) AS m FROM docs"
    stats = db.execute(query).fetchone()
    count, newest = stats["c"], stats["m"]
    return f"{count}:{newest}"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def get(db: sqlite3.Connection, q: str, summary: bool, version: str) -> dict | None:
    """Look up a cached result for *q* / *summary* at the given corpus *version*.

    Returns the cached row as a dict (``output``, ``n_results``, ``whr``,
    ``top_tokens``, ``resp_tokens``), or ``None`` when there is no entry for
    this key at this version.
    """
    _ensure(db)
    lookup = """SELECT output, n_results, whr, top_tokens, resp_tokens
        FROM query_cache WHERE key = ? AND corpus_version = ?"""
    hit = db.execute(lookup, (_key(q, summary), version)).fetchone()
    if not hit:
        return None
    return dict(hit)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def put(db: sqlite3.Connection, q: str, summary: bool, version: str, output: str,
        n_results: int, whr: int, top_tokens: int, resp_tokens: int) -> None:
    """Store (or overwrite) the cached result for *q* / *summary* and commit.

    There is one row per key: an existing entry is updated in place with the
    new corpus version, payload, counters and a fresh ``created_at`` stamp.
    """
    _ensure(db)
    upsert = """INSERT INTO query_cache
        (key, corpus_version, output, n_results, whr, top_tokens, resp_tokens, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(key) DO UPDATE SET
        corpus_version = excluded.corpus_version, output = excluded.output,
        n_results = excluded.n_results, whr = excluded.whr,
        top_tokens = excluded.top_tokens, resp_tokens = excluded.resp_tokens,
        created_at = excluded.created_at"""
    values = (
        _key(q, summary),
        version,
        output,
        n_results,
        whr,
        top_tokens,
        resp_tokens,
        time.time(),
    )
    db.execute(upsert, values)
    db.commit()
|
trovex/capture.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""Active-memory capture (RFC 330e7d43, steps 3-4).
|
|
2
|
+
|
|
3
|
+
Writes an agent's current-state record so the next /api/boot recalls FRESH state.
|
|
4
|
+
Two paths, in increasing risk:
|
|
5
|
+
|
|
6
|
+
- **free-summary** (step 3, frequent): PostCompact already distilled the
|
|
7
|
+
conversation — store that summary verbatim, NO LLM.
|
|
8
|
+
- **transcript distil** (step 4, fallback for sessions with no compaction): an
|
|
9
|
+
LLM compresses the transcript. BYOK + best-effort (no key / error → no
|
|
10
|
+
capture, never raises). MERGES with the agent's prior state so a truncated
|
|
11
|
+
window doesn't lose earlier work (RFC residual bet #2: 24k → merge).
|
|
12
|
+
|
|
13
|
+
Both upsert the deterministic doc ``owner-<agent>-current-state`` (owner/<agent>
|
|
14
|
+
+ kind=record + type/current-state) — stable id ⇒ in-place overwrite, one
|
|
15
|
+
canonical record, no dup pile.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import logging
|
|
21
|
+
import os
|
|
22
|
+
|
|
23
|
+
from openai import OpenAI
|
|
24
|
+
|
|
25
|
+
from .store import SqliteStore
|
|
26
|
+
from .usage import current_openai_key, current_rerank_model
|
|
27
|
+
|
|
28
|
+
log = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
DISTIL_MODEL = os.environ.get("TROVEX_DISTIL_MODEL", "gpt-5.4-mini")
|
|
31
|
+
DISTIL_TIMEOUT_SEC = 20.0
|
|
32
|
+
MAX_TRANSCRIPT_CHARS = 24000
|
|
33
|
+
|
|
34
|
+
DISTIL_SYSTEM = (
|
|
35
|
+
"You compress one coding-agent session into a durable current-state record. "
|
|
36
|
+
"You are given the agent's PRIOR state (may be empty) and the RECENT session "
|
|
37
|
+
"transcript. Produce the UPDATED state as markdown with ONLY these sections, "
|
|
38
|
+
"omitting any that are empty:\n"
|
|
39
|
+
"### Done this session\n### In flight (verify/continue)\n### Gotchas (don't repeat)\n"
|
|
40
|
+
"### Next\n### Pointers (trovex ids / files)\n"
|
|
41
|
+
"Merge: carry forward still-relevant prior items, add new ones, drop done/stale. "
|
|
42
|
+
"Terse, facts only, no narration. If nothing durable, output exactly NO-SIGNAL."
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def distil_summary(transcript: str, *, prior: str = "") -> str | None:
    """LLM-distil a transcript into a current-state summary merged with *prior*.

    Best-effort: returns ``None`` (so the caller can fall back) when no API
    key is set, the stripped transcript is shorter than 40 characters, the
    request or response handling fails, or the model answers ``NO-SIGNAL``
    or fewer than 40 characters. Never raises into the caller.

    Args:
        transcript: Session transcript; only the last
            ``MAX_TRANSCRIPT_CHARS`` characters are sent.
        prior: The agent's previous state, sent along so the model can merge.

    Returns:
        The distilled markdown, or ``None`` when nothing usable was produced.
    """
    key = current_openai_key.get()
    transcript = (transcript or "").strip()
    if not key or len(transcript) < 40:
        return None

    model = current_rerank_model.get() or DISTIL_MODEL
    window = transcript[-MAX_TRANSCRIPT_CHARS:]
    user = f"PRIOR STATE:\n{prior or '(none)'}\n\nRECENT TRANSCRIPT:\n{window}"
    params: dict = {
        "model": model,
        "messages": [
            {"role": "system", "content": DISTIL_SYSTEM},
            {"role": "user", "content": user},
        ],
    }
    # NOTE(review): presumably these model families reject `max_tokens` /
    # `temperature` — confirm against the OpenAI API docs.
    if model.startswith(("gpt-5", "o1", "o3", "o4")):
        params["max_completion_tokens"] = 2048
    else:
        params["max_tokens"] = 1024
        params["temperature"] = 0

    try:
        client = OpenAI(api_key=key, timeout=DISTIL_TIMEOUT_SEC)
        resp = client.chat.completions.create(**params)
        # Parsed inside the try: an empty `choices` list or an unexpected
        # response shape must not escape and break the "never raises" contract.
        md = (resp.choices[0].message.content or "").strip()
    except Exception:  # best-effort, never block the agent
        # Keep the traceback so failures are diagnosable from the log.
        log.warning("distil failed (model=%s)", model, exc_info=True)
        return None

    if md == "NO-SIGNAL" or len(md) < 40:
        return None
    return md
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def capture_state(
    store: SqliteStore,
    agent: str,
    summary: str = "",
    *,
    transcript: str = "",
    reason: str = "postcompact",
) -> dict:
    """Upsert the agent's current-state record from a summary or a transcript.

    A non-empty *summary* is stored verbatim. Otherwise, when a *transcript*
    is given, it is distilled together with the content of the existing
    record (if any). Summaries shorter than 20 characters are not stored.

    Args:
        store: Store exposing ``get(doc_id)`` and
            ``put(content, kind=, ext_id=, tags=)``.
        agent: Agent name; determines the document id and owner tag.
        summary: Ready-made summary text.
        transcript: Raw transcript, used only when *summary* is empty.
        reason: Capture trigger, written into the heading and a tag.

    Returns:
        ``{"captured": True, "doc_id": ..., "tokens": ...}`` on success, or
        ``{"captured": False, "reason": "no durable signal"}`` otherwise.
    """
    doc_id = f"owner-{agent}-current-state"
    text = (summary or "").strip()

    # Free path takes the summary verbatim; transcript path distils (merging
    # the existing record forward so truncation doesn't lose earlier state).
    if transcript and not text:
        previous = store.get(doc_id)
        prior = previous.content if previous else ""
        text = distil_summary(transcript, prior=prior) or ""

    if len(text) < 20:
        return {"captured": False, "reason": "no durable signal"}

    content = f"# {agent} — current state ({reason})\n\n{text}"
    tags = [f"owner/{agent}", "type/current-state", f"capture/{reason}"]
    store.put(content, kind="record", ext_id=doc_id, tags=tags)
    return {"captured": True, "doc_id": doc_id, "tokens": max(1, len(content) // 4)}
|
trovex/chunking.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Structure-aware markdown chunking for chunk-level retrieval.
|
|
2
|
+
|
|
3
|
+
The literature consensus (arXiv:2603.24556, 2606.00881) is that structure-aware
|
|
4
|
+
chunking beats semantic/sliding-window at lower cost — and markdown gives us the
|
|
5
|
+
structure for free. We split on headings, keep a heading breadcrumb per chunk,
|
|
6
|
+
and resplit oversized sections by paragraph windows.
|
|
7
|
+
|
|
8
|
+
Each chunk's embed text is *prefix-fused* with its breadcrumb ("title > h1 > h2")
|
|
9
|
+
— the single biggest retrieval gain per arXiv:2510.24402, kept small ("seasoning",
|
|
10
|
+
not a metadata dump) per the same line of work.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
|
|
18
|
+
FRONTMATTER_RE = re.compile(r"^---\s*\n.*?\n---\s*\n", re.DOTALL)
|
|
19
|
+
HEADING_RE = re.compile(r"^(#{1,6})\s+(.*?)\s*#*$")
|
|
20
|
+
FENCE_RE = re.compile(r"^\s*```")
|
|
21
|
+
|
|
22
|
+
DEFAULT_MAX_TOKENS = 450
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class Chunk:
    """One retrievable piece of a markdown document."""

    # Position of the chunk within its document.
    index: int
    # Heading texts leading to this chunk, outermost first.
    heading_path: list[str] = field(default_factory=list)
    # Chunk body.
    text: str = ""
    # Rough token count of `text`.
    tokens_est: int = 0

    def breadcrumb(self, title: str = "") -> str:
        """Join the optional *title* and the heading path with ``" > "``, skipping empty parts."""
        trail = self.heading_path
        if title:
            trail = [title] + trail
        return " > ".join(filter(None, trail))

    def embed_text(self, title: str = "") -> str:
        """Prefix-fusion: breadcrumb, blank line, body; just the body if there is no breadcrumb."""
        prefix = self.breadcrumb(title)
        if not prefix:
            return self.text
        return f"{prefix}\n\n{self.text}"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _est_tokens(text: str) -> int:
|
|
43
|
+
return len(text) // 4
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _split_to_size(text: str, max_tokens: int) -> list[str]:
    """Split *text* into paragraph windows of roughly at most *max_tokens* each.

    Text already within budget is returned unchanged as a single item.
    Otherwise paragraphs (separated by blank lines) are packed greedily; a
    single paragraph that exceeds the budget on its own is emitted as-is
    rather than cut. Pieces are stripped and empty ones dropped.
    """
    if _est_tokens(text) <= max_tokens:
        return [text]

    windows: list[str] = []
    pending: list[str] = []
    pending_tokens = 0

    def emit() -> None:
        nonlocal pending, pending_tokens
        windows.append("\n\n".join(pending))
        pending, pending_tokens = [], 0

    for para in re.split(r"\n\s*\n", text):
        cost = _est_tokens(para)
        # Close the current window before it would overflow.
        if pending and pending_tokens + cost > max_tokens:
            emit()
        pending.append(para)
        pending_tokens += cost
        # lone oversized paragraph
        if len(pending) == 1 and pending_tokens > max_tokens:
            emit()
    if pending:
        emit()

    stripped = (window.strip() for window in windows)
    return [window for window in stripped if window]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def chunk_markdown(content: str, max_tokens: int = DEFAULT_MAX_TOKENS) -> list[Chunk]:
    """Split markdown into structure-aware chunks with heading breadcrumbs.

    Leading front matter is removed, the text is cut at headings (lines
    inside fenced code blocks are never treated as headings), and each
    section body is re-split to size. Headings without body text produce no
    chunk of their own but still appear in the breadcrumb of their children.

    Args:
        content: Markdown source.
        max_tokens: Approximate token budget per chunk.

    Returns:
        Chunks in document order with consecutive ``index`` values starting
        at 0. Every chunk owns its own ``heading_path`` list, so changing one
        chunk's path never affects another chunk from the same section.
    """
    content = FRONTMATTER_RE.sub("", content)

    sections: list[tuple[list[str], str]] = []
    stack: list[tuple[int, str]] = []  # (level, heading text)
    body: list[str] = []
    in_fence = False

    def flush() -> None:
        # Called before the stack changes, so it still describes the heading
        # path of the body being flushed.
        text = "\n".join(body).strip()
        if text:
            sections.append(([heading for _, heading in stack], text))

    for line in content.splitlines():
        if FENCE_RE.match(line):
            in_fence = not in_fence
            body.append(line)
            continue
        m = None if in_fence else HEADING_RE.match(line)
        if not m:
            body.append(line)
            continue
        flush()
        body.clear()
        level, htext = len(m.group(1)), m.group(2).strip()
        # A new heading closes every open heading at the same or deeper level.
        while stack and stack[-1][0] >= level:
            stack.pop()
        stack.append((level, htext))
    flush()

    chunks: list[Chunk] = []
    for sec_path, text in sections:
        for piece in _split_to_size(text, max_tokens):
            chunks.append(Chunk(
                index=len(chunks),
                heading_path=list(sec_path),  # copy: pieces must not share one list
                text=piece,
                tokens_est=_est_tokens(piece),
            ))
    return chunks
|