haiku.rag 0.9.2__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- haiku/rag/app.py +50 -14
- haiku/rag/cli.py +16 -4
- haiku/rag/client.py +3 -5
- haiku/rag/reranking/mxbai.py +1 -1
- haiku/rag/research/__init__.py +10 -27
- haiku/rag/research/common.py +53 -0
- haiku/rag/research/dependencies.py +5 -3
- haiku/rag/research/graph.py +29 -0
- haiku/rag/research/models.py +70 -0
- haiku/rag/research/nodes/evaluate.py +80 -0
- haiku/rag/research/nodes/plan.py +63 -0
- haiku/rag/research/nodes/search.py +91 -0
- haiku/rag/research/nodes/synthesize.py +51 -0
- haiku/rag/research/prompts.py +97 -113
- haiku/rag/research/state.py +25 -0
- haiku/rag/store/engine.py +42 -17
- haiku/rag/store/models/chunk.py +1 -0
- haiku/rag/store/repositories/chunk.py +60 -39
- haiku/rag/store/repositories/document.py +2 -2
- haiku/rag/store/repositories/settings.py +12 -5
- haiku/rag/store/upgrades/__init__.py +60 -1
- haiku/rag/store/upgrades/v0_9_3.py +112 -0
- {haiku_rag-0.9.2.dist-info → haiku_rag-0.10.0.dist-info}/METADATA +37 -1
- haiku_rag-0.10.0.dist-info/RECORD +53 -0
- haiku/rag/research/base.py +0 -130
- haiku/rag/research/evaluation_agent.py +0 -42
- haiku/rag/research/orchestrator.py +0 -300
- haiku/rag/research/presearch_agent.py +0 -34
- haiku/rag/research/search_agent.py +0 -65
- haiku/rag/research/synthesis_agent.py +0 -40
- haiku_rag-0.9.2.dist-info/RECORD +0 -50
- {haiku_rag-0.9.2.dist-info → haiku_rag-0.10.0.dist-info}/WHEEL +0 -0
- {haiku_rag-0.9.2.dist-info → haiku_rag-0.10.0.dist-info}/entry_points.txt +0 -0
- {haiku_rag-0.9.2.dist-info → haiku_rag-0.10.0.dist-info}/licenses/LICENSE +0 -0
haiku/rag/research/nodes/synthesize.py
ADDED
@@ -0,0 +1,51 @@
+from dataclasses import dataclass
+
+from pydantic_ai import Agent
+from pydantic_graph import BaseNode, End, GraphRunContext
+
+from haiku.rag.research.common import format_context_for_prompt, get_model, log
+from haiku.rag.research.dependencies import (
+    ResearchDependencies,
+)
+from haiku.rag.research.models import ResearchReport
+from haiku.rag.research.prompts import SYNTHESIS_AGENT_PROMPT
+from haiku.rag.research.state import ResearchDeps, ResearchState
+
+
+@dataclass
+class SynthesizeNode(BaseNode[ResearchState, ResearchDeps, ResearchReport]):
+    provider: str
+    model: str
+
+    async def run(
+        self, ctx: GraphRunContext[ResearchState, ResearchDeps]
+    ) -> End[ResearchReport]:
+        state = ctx.state
+        deps = ctx.deps
+
+        log(
+            deps.console,
+            "\n[bold cyan]📝 Generating final research report...[/bold cyan]",
+        )
+
+        agent = Agent(
+            model=get_model(self.provider, self.model),
+            output_type=ResearchReport,
+            instructions=SYNTHESIS_AGENT_PROMPT,
+            retries=3,
+            deps_type=ResearchDependencies,
+        )
+
+        context_xml = format_context_for_prompt(state.context)
+        prompt = (
+            "Generate a comprehensive research report based on all gathered information.\n\n"
+            f"{context_xml}\n\n"
+            "Create a detailed report that synthesizes all findings into a coherent response."
+        )
+        agent_deps = ResearchDependencies(
+            client=deps.client, context=state.context, console=deps.console
+        )
+        result = await agent.run(prompt, deps=agent_deps)
+
+        log(deps.console, "[bold green]✅ Research complete![/bold green]")
+        return End(result.output)
haiku/rag/research/prompts.py
CHANGED
@@ -1,129 +1,113 @@
-SEARCH_AGENT_PROMPT = """You are a search and question
-1. Search the knowledge base for relevant
-2. Analyze
-3. Provide an
-Output format:
-- You must return a SearchAnswer model with fields:
-- query: the question being answered (echo the user query)
-- answer: your final answer based only on the provided context
-- context: list[str] of only the minimal set of verbatim snippet texts you
-used to justify the answer (do not include unrelated text; do not invent)
-- sources: list[str] of document_uri values corresponding to the snippets you
-actually used in the answer (one URI per context snippet, order aligned)
+PLAN_PROMPT = """You are the research orchestrator for a focused, iterative
+workflow.
+
+Responsibilities:
+1. Understand and decompose the main question
+2. Propose a minimal, high‑leverage plan
+3. Coordinate specialized agents to gather evidence
+4. Iterate based on gaps and new findings
+
+Plan requirements:
+- Produce at most 3 sub_questions that together cover the main question.
+- Each sub_question must be a standalone, self‑contained query that can run
+without extra context. Include concrete entities, scope, timeframe, and any
+qualifiers. Avoid ambiguous pronouns (it/they/this/that).
+- Prioritize the highest‑value aspects first; avoid redundancy and overlap.
+- Prefer questions that are likely answerable from the current knowledge base;
+if coverage is uncertain, make scopes narrower and specific.
+- Order sub_questions by execution priority (most valuable first)."""
+
+SEARCH_AGENT_PROMPT = """You are a search and question‑answering specialist.
+
+Tasks:
+1. Search the knowledge base for relevant evidence.
+2. Analyze retrieved snippets.
+3. Provide an answer strictly grounded in that evidence.

 Tool usage:
-- Always call
-- The tool returns
-`document_uri` it came from.
+- Always call search_and_answer before drafting any answer.
+- The tool returns snippets with verbatim `text`, a relevance `score`, and the
+originating `document_uri`.
 - You may call the tool multiple times to refine or broaden context, but do not
-exceed 3 total
+exceed 3 total calls. Favor precision over volume.
 - Use scores to prioritize evidence, but include only the minimal subset of
-snippet texts (verbatim) in SearchAnswer.context.
-- Set SearchAnswer.sources to the
-used (one URI per snippet
-- If no relevant information is found, say so and return an empty
+snippet texts (verbatim) in SearchAnswer.context (typically 1‑4).
+- Set SearchAnswer.sources to the corresponding document_uris for the snippets
+you used (one URI per snippet; same order as context). Context must be text‑only.
+- If no relevant information is found, clearly say so and return an empty
+context list and sources list.
+
+Answering rules:
+- Be direct and specific; avoid meta commentary about the process.
+- Do not include any claims not supported by the provided snippets.
+- Prefer concise phrasing; avoid copying long passages.
+- When evidence is partial, state the limits explicitly in the answer."""
+
+EVALUATION_AGENT_PROMPT = """You are an analysis and evaluation specialist for
+the research workflow.
+
+Inputs available:
+- Original research question
+- Question–answer pairs produced by search
+- Raw search results and source metadata
 - Previously identified insights

-Your dual role is to:
 ANALYSIS:
-1. Extract
-2. Identify patterns and
-3.
-4. Focus on the most important discoveries
+1. Extract the most important, non‑obvious insights from the collected evidence.
+2. Identify patterns, agreements, and disagreements across sources.
+3. Note material uncertainties and assumptions.

 EVALUATION:
-1.
-2.
-- Coverage of the main question
-- Quality and
-- Depth of
-3.
-4.
-Focus on creating a report that provides clear value to the reader by:
-- Answering the original research question thoroughly
-- Highlighting the most important findings
-- Explaining the implications of the research
-- Suggesting concrete next steps"""
+1. Decide if we have sufficient information to answer the original question.
+2. Provide a confidence_score in [0,1] considering:
+- Coverage of the main question’s aspects
+- Quality, consistency, and diversity of sources
+- Depth and specificity of evidence
+3. List concrete gaps that still need investigation.
+4. Propose up to 3 new sub_questions that would close the highest‑value gaps.
+
+Strictness:
+- Only mark research as sufficient when all major aspects are addressed with
+consistent, reliable evidence and no critical gaps remain.
+
+New sub_questions must:
+- Be genuinely new (not answered or duplicative; check qa_responses).
+- Be standalone and specific (entities, scope, timeframe/region if relevant).
+- Be actionable and scoped to the knowledge base (narrow if necessary).
+- Be ordered by expected impact (most valuable first)."""
+
+SYNTHESIS_AGENT_PROMPT = """You are a synthesis specialist producing the final
+research report.
+
+Goals:
+1. Synthesize all gathered information into a coherent narrative.
+2. Present findings clearly and concisely.
+3. Draw evidence‑based conclusions and recommendations.
+4. State limitations and uncertainties transparently.
+
+Report guidelines (map to output fields):
+- title: concise (5–12 words), informative.
+- executive_summary: 3–5 sentences summarizing the overall answer.
+- main_findings: 4–8 one‑sentence bullets; each reflects evidence from the
+research (do not include inline citations or snippet text).
+- conclusions: 2–4 bullets that follow logically from findings.
+- recommendations: 2–5 actionable bullets tied to findings.
+- limitations: 1–3 bullets describing key constraints or uncertainties.
+- sources_summary: 2–4 sentences summarizing sources used and their reliability.
+
+Style:
+- Base all content solely on the collected evidence.
+- Be professional, objective, and specific.
+- Avoid meta commentary and refrain from speculation beyond the evidence."""

 PRESEARCH_AGENT_PROMPT = """You are a rapid research surveyor.

 Task:
-- Call
-- Read that context and produce a
+- Call gather_context once on the main question to obtain relevant text from
+the knowledge base (KB).
+- Read that context and produce a short natural‑language summary of what the
+KB appears to contain relative to the question.

 Rules:
 - Base the summary strictly on the provided text; do not invent.
-- Output only the summary as plain text (one short paragraph).
-"""
+- Output only the summary as plain text (one short paragraph)."""
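The prompts above reference structured outputs (SearchAnswer, EvaluationResult, ResearchReport) that the new haiku/rag/research/models.py (+70 lines) presumably defines as Pydantic models. That file's contents are not shown in this diff; the following is a minimal sketch whose field names are inferred only from the prompt text, so treat every definition as an assumption rather than the shipped code.

# Hypothetical sketch only -- field names are inferred from the prompts above,
# not copied from haiku/rag/research/models.py.
from pydantic import BaseModel, Field


class SearchAnswer(BaseModel):
    query: str               # the question being answered (echoed back)
    answer: str              # grounded strictly in the provided context
    context: list[str] = []  # minimal verbatim snippet texts used as evidence
    sources: list[str] = []  # one document_uri per context snippet, same order


class EvaluationResult(BaseModel):
    sufficient: bool                                 # enough information gathered?
    confidence_score: float = Field(ge=0.0, le=1.0)  # coverage/quality/depth in [0,1]
    gaps: list[str] = []                             # concrete gaps still open
    new_sub_questions: list[str] = []                # at most 3, ordered by impact


class ResearchReport(BaseModel):
    title: str
    executive_summary: str
    main_findings: list[str]
    conclusions: list[str]
    recommendations: list[str]
    limitations: list[str]
    sources_summary: str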
haiku/rag/research/state.py
ADDED
@@ -0,0 +1,25 @@
+from dataclasses import dataclass, field
+
+from rich.console import Console
+
+from haiku.rag.client import HaikuRAG
+from haiku.rag.research.dependencies import ResearchContext
+from haiku.rag.research.models import EvaluationResult
+
+
+@dataclass
+class ResearchDeps:
+    client: HaikuRAG
+    console: Console | None = None
+
+
+@dataclass
+class ResearchState:
+    question: str
+    context: ResearchContext
+    sub_questions: list[str] = field(default_factory=list)
+    iterations: int = 0
+    max_iterations: int = 3
+    max_concurrency: int = 1
+    confidence_threshold: float = 0.8
+    last_eval: EvaluationResult | None = None
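ResearchState and ResearchDeps are the state and dependency types that parametrize the new pydantic_graph nodes; the removed orchestrator.py is replaced by graph.py (+29 lines), which this diff does not show. The sketch below is speculative: only SynthesizeNode is confirmed above, while PlanNode, SearchNode, EvaluateNode, and the ResearchContext() constructor are assumptions based on the added module names.

# Speculative sketch of haiku/rag/research/graph.py; class names other than
# SynthesizeNode and all constructor signatures are assumptions.
from pydantic_graph import Graph

from haiku.rag.research.dependencies import ResearchContext
from haiku.rag.research.nodes.evaluate import EvaluateNode
from haiku.rag.research.nodes.plan import PlanNode
from haiku.rag.research.nodes.search import SearchNode
from haiku.rag.research.nodes.synthesize import SynthesizeNode
from haiku.rag.research.state import ResearchDeps, ResearchState

# The graph is declared once from its node classes; a run starts from a concrete
# node instance and threads ResearchState / ResearchDeps through each node's
# GraphRunContext until some node returns End(...).
research_graph = Graph(nodes=(PlanNode, SearchNode, EvaluateNode, SynthesizeNode))


async def run_research(question: str, client, provider: str, model: str):
    state = ResearchState(question=question, context=ResearchContext())
    deps = ResearchDeps(client=client)
    result = await research_graph.run(
        PlanNode(provider=provider, model=model), state=state, deps=deps
    )
    return result.output  # the ResearchReport delivered via End(...)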
haiku/rag/store/engine.py
CHANGED
@@ -35,6 +35,7 @@ def create_chunk_model(vector_dim: int):
         document_id: str
         content: str
         metadata: str = Field(default="{}")
+        order: int = Field(default=0)
         vector: Vector(vector_dim) = Field(default_factory=lambda: [0.0] * vector_dim)  # type: ignore

     return ChunkRecord
@@ -117,8 +118,10 @@ class Store:
             self.chunks_table = self.db.open_table("chunks")
         else:
             self.chunks_table = self.db.create_table("chunks", schema=self.ChunkRecord)
-            # Create FTS index on the new table
-            self.chunks_table.create_fts_index(
+            # Create FTS index on the new table with phrase query support
+            self.chunks_table.create_fts_index(
+                "content", replace=True, with_position=True, remove_stop_words=False
+            )

         # Create or get settings table
         if "settings" in existing_tables:
@@ -133,21 +136,41 @@ class Store:
                 [SettingsRecord(id="settings", settings=json.dumps(settings_data))]
             )

-        #
-        current_version = metadata.version("haiku.rag")
-        self.set_haiku_version(current_version)
-        # Check if we need to perform upgrades
+        # Run pending upgrades based on stored version and package version
         try:
+            from haiku.rag.store.upgrades import run_pending_upgrades
+
+            current_version = metadata.version("haiku.rag")
+            db_version = self.get_haiku_version()
+
+            run_pending_upgrades(self, db_version, current_version)
+
+            # After upgrades complete (or if none), set stored version
+            # to the greater of the installed package version and the
+            # highest available upgrade step version in code.
+            try:
+                from packaging.version import parse as _v
+
+                from haiku.rag.store.upgrades import upgrades as _steps
+
+                highest_step = max((_v(u.version) for u in _steps), default=None)
+                effective_version = (
+                    str(max(_v(current_version), highest_step))
+                    if highest_step is not None
+                    else current_version
+                )
+            except Exception:
+                effective_version = current_version
+
+            self.set_haiku_version(effective_version)
+        except Exception as e:
+            # Avoid hard failure on initial connection; log and continue so CLI remains usable.
+            logger.warning(
+                "Skipping upgrade due to error (db=%s -> pkg=%s): %s",
+                self.get_haiku_version(),
+                metadata.version("haiku.rag") if hasattr(metadata, "version") else "",
+                e,
             )
-        if existing_settings:
-            db_version = self.get_haiku_version()  # noqa: F841
-            # TODO: Add upgrade logic here similar to SQLite version when needed
-        except Exception:
-            # Settings table might not exist yet in fresh databases
-            pass

     def get_haiku_version(self) -> str:
         """Returns the user version stored in settings."""
@@ -201,8 +224,10 @@ class Store:
         self.ChunkRecord = create_chunk_model(self.embedder._vector_dim)
         self.chunks_table = self.db.create_table("chunks", schema=self.ChunkRecord)

-        # Create FTS index on the new table
-        self.chunks_table.create_fts_index(
+        # Create FTS index on the new table with phrase query support
+        self.chunks_table.create_fts_index(
+            "content", replace=True, with_position=True, remove_stop_words=False
+        )

     def close(self):
         """Close the database connection."""
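The FTS index is now built with with_position=True and remove_stop_words=False, which is what LanceDB needs to serve quoted phrase queries over the content column (stop words stay indexed so phrases containing them still match). A minimal usage sketch follows; the database path and query strings are illustrative only.

# Minimal sketch, assuming a LanceDB database whose "chunks" table was indexed
# as in the diff above (with_position=True, remove_stop_words=False).
import lancedb

db = lancedb.connect("./example.lancedb")  # illustrative path
chunks = db.open_table("chunks")

# Plain keyword search over the content column.
hits = chunks.search("retrieval augmented generation", query_type="fts").limit(5).to_list()

# Exact phrase match; double-quoted phrases need the positional index.
phrase_hits = chunks.search('"retrieval augmented generation"', query_type="fts").limit(5).to_list()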
haiku/rag/store/repositories/chunk.py
CHANGED
@@ -28,7 +28,9 @@ class ChunkRepository:
     def _ensure_fts_index(self) -> None:
         """Ensure FTS index exists on the content column."""
         try:
-            self.store.chunks_table.create_fts_index(
+            self.store.chunks_table.create_fts_index(
+                "content", replace=True, with_position=True, remove_stop_words=False
+            )
         except Exception as e:
             # Log the error but don't fail - FTS might already exist
             logger.debug(f"FTS index creation skipped: {e}")
@@ -59,11 +61,16 @@ class ChunkRepository:
             embedding = entity.embedding
         else:
             embedding = await self.embedder.embed(entity.content)
+        order_val = int(entity.order)
+
         chunk_record = self.store.ChunkRecord(
             id=chunk_id,
             document_id=entity.document_id,
             content=entity.content,
-            metadata=json.dumps(
+            metadata=json.dumps(
+                {k: v for k, v in entity.metadata.items() if k != "order"}
+            ),
+            order=order_val,
             vector=embedding,
         )
@@ -90,11 +97,13 @@ class ChunkRepository:
             return None

         chunk_record = results[0]
+        md = json.loads(chunk_record.metadata)
         return Chunk(
             id=chunk_record.id,
             document_id=chunk_record.document_id,
             content=chunk_record.content,
-            metadata=
+            metadata=md,
+            order=chunk_record.order,
         )

     async def update(self, entity: Chunk) -> Chunk:
@@ -102,13 +111,17 @@ class ChunkRepository:
         assert entity.id, "Chunk ID is required for update"

         embedding = await self.embedder.embed(entity.content)
+        order_val = int(entity.order)

         self.store.chunks_table.update(
             where=f"id = '{entity.id}'",
             values={
                 "document_id": entity.document_id,
                 "content": entity.content,
-                "metadata": json.dumps(
+                "metadata": json.dumps(
+                    {k: v for k, v in entity.metadata.items() if k != "order"}
+                ),
+                "order": order_val,
                 "vector": embedding,
             },
         )
@@ -140,15 +153,19 @@ class ChunkRepository:

         results = list(query.to_pydantic(self.store.ChunkRecord))

+        chunks: list[Chunk] = []
+        for rec in results:
+            md = json.loads(rec.metadata)
+            chunks.append(
+                Chunk(
+                    id=rec.id,
+                    document_id=rec.document_id,
+                    content=rec.content,
+                    metadata=md,
+                    order=rec.order,
+                )
             )
-        ]
+        return chunks

     async def create_chunks_for_document(
         self, document_id: str, document: DoclingDocument
@@ -191,7 +208,8 @@ class ChunkRepository:
                 id=chunk_id,
                 document_id=document_id,
                 content=chunk_text,
-                metadata=json.dumps({
+                metadata=json.dumps({}),
+                order=order,
                 vector=embedding,
             )
             chunk_records.append(chunk_record)
@@ -200,7 +218,8 @@ class ChunkRepository:
                 id=chunk_id,
                 document_id=document_id,
                 content=chunk_text,
-                metadata={
+                metadata={},
+                order=order,
             )
             created_chunks.append(chunk)

@@ -219,8 +238,10 @@ class ChunkRepository:
         self.store.chunks_table = self.store.db.create_table(
             "chunks", schema=self.store.ChunkRecord
         )
-        # Create FTS index on the new table
-        self.store.chunks_table.create_fts_index(
+        # Create FTS index on the new table with phrase query support
+        self.store.chunks_table.create_fts_index(
+            "content", replace=True, with_position=True, remove_stop_words=False
+        )

     async def delete_by_document_id(self, document_id: str) -> bool:
         """Delete all chunks for a document."""
@@ -298,37 +319,36 @@ class ChunkRepository:
         doc_uri = doc_results[0].uri if doc_results else None
         doc_meta = doc_results[0].metadata if doc_results else "{}"

+        chunks: list[Chunk] = []
+        for rec in results:
+            md = json.loads(rec.metadata)
+            chunks.append(
+                Chunk(
+                    id=rec.id,
+                    document_id=rec.document_id,
+                    content=rec.content,
+                    metadata=md,
+                    order=rec.order,
+                    document_uri=doc_uri,
+                    document_meta=json.loads(doc_meta),
+                )
             )
-        for chunk in results
-        ]

-        chunks.sort(key=lambda c: c.
+        chunks.sort(key=lambda c: c.order)
         return chunks

     async def get_adjacent_chunks(self, chunk: Chunk, num_adjacent: int) -> list[Chunk]:
         """Get adjacent chunks before and after the given chunk within the same document."""
         assert chunk.document_id, "Document id is required for adjacent chunk finding"

-        chunk_order = chunk.
-        if chunk_order is None:
-            return []
+        chunk_order = chunk.order

-        #
+        # Fetch chunks for the same document and filter by order proximity
        all_chunks = await self.get_by_document_id(chunk.document_id)

-        adjacent_chunks = []
+        adjacent_chunks: list[Chunk] = []
        for c in all_chunks:
-            c_order = c.
+            c_order = c.order
            if c.id != chunk.id and abs(c_order - chunk_order) <= num_adjacent:
                adjacent_chunks.append(c)

@@ -380,15 +400,16 @@ class ChunkRepository:
         doc_uri = doc.uri if doc else None
         doc_meta = doc.metadata if doc else "{}"

+        md = json.loads(chunk_record.metadata)
+
         chunk = Chunk(
             id=chunk_record.id,
             document_id=chunk_record.document_id,
             content=chunk_record.content,
-            metadata=
-            else {},
+            metadata=md,
+            order=chunk_record.order,
             document_uri=doc_uri,
-            document_meta=json.loads(doc_meta)
+            document_meta=json.loads(doc_meta),
         )

         # Get score from arrow result
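The chunk order now lives in a dedicated order column instead of metadata["order"], and the repository strips that key whenever it writes metadata. The upgrade that migrates existing rows ships in haiku/rag/store/upgrades/v0_9_3.py (+112 lines), which this diff does not show; the sketch below only illustrates the kind of backfill such a step has to perform, every helper name in it is hypothetical, and it assumes the new integer column already exists on the chunks table.

# Hypothetical backfill sketch -- not the shipped v0_9_3 upgrade. Assumes older
# rows carried their position in the JSON metadata blob as metadata["order"].
import json


def backfill_chunk_order(store) -> None:
    table = store.chunks_table
    for row in table.to_arrow().to_pylist():
        md = json.loads(row["metadata"]) if row["metadata"] else {}
        if "order" not in md:
            continue
        order = int(md.pop("order"))
        # Move the value into the dedicated column and drop it from metadata.
        table.update(
            where=f"id = '{row['id']}'",
            values={"metadata": json.dumps(md), "order": order},
        )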
haiku/rag/store/repositories/document.py
CHANGED
@@ -34,7 +34,7 @@ class DocumentRepository:
             id=record.id,
             content=record.content,
             uri=record.uri,
-            metadata=json.loads(record.metadata)
+            metadata=json.loads(record.metadata),
             created_at=datetime.fromisoformat(record.created_at)
             if record.created_at
             else datetime.now(),
@@ -194,7 +194,7 @@ class DocumentRepository:
         )
         for order, chunk in enumerate(chunks):
             chunk.document_id = created_doc.id
-            chunk.
+            chunk.order = order
             await self.chunk_repository.create(chunk)

         return created_doc
haiku/rag/store/repositories/settings.py
CHANGED
@@ -84,11 +84,18 @@ class SettingsRepository:
         )

         if existing:
-            #
+            # Preserve existing version if present to avoid interfering with upgrade flow
+            try:
+                existing_settings = (
+                    json.loads(existing[0].settings) if existing[0].settings else {}
+                )
+            except Exception:
+                existing_settings = {}
+            if "version" in existing_settings:
+                current_config["version"] = existing_settings["version"]
+
+            # Update existing settings
+            if existing_settings != current_config:
                 self.store.settings_table.update(
                     where="id = 'settings'",
                     values={"settings": json.dumps(current_config)},