biblicus-0.14.0-py3-none-any.whl → biblicus-0.15.0-py3-none-any.whl
- biblicus/__init__.py +1 -1
- biblicus/_vendor/dotyaml/__init__.py +2 -2
- biblicus/_vendor/dotyaml/loader.py +40 -1
- biblicus/ai/__init__.py +39 -0
- biblicus/ai/embeddings.py +114 -0
- biblicus/ai/llm.py +138 -0
- biblicus/ai/models.py +226 -0
- biblicus/analysis/__init__.py +5 -2
- biblicus/analysis/markov.py +1624 -0
- biblicus/analysis/models.py +754 -1
- biblicus/analysis/topic_modeling.py +98 -19
- biblicus/backends/sqlite_full_text_search.py +4 -2
- biblicus/cli.py +118 -23
- biblicus/recipes.py +136 -0
- biblicus/text/__init__.py +43 -0
- biblicus/text/annotate.py +222 -0
- biblicus/text/extract.py +210 -0
- biblicus/text/link.py +519 -0
- biblicus/text/markup.py +200 -0
- biblicus/text/models.py +319 -0
- biblicus/text/prompts.py +113 -0
- biblicus/text/redact.py +229 -0
- biblicus/text/slice.py +155 -0
- biblicus/text/tool_loop.py +334 -0
- {biblicus-0.14.0.dist-info → biblicus-0.15.0.dist-info}/METADATA +88 -25
- {biblicus-0.14.0.dist-info → biblicus-0.15.0.dist-info}/RECORD +30 -15
- biblicus/analysis/llm.py +0 -106
- {biblicus-0.14.0.dist-info → biblicus-0.15.0.dist-info}/WHEEL +0 -0
- {biblicus-0.14.0.dist-info → biblicus-0.15.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.14.0.dist-info → biblicus-0.15.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.14.0.dist-info → biblicus-0.15.0.dist-info}/top_level.txt +0 -0
biblicus/text/tool_loop.py
@@ -0,0 +1,334 @@
+"""
+Shared tool loop for virtual file edit workflows.
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Sequence
+
+from ..ai.llm import chat_completion
+from ..ai.models import LlmClientConfig
+
+
+@dataclass
+class ToolLoopResult:
+    """
+    Tool loop result payload.
+
+    :param text: Final text after tool edits.
+    :type text: str
+    :param done: Whether the model called done.
+    :type done: bool
+    :param last_error: Last error message, if any.
+    :type last_error: str or None
+    :param messages: Conversation history including tool calls/results.
+    :type messages: list[dict[str, Any]]
+    """
+
+    text: str
+    done: bool
+    last_error: Optional[str]
+    messages: List[Dict[str, Any]]
+
+
+def run_tool_loop(
+    *,
+    text: str,
+    client: LlmClientConfig,
+    system_prompt: str,
+    prompt_template: str,
+    max_rounds: int,
+    max_edits_per_round: int,
+    apply_str_replace: Callable[[str, str, str], str],
+    validate_text: Optional[Callable[[str], Sequence[str]]] = None,
+    build_retry_message: Optional[Callable[[Sequence[str], str], str]] = None,
+    messages: Optional[List[Dict[str, Any]]] = None,
+) -> ToolLoopResult:
+    """
+    Run a tool-driven virtual file edit loop.
+
+    :param text: Input text to edit.
+    :type text: str
+    :param client: LLM client configuration.
+    :type client: biblicus.ai.models.LlmClientConfig
+    :param system_prompt: System prompt containing the text placeholder.
+    :type system_prompt: str
+    :param prompt_template: User prompt describing what to return.
+    :type prompt_template: str
+    :param max_rounds: Maximum number of rounds.
+    :type max_rounds: int
+    :param max_edits_per_round: Maximum edits per round.
+    :type max_edits_per_round: int
+    :param apply_str_replace: Replacement function for str_replace edits.
+    :type apply_str_replace: Callable[[str, str, str], str]
+    :param validate_text: Optional validation callback returning error messages.
+    :type validate_text: Callable[[str], Sequence[str]] or None
+    :param build_retry_message: Optional retry message builder.
+    :type build_retry_message: Callable[[Sequence[str], str], str] or None
+    :param messages: Optional conversation history to continue (system prompt should already be included).
+    :type messages: list[dict[str, Any]] or None
+    :return: Tool loop result.
+    :rtype: ToolLoopResult
+    :raises ValueError: If the provider backend is unavailable.
+
+    Validation errors trigger a retry by appending a user feedback message to the
+    conversation history (including all prior tool calls and tool results).
+    """
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "str_replace",
+                "description": "Replace an exact substring with a new string.",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "old_str": {"type": "string"},
+                        "new_str": {"type": "string"},
+                    },
+                    "required": ["old_str", "new_str"],
+                    "additionalProperties": False,
+                },
+            },
+        },
+        {
+            "type": "function",
+            "function": {
+                "name": "view",
+                "description": "Return the current text.",
+                "parameters": {"type": "object", "properties": {}, "additionalProperties": False},
+            },
+        },
+        {
+            "type": "function",
+            "function": {
+                "name": "done",
+                "description": "Finish editing.",
+                "parameters": {"type": "object", "properties": {}, "additionalProperties": False},
+            },
+        },
+    ]
+
+    if messages is None:
+        rendered_prompt = _render_template(
+            prompt_template,
+            text=text,
+            text_length=len(text),
+            error="",
+        )
+        rendered_system = _render_template(
+            system_prompt,
+            text=text,
+            text_length=len(text),
+            error="",
+        )
+        messages = [
+            {"role": "system", "content": rendered_system},
+            {"role": "user", "content": rendered_prompt},
+        ]
+    else:
+        messages = list(messages)
+
+    done = False
+    last_error: Optional[str] = None
+    current_text = text
+
+    for _ in range(max_rounds):
+        had_tool_error = False
+        response = chat_completion(
+            client=client,
+            messages=messages,
+            tools=tools,
+            tool_choice="auto",
+        )
+        tool_calls = response.tool_calls
+        if not tool_calls:
+            content = response.text or ""
+            last_error = "Tool loop requires tool calls (str_replace/view/done)"
+            messages.append({"role": "assistant", "content": content})
+            messages.append(
+                {
+                    "role": "user",
+                    "content": _build_no_tool_calls_message(
+                        assistant_message=content,
+                        current_text=current_text,
+                    ),
+                }
+            )
+            continue
+        messages.append(
+            {
+                "role": "assistant",
+                "content": response.text or "",
+                "tool_calls": list(tool_calls),
+            }
+        )
+        edit_count = 0
+        for tool_call in tool_calls:
+            function = tool_call.get("function", {})
+            name = str(function.get("name") or "")
+            args = json.loads(str(function.get("arguments") or "{}"))
+            if name == "str_replace":
+                edit_count += 1
+                if edit_count > max_edits_per_round:
+                    last_error = "Tool loop exceeded max edits per round"
+                    tool_result = f"Error: {last_error}"
+                else:
+                    old_str = str(args.get("old_str", ""))
+                    new_str = str(args.get("new_str", ""))
+                    if not old_str or not new_str:
+                        last_error = "Tool loop requires non-empty old_str and new_str"
+                        tool_result = f"Error: {last_error}"
+                    else:
+                        try:
+                            current_text = apply_str_replace(current_text, old_str, new_str)
+                            tool_result = (
+                                "Applied str_replace.\nCurrent text:\n---\n" f"{current_text}\n---"
+                            )
+                            last_error = None
+                        except ValueError as exc:
+                            last_error = str(exc)
+                            tool_result = f"Error: {last_error}"
+                            had_tool_error = True
+            elif name == "view":
+                tool_result = f"Current text:\n---\n{current_text}\n---"
+            elif name == "done":
+                done = True
+                tool_result = "Done"
+            else:
+                raise ValueError(f"Tool loop received unknown tool: {name}")
+            messages.append(
+                {
+                    "role": "tool",
+                    "tool_call_id": tool_call.get("id", ""),
+                    "content": tool_result,
+                }
+            )
+        if had_tool_error and last_error is not None:
+            done = False
+            messages.append(
+                {
+                    "role": "user",
+                    "content": _build_tool_error_message(
+                        error_message=last_error,
+                        current_text=current_text,
+                    ),
+                }
+            )
+            continue
+        if validate_text is not None:
+            validation_errors = list(validate_text(current_text))
+            if validation_errors:
+                last_error = "; ".join(validation_errors)
+                done = False
+                retry_message = _build_retry_message(
+                    validation_errors=validation_errors,
+                    current_text=current_text,
+                    build_retry_message=build_retry_message,
+                )
+                messages.append({"role": "user", "content": retry_message})
+                continue
+        if done:
+            break
+
+    return ToolLoopResult(
+        text=current_text,
+        done=done,
+        last_error=last_error,
+        messages=messages,
+    )
+
+
+def _build_retry_message(
+    *,
+    validation_errors: Sequence[str],
+    current_text: str,
+    build_retry_message: Optional[Callable[[Sequence[str], str], str]],
+) -> str:
+    if build_retry_message is not None:
+        return build_retry_message(validation_errors, current_text)
+    error_lines = "\n".join(f"- {error}" for error in validation_errors)
+    return (
+        "Your last edit did not validate.\n"
+        "Issues:\n"
+        f"{error_lines}\n\n"
+        "Please fix the markup using str_replace and keep the source text unchanged.\n"
+        "Current text:\n"
+        f"---\n{current_text}\n---"
+    )
+
+
+def _build_tool_error_message(*, error_message: str, current_text: str) -> str:
+    if "not unique" in error_message:
+        guidance = (
+            "Use a longer unique old_str by including surrounding words or punctuation "
+            "so it matches exactly once."
+        )
+    elif "not found" in error_message:
+        guidance = (
+            "Copy the exact old_str from the current text (including punctuation/case) "
+            "or call view to inspect the latest text."
+        )
+    else:
+        guidance = "Fix the tool call and try again."
+    return (
+        "Your last tool call failed.\n"
+        f"Error: {error_message}\n"
+        f"{guidance}\n"
+        "Current text:\n"
+        f"---\n{current_text}\n---"
+    )
+
+
+def _build_no_tool_calls_message(*, assistant_message: str, current_text: str) -> str:
+    guidance = (
+        "Use the tools to edit the text. "
+        "Call str_replace to insert markup, view to inspect, and done when finished."
+    )
+    message = "Your last response did not include any tool calls."
+    if assistant_message.strip():
+        message = f"{message}\nAssistant message: {assistant_message}"
+    return f"{message}\n" f"{guidance}\n" "Current text:\n" f"---\n{current_text}\n---"
+
+
+def request_confirmation(
+    *,
+    result: ToolLoopResult,
+    text: str,
+    client: LlmClientConfig,
+    system_prompt: str,
+    prompt_template: str,
+    max_rounds: int,
+    max_edits_per_round: int,
+    apply_str_replace: Callable[[str, str, str], str],
+    confirmation_message: str,
+    validate_text: Optional[Callable[[str], Sequence[str]]] = None,
+    build_retry_message: Optional[Callable[[Sequence[str], str], str]] = None,
+) -> ToolLoopResult:
+    """
+    Continue a tool loop with a confirmation message appended to the conversation history.
+
+    This preserves the model's prior tool calls and the current text state while giving it
+    a chance to confirm an empty/ambiguous result.
+    """
+    messages = list(result.messages)
+    messages.append({"role": "user", "content": confirmation_message})
+    return run_tool_loop(
+        text=text,
+        client=client,
+        system_prompt=system_prompt,
+        prompt_template=prompt_template,
+        max_rounds=max_rounds,
+        max_edits_per_round=max_edits_per_round,
+        apply_str_replace=apply_str_replace,
+        validate_text=validate_text,
+        build_retry_message=build_retry_message,
+        messages=messages,
+    )
+
+
+def _render_template(template: str, *, text: str, text_length: int, error: str) -> str:
+    rendered = template.replace("{text_length}", str(text_length)).replace("{error}", error)
+    return rendered.replace("{text}", text)
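The hunk above is the entire public surface of the new `biblicus/text/tool_loop.py` module: `ToolLoopResult`, `run_tool_loop`, and `request_confirmation`. A minimal usage sketch, assuming only what the diff shows; the replacement helper, prompts, and the name-tagging task are illustrative, and constructing `LlmClientConfig` is left to the caller because its fields live in `biblicus/ai/models.py`, which is not shown in this diff:

```python
from biblicus.ai.models import LlmClientConfig
from biblicus.text.tool_loop import run_tool_loop


def apply_str_replace(text: str, old_str: str, new_str: str) -> str:
    """Apply one exact replacement; raise ValueError so the loop can coach the model."""
    count = text.count(old_str)
    if count == 0:
        raise ValueError(f"old_str not found: {old_str!r}")
    if count > 1:
        raise ValueError(f"old_str not unique: {old_str!r}")
    return text.replace(old_str, new_str, 1)


def tag_names(client: LlmClientConfig, text: str) -> str:
    # {text} and {text_length} are the placeholders that _render_template
    # substitutes into the system and user prompts before the first round.
    result = run_tool_loop(
        text=text,
        client=client,
        system_prompt="Edit the text below only by calling tools.\n---\n{text}\n---",
        prompt_template="Wrap each person name in <name>...</name> ({text_length} characters total).",
        max_rounds=4,
        max_edits_per_round=8,
        apply_str_replace=apply_str_replace,
    )
    if not result.done:
        raise RuntimeError(result.last_error or "tool loop did not finish")
    return result.text
```

Raising `ValueError` messages that contain "not found" or "not unique" matters here: `_build_tool_error_message` keys its retry guidance off those exact substrings.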
{biblicus-0.14.0.dist-info → biblicus-0.15.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: biblicus
-Version: 0.14.0
+Version: 0.15.0
 Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
 License: MIT
 Requires-Python: >=3.9
@@ -9,6 +9,8 @@ License-File: LICENSE
 Requires-Dist: pydantic>=2.0
 Requires-Dist: PyYAML>=6.0
 Requires-Dist: pypdf>=4.0
+Requires-Dist: Jinja2>=3.1
+Requires-Dist: dotyaml>=0.1.3
 Provides-Extra: dev
 Requires-Dist: behave>=1.2.6; extra == "dev"
 Requires-Dist: coverage[toml]>=7.0; extra == "dev"
@@ -18,6 +20,9 @@ Requires-Dist: sphinx_rtd_theme>=2.0; extra == "dev"
 Requires-Dist: ruff>=0.4.0; extra == "dev"
 Requires-Dist: black>=24.0; extra == "dev"
 Requires-Dist: python-semantic-release>=9.0.0; extra == "dev"
+Provides-Extra: dspy
+Requires-Dist: dspy>=2.5; extra == "dspy"
+Requires-Dist: litellm>=1.0; extra == "dspy"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Provides-Extra: unstructured
@@ -40,6 +45,8 @@ Provides-Extra: docling-mlx
 Requires-Dist: docling[mlx-vlm]>=2.0.0; extra == "docling-mlx"
 Provides-Extra: topic-modeling
 Requires-Dist: bertopic>=0.15.0; extra == "topic-modeling"
+Provides-Extra: markov-analysis
+Requires-Dist: hmmlearn>=0.3.0; extra == "markov-analysis"
 Provides-Extra: datasets
 Requires-Dist: datasets>=2.18.0; extra == "datasets"
 Dynamic: license-file
@@ -56,12 +63,20 @@ If you are building an assistant in Python, you probably have material you want
 
 The first practical problem is not retrieval. It is collection and care. You need a stable place to put raw items, you need a small amount of metadata so you can find them again, and you need a way to evolve your retrieval approach over time without rewriting ingestion.
 
-
+Biblicus gives you a normal folder on disk to manage. In Biblicus documentation, that managed folder is called a *corpus* (plural: *corpora*). It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw files.
 
 It can be used alongside LangGraph, Tactus, Pydantic AI, any agent framework, or your own setup. Use it from Python or from the command line interface.
 
 See [retrieval augmented generation overview] for a short introduction to the idea.
 
+## Analysis highlights
+
+- `biblicus analyze markov` learns a directed, weighted state transition graph over segmented text.
+- YAML recipes support cascading composition plus dotted `--config key=value` overrides.
+- Text extract splits long texts with an LLM by inserting XML tags in-place for structured spans.
+- See `docs/MARKOV_ANALYSIS.md` for Markov analysis details and runnable demos.
+- See `docs/TEXT_EXTRACT.md` for the text extract utility and examples.
+
 ## Start with a knowledge base
 
 If you just want to hand a folder to your assistant and move on, use the high-level knowledge base interface. The folder can be nothing more than a handful of plain text files. You are not choosing a retrieval strategy yet. You are just collecting.
@@ -106,7 +121,7 @@ Think in three stages.
 
 If you learn a few project words, the rest of the system becomes predictable.
 
-- Corpus is the folder that holds raw items and their metadata.
+- Corpus is the managed folder that holds raw items and their metadata.
 - Item is the raw bytes plus optional metadata and source information.
 - Catalog is the rebuildable index of the corpus.
 - Extraction run is a recorded extraction build that produces text artifacts.
@@ -161,28 +176,28 @@ sequenceDiagram
 This repository is a working Python package. Install it into a virtual environment from the repository root.
 
 ```
-
+python -m pip install -e .
 ```
 
 After the first release, you can install it from Python Package Index.
 
 ```
-
+python -m pip install biblicus
 ```
 
 ### Optional extras
 
 Some extractors are optional so the base install stays small.
 
-- Optical character recognition for images: `
-- Advanced optical character recognition with PaddleOCR: `
-- Document understanding with Docling VLM: `
-- Document understanding with Docling VLM and MLX acceleration: `
-- Speech to text transcription with OpenAI: `
-- Speech to text transcription with Deepgram: `
-- Broad document parsing fallback: `
-- MarkItDown document conversion (requires Python 3.10 or higher): `
-- Topic modeling analysis with BERTopic: `
+- Optical character recognition for images: `python -m pip install "biblicus[ocr]"`
+- Advanced optical character recognition with PaddleOCR: `python -m pip install "biblicus[paddleocr]"`
+- Document understanding with Docling VLM: `python -m pip install "biblicus[docling]"`
+- Document understanding with Docling VLM and MLX acceleration: `python -m pip install "biblicus[docling-mlx]"`
+- Speech to text transcription with OpenAI: `python -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
+- Speech to text transcription with Deepgram: `python -m pip install "biblicus[deepgram]"` (requires a Deepgram API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
+- Broad document parsing fallback: `python -m pip install "biblicus[unstructured]"`
+- MarkItDown document conversion (requires Python 3.10 or higher): `python -m pip install "biblicus[markitdown]"`
+- Topic modeling analysis with BERTopic: `python -m pip install "biblicus[topic-modeling]"`
 
 ## Quick start
 
@@ -200,16 +215,49 @@ biblicus build --corpus corpora/example --backend scan
 biblicus query --corpus corpora/example --query "note"
 ```
 
-
+## Web Ingestion
+
+Biblicus supports ingesting content directly from the web using two approaches.
+
+### Ingest from URLs
+
+Ingest individual documents or web pages from URLs. The `ingest` command automatically detects content types including PDF, HTML, Markdown, images, and audio:
 
+```bash
+# Ingest a document from a URL
+biblicus ingest https://example.com/document.pdf --tags "research"
+
+# Ingest a web page
+biblicus ingest https://example.com/article.html --tags "article"
+
+# Ingest with a corpus path specified
+biblicus ingest --corpus corpora/example https://docs.example.com/guide.md --tags "documentation"
 ```
-
-
-
-
-
+
+### Crawl Websites
+
+Crawl entire website sections with automatic link discovery. The crawler follows links within the allowed prefix and stores discovered content:
+
+```bash
+# Crawl a documentation site
+biblicus crawl \
+  --corpus corpora/example \
+  --root-url https://docs.example.com/ \
+  --allowed-prefix https://docs.example.com/ \
+  --max-items 100 \
+  --tags "documentation"
+
+# Crawl a specific blog category
+biblicus crawl \
+  --corpus corpora/example \
+  --root-url https://blog.example.com/category/tutorials/ \
+  --allowed-prefix https://blog.example.com/category/tutorials/ \
+  --max-items 50 \
+  --tags "tutorials,blog"
 ```
 
+The `--allowed-prefix` parameter restricts the crawler to only follow links that start with the specified URL prefix, preventing it from crawling outside the intended scope. The crawler respects `.biblicusignore` rules and stores items under `raw/imports/crawl/` in your corpus.
+
 ## End-to-end example: lower-level control
 
 The command-line interface returns JavaScript Object Notation by default. This makes it easy to use Biblicus in scripts and to treat retrieval as a deterministic, testable step.
@@ -540,6 +588,21 @@ For detailed documentation on all extractors, see the [Extractor Reference][extr
 For extraction evaluation workflows, dataset formats, and report interpretation, see
 `docs/EXTRACTION_EVALUATION.md`.
 
+## Text extract utility
+
+Text extract is a reusable analysis utility that lets a model insert XML tags into a long text without re-emitting the
+entire document. It returns structured spans and the marked-up text, and it is used as a segmentation option in Markov
+analysis.
+
+See `docs/TEXT_EXTRACT.md` for the utility API and examples, and `docs/MARKOV_ANALYSIS.md` for the Markov integration.
+
+## Text slice utility
+
+Text slice is a reusable analysis utility that lets a model insert `<slice/>` markers into a long text without
+re-emitting the entire document. It returns ordered slices and the marked-up text for auditing and reuse.
+
+See `docs/TEXT_SLICE.md` for the utility API and examples.
+
 ## Topic modeling analysis
 
 Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
@@ -594,7 +657,7 @@ AG News integration runs require `biblicus[datasets]` in addition to `biblicus[t
 For a repeatable, real-world integration run that downloads AG News and executes topic modeling, use:
 
 ```
-
+python scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
 ```
 
 See `docs/TOPIC_MODELING.md` for parameter examples and per-topic output behavior.
@@ -608,13 +671,13 @@ Use `scripts/download_pdf_samples.py` to download a small Portable Document Form
 ## Tests and coverage
 
 ```
-
+python scripts/test.py
 ```
 
 To include integration scenarios that download public test data at runtime, run this command.
 
 ```
-
+python scripts/test.py --integration
 ```
 
 ## Releases
@@ -632,13 +695,13 @@ Reference documentation is generated from Sphinx style docstrings.
 Install development dependencies:
 
 ```
-
+python -m pip install -e ".[dev]"
 ```
 
 Build the documentation:
 
 ```
-
+python -m sphinx -b html docs docs/_build/html
 ```
 
 ## License
{biblicus-0.14.0.dist-info → biblicus-0.15.0.dist-info}/RECORD
@@ -1,6 +1,6 @@
-biblicus/__init__.py,sha256=
+biblicus/__init__.py,sha256=xZ9X4aPjgTWT8Uds5CZ7Y8Tiy9FmryQyO4s4lsF8HWA,496
 biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
-biblicus/cli.py,sha256=
+biblicus/cli.py,sha256=APxBNLztfGTVGgxt5qgCofYti2AsGGeZQ7SB9tkneXQ,41760
 biblicus/constants.py,sha256=gAlEVJhxdFj-eWWJrlYbP7H1X3c5gwhrIBq9NQ1Vq_E,371
 biblicus/context.py,sha256=iXRFGpf_5YDPsDsm_iTK6nCvtUWDoYVI7op-l2QU3uA,10189
 biblicus/corpus.py,sha256=qSDnYJXhWlF2p_BbFLl6xtI53lIIPxwyKLLGLC432Sg,55612
@@ -18,27 +18,32 @@ biblicus/ignore.py,sha256=fyjt34E6tWNNrm1FseOhgH2MgryyVBQVzxhKL5s4aio,1800
 biblicus/inference.py,sha256=_k00AIPoXD2lruiTB-JUagtY4f_WKcdzA3axwiq1tck,3512
 biblicus/knowledge_base.py,sha256=JmlJw8WD_fgstuq1PyWVzU9kzvVzyv7_xOvhS70xwUw,6654
 biblicus/models.py,sha256=r28O6cg3d1bjJnKqpLieVLTgtXTfzb_60wMORvVuDN0,15846
+biblicus/recipes.py,sha256=rqU66QnjOup6O8Y9Yq7XszmpoM0Pyrjw3RrfdnlVqgE,4210
 biblicus/retrieval.py,sha256=A1SI4WK5cX-WbtN6FJ0QQxqlEOtQhddLrL0LZIuoTC4,4180
 biblicus/sources.py,sha256=EFy8-rQNLsyzz-98mH-z8gEHMYbqigcNFKLaR92KfDE,7241
 biblicus/time.py,sha256=3BSKOSo7R10K-0Dzrbdtl3fh5_yShTYqfdlKvvdkx7M,485
 biblicus/uris.py,sha256=xXD77lqsT9NxbyzI1spX9Y5a3-U6sLYMnpeSAV7g-nM,2013
 biblicus/user_config.py,sha256=UXUYBNUN4FR37ggZGJG1wv3K8XzsMR8pXW1T18lrivw,6495
-biblicus/_vendor/dotyaml/__init__.py,sha256=
+biblicus/_vendor/dotyaml/__init__.py,sha256=OVv6IsuCvsjaUznLzuit4UbSLVg4TiTVm9cOPY1Y2Cs,409
 biblicus/_vendor/dotyaml/interpolation.py,sha256=FVUkdQr_KbXjoFPvGTv6I5v0X5iZkJe5yhZtYKRbYzI,1991
-biblicus/_vendor/dotyaml/loader.py,sha256=
+biblicus/_vendor/dotyaml/loader.py,sha256=vFfnhbvHYYyOKzl5iq2FH97GSHH2GvEHmGiPnE0g0kA,6954
 biblicus/_vendor/dotyaml/transformer.py,sha256=RWNrm_KAsanG409HEIWquTH9i_jz-ZFK9fM86emXeF4,3724
-biblicus/
+biblicus/ai/__init__.py,sha256=HY8PKhqRLIDYJYlL9A2JjqKxQaujITNLYgIytNUhnrU,1161
+biblicus/ai/embeddings.py,sha256=n2xlonZOHcmDrP1XMhGcja5Hzr8r87PF-IecH-Yhu98,3703
+biblicus/ai/llm.py,sha256=g724_UAxmicB_W-Z7Uu9SRsI9-aVNZUlYIjvnlE17VE,4712
+biblicus/ai/models.py,sha256=6newnT0NJf3uf9FvWXVC-9Gkk5xRB-PjXDZpeBHA04Y,7857
+biblicus/analysis/__init__.py,sha256=I4LqxfKPKF3DEVmAyagQ8J1RN-ia3fyfKJ9frCllZQE,1385
 biblicus/analysis/base.py,sha256=gB4ilvyMpiWU1m_ydy2dIHGP96ZFIFvVUL9iVDZKPJM,1265
-biblicus/analysis/
-biblicus/analysis/models.py,sha256=
+biblicus/analysis/markov.py,sha256=XiNbfg8lmKWXCzQJhZtZxghT7gBPh2o6xA3D1-tMQqs,61762
+biblicus/analysis/models.py,sha256=FnUAO6n1yjagYlhe2ocRtfb3IZT3W03rrEZ9LdCx7Kc,56214
 biblicus/analysis/profiling.py,sha256=v2B4Tn9WiXRRP_wIADBPRQVKkMc92KXCas7OBa7n0LU,10670
 biblicus/analysis/schema.py,sha256=MCiAQJmijVk8iM8rOUYbzyaDwsMR-Oo86iZU5NCbDMM,435
-biblicus/analysis/topic_modeling.py,sha256=
+biblicus/analysis/topic_modeling.py,sha256=mNBiRMpY5Jtyz8Aj-WXYY8guEghx9jozTfgveinJLoc,22135
 biblicus/backends/__init__.py,sha256=3HJY0oMm8pFFVGC4Z-dlPRHhIPVDdUzsa4IMjKP_9dI,1378
 biblicus/backends/base.py,sha256=Erfj9dXg0nkRKnEcNjHR9_0Ddb2B1NvbmRksVm_g1dU,1776
 biblicus/backends/hybrid.py,sha256=FjCzrXdicbM-qVDM-_AHYTT8Gzvem1eZJaQUVKk_tMg,10753
 biblicus/backends/scan.py,sha256=hdNnQWqi5IH6j95w30BZHxLJ0W9PTaOkqfWJuxCCEMI,12478
-biblicus/backends/sqlite_full_text_search.py,sha256=
+biblicus/backends/sqlite_full_text_search.py,sha256=z0WsURTj4w8Y6SPa6k4bV9wdze-HwNWgxj5qcab0TUE,24480
 biblicus/backends/vector.py,sha256=3RdxSBPb1kOX4Sfd4d1qXFW9ecuiRvGpOHadLCbeh1g,15183
 biblicus/extractors/__init__.py,sha256=ci3oldbdQZ8meAfHccM48CqQtZsPSRg3HkPrBSZF15M,2673
 biblicus/extractors/base.py,sha256=ka-nz_1zHPr4TS9sU4JfOoY-PJh7lbHPBOEBrbQFGSc,2171
@@ -58,9 +63,19 @@ biblicus/extractors/select_override.py,sha256=gSpffFmn1ux9pGtFvHD5Uu_LO8TmmJC4L_
 biblicus/extractors/select_smart_override.py,sha256=-sLMnNoeXbCB3dO9zflQq324eHuLbd6hpveSwduXP-U,6763
 biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
 biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
-biblicus
-biblicus
-biblicus
-biblicus
-biblicus
-biblicus
+biblicus/text/__init__.py,sha256=MiaGAY7xWlUCeBzDzNz6pJnSMiU_Ge5EmlSiEzhqTRo,947
+biblicus/text/annotate.py,sha256=asmpj3_s_t8hl6stEg99apmqxAhDTkoPzHhZNggYE3Y,8355
+biblicus/text/extract.py,sha256=pdnUiZWtfCUj7kZK5zhd-tjqokgmhYYheWhyN3iShRU,7669
+biblicus/text/link.py,sha256=Xl0yxD1rvbRJRVdWG_ZP6zgmbpgWSJYcUcNM06-OUWU,20077
+biblicus/text/markup.py,sha256=8jj9aX03HiZTOWdPs_VC4JLpQ7TlPHgGuXj_QUQIHVw,6265
+biblicus/text/models.py,sha256=REp6RowUWFdV-6y437JENP7XtGKt57BOvVtF91KmUqI,10853
+biblicus/text/prompts.py,sha256=Z5fSsy1Xzr0rCI0WZ3djiEQlbRDncyNBQ7_ZoWVPL4g,6704
+biblicus/text/redact.py,sha256=tkDRmA0VvOZwMryEmBPLEHf3Z6VHJkkaWjBaNIMyGZ0,8415
+biblicus/text/slice.py,sha256=dlHxGO8c5P8BszXGwlNQoQ-cyWjJf6PfS1LUBJXXGEE,5762
+biblicus/text/tool_loop.py,sha256=w1PGLBvIemOdi6l0ArdYDVL7zgx-RC76bBOO0PKqpt0,11831
+biblicus-0.15.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
+biblicus-0.15.0.dist-info/METADATA,sha256=z2HMoYpVwMl30DhbI00Jp1QV4YH680Vz_4GQF3uM_3o,30782
+biblicus-0.15.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+biblicus-0.15.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
+biblicus-0.15.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
+biblicus-0.15.0.dist-info/RECORD,,