biblicus 0.14.0__py3-none-any.whl → 0.15.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,334 @@
1
+ """
2
+ Shared tool loop for virtual file edit workflows.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import json
8
+ from dataclasses import dataclass
9
+ from typing import Any, Callable, Dict, List, Optional, Sequence
10
+
11
+ from ..ai.llm import chat_completion
12
+ from ..ai.models import LlmClientConfig
13
+
14
+
15
@dataclass
class ToolLoopResult:
    """
    Outcome returned by a tool loop run.

    :param text: The working text as it stands when the loop stops.
    :type text: str
    :param done: Whether the loop ended with the model's ``done`` call still in effect.
    :type done: bool
    :param last_error: Most recent error message recorded by the loop, or ``None``.
    :type last_error: str or None
    :param messages: Full conversation history, including tool calls and tool results.
    :type messages: list[dict[str, Any]]
    """

    # Field order is part of the positional constructor signature; keep it stable.
    text: str
    done: bool
    last_error: Optional[str]
    messages: List[Dict[str, Any]]
34
+
35
+
36
def _parse_tool_arguments(raw_arguments: Any) -> Dict[str, Any]:
    """
    Decode the JSON ``arguments`` payload of a tool call.

    :param raw_arguments: Raw arguments value from the tool call (usually a JSON string).
    :type raw_arguments: Any
    :return: Decoded arguments object. A missing or empty payload decodes to ``{}``.
    :rtype: dict[str, Any]
    :raises ValueError: If the payload is not valid JSON or is not a JSON object.
    """
    try:
        parsed = json.loads(str(raw_arguments or "{}"))
    except json.JSONDecodeError as exc:
        raise ValueError(f"Tool call arguments are not valid JSON: {exc}") from exc
    if not isinstance(parsed, dict):
        raise ValueError("Tool call arguments must be a JSON object")
    return parsed


def run_tool_loop(
    *,
    text: str,
    client: LlmClientConfig,
    system_prompt: str,
    prompt_template: str,
    max_rounds: int,
    max_edits_per_round: int,
    apply_str_replace: Callable[[str, str, str], str],
    validate_text: Optional[Callable[[str], Sequence[str]]] = None,
    build_retry_message: Optional[Callable[[Sequence[str], str], str]] = None,
    messages: Optional[List[Dict[str, Any]]] = None,
) -> ToolLoopResult:
    """
    Run a tool-driven virtual file edit loop.

    Each round sends the conversation to the model together with three tools
    (``str_replace``, ``view`` and ``done``), applies the returned tool calls to the
    working text, and appends one ``tool`` message per call to the history.

    :param text: Input text to edit.
    :type text: str
    :param client: LLM client configuration.
    :type client: biblicus.ai.models.LlmClientConfig
    :param system_prompt: System prompt containing the text placeholder. Only rendered
        when ``messages`` is ``None``.
    :type system_prompt: str
    :param prompt_template: User prompt describing what to return. Only rendered when
        ``messages`` is ``None``.
    :type prompt_template: str
    :param max_rounds: Maximum number of rounds.
    :type max_rounds: int
    :param max_edits_per_round: Maximum edits per round.
    :type max_edits_per_round: int
    :param apply_str_replace: Replacement function for str_replace edits. It is called as
        ``apply_str_replace(current_text, old_str, new_str)``; a ``ValueError`` it raises is
        reported back to the model as a tool error.
    :type apply_str_replace: Callable[[str, str, str], str]
    :param validate_text: Optional validation callback returning error messages.
    :type validate_text: Callable[[str], Sequence[str]] or None
    :param build_retry_message: Optional retry message builder.
    :type build_retry_message: Callable[[Sequence[str], str], str] or None
    :param messages: Optional conversation history to continue (system prompt should already be included).
    :type messages: list[dict[str, Any]] or None
    :return: Tool loop result.
    :rtype: ToolLoopResult
    :raises ValueError: If the provider backend is unavailable, or if the model calls an
        unknown tool.

    Validation errors trigger a retry by appending a user feedback message to the
    conversation history (including all prior tool calls and tool results).

    Malformed ``str_replace`` arguments (invalid JSON, or JSON that is not an object) are
    reported back to the model as a tool error and retried instead of aborting the loop.
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "str_replace",
                "description": "Replace an exact substring with a new string.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "old_str": {"type": "string"},
                        "new_str": {"type": "string"},
                    },
                    "required": ["old_str", "new_str"],
                    "additionalProperties": False,
                },
            },
        },
        {
            "type": "function",
            "function": {
                "name": "view",
                "description": "Return the current text.",
                "parameters": {"type": "object", "properties": {}, "additionalProperties": False},
            },
        },
        {
            "type": "function",
            "function": {
                "name": "done",
                "description": "Finish editing.",
                "parameters": {"type": "object", "properties": {}, "additionalProperties": False},
            },
        },
    ]

    if messages is None:
        rendered_prompt = _render_template(
            prompt_template,
            text=text,
            text_length=len(text),
            error="",
        )
        rendered_system = _render_template(
            system_prompt,
            text=text,
            text_length=len(text),
            error="",
        )
        messages = [
            {"role": "system", "content": rendered_system},
            {"role": "user", "content": rendered_prompt},
        ]
    else:
        # Work on a copy so the appends below do not mutate the caller's list.
        messages = list(messages)

    done = False
    last_error: Optional[str] = None
    current_text = text

    for _ in range(max_rounds):
        had_tool_error = False
        response = chat_completion(
            client=client,
            messages=messages,
            tools=tools,
            tool_choice="auto",
        )
        tool_calls = response.tool_calls
        if not tool_calls:
            # A plain-text reply is not accepted: record it and ask for tool calls.
            content = response.text or ""
            last_error = "Tool loop requires tool calls (str_replace/view/done)"
            messages.append({"role": "assistant", "content": content})
            messages.append(
                {
                    "role": "user",
                    "content": _build_no_tool_calls_message(
                        assistant_message=content,
                        current_text=current_text,
                    ),
                }
            )
            continue
        messages.append(
            {
                "role": "assistant",
                "content": response.text or "",
                "tool_calls": list(tool_calls),
            }
        )
        edit_count = 0
        for tool_call in tool_calls:
            # Tolerate a missing or null "function" entry instead of failing on .get().
            function = tool_call.get("function") or {}
            name = str(function.get("name") or "")
            if name == "str_replace":
                edit_count += 1
                if edit_count > max_edits_per_round:
                    last_error = "Tool loop exceeded max edits per round"
                    tool_result = f"Error: {last_error}"
                else:
                    try:
                        # Only str_replace takes arguments, so only it needs parsing.
                        args = _parse_tool_arguments(function.get("arguments"))
                    except ValueError as exc:
                        # Model-generated arguments may be malformed; let the model retry
                        # rather than crashing the whole loop.
                        last_error = str(exc)
                        tool_result = f"Error: {last_error}"
                        had_tool_error = True
                    else:
                        old_str = str(args.get("old_str", ""))
                        new_str = str(args.get("new_str", ""))
                        if not old_str or not new_str:
                            last_error = "Tool loop requires non-empty old_str and new_str"
                            tool_result = f"Error: {last_error}"
                        else:
                            try:
                                current_text = apply_str_replace(current_text, old_str, new_str)
                                tool_result = (
                                    "Applied str_replace.\nCurrent text:\n---\n"
                                    f"{current_text}\n---"
                                )
                                # A successful edit clears any earlier error.
                                last_error = None
                            except ValueError as exc:
                                last_error = str(exc)
                                tool_result = f"Error: {last_error}"
                                had_tool_error = True
            elif name == "view":
                tool_result = f"Current text:\n---\n{current_text}\n---"
            elif name == "done":
                done = True
                tool_result = "Done"
            else:
                raise ValueError(f"Tool loop received unknown tool: {name}")
            # Every tool call gets exactly one tool message carrying its result.
            messages.append(
                {
                    "role": "tool",
                    "tool_call_id": tool_call.get("id", ""),
                    "content": tool_result,
                }
            )
        if had_tool_error and last_error is not None:
            # A failed edit overrides a done call made in the same round.
            done = False
            messages.append(
                {
                    "role": "user",
                    "content": _build_tool_error_message(
                        error_message=last_error,
                        current_text=current_text,
                    ),
                }
            )
            continue
        if validate_text is not None:
            validation_errors = list(validate_text(current_text))
            if validation_errors:
                last_error = "; ".join(validation_errors)
                # Invalid text is never accepted as finished, even if done was called.
                done = False
                retry_message = _build_retry_message(
                    validation_errors=validation_errors,
                    current_text=current_text,
                    build_retry_message=build_retry_message,
                )
                messages.append({"role": "user", "content": retry_message})
                continue
        if done:
            break

    return ToolLoopResult(
        text=current_text,
        done=done,
        last_error=last_error,
        messages=messages,
    )
242
+
243
+
244
+ def _build_retry_message(
245
+ *,
246
+ validation_errors: Sequence[str],
247
+ current_text: str,
248
+ build_retry_message: Optional[Callable[[Sequence[str], str], str]],
249
+ ) -> str:
250
+ if build_retry_message is not None:
251
+ return build_retry_message(validation_errors, current_text)
252
+ error_lines = "\n".join(f"- {error}" for error in validation_errors)
253
+ return (
254
+ "Your last edit did not validate.\n"
255
+ "Issues:\n"
256
+ f"{error_lines}\n\n"
257
+ "Please fix the markup using str_replace and keep the source text unchanged.\n"
258
+ "Current text:\n"
259
+ f"---\n{current_text}\n---"
260
+ )
261
+
262
+
263
+ def _build_tool_error_message(*, error_message: str, current_text: str) -> str:
264
+ if "not unique" in error_message:
265
+ guidance = (
266
+ "Use a longer unique old_str by including surrounding words or punctuation "
267
+ "so it matches exactly once."
268
+ )
269
+ elif "not found" in error_message:
270
+ guidance = (
271
+ "Copy the exact old_str from the current text (including punctuation/case) "
272
+ "or call view to inspect the latest text."
273
+ )
274
+ else:
275
+ guidance = "Fix the tool call and try again."
276
+ return (
277
+ "Your last tool call failed.\n"
278
+ f"Error: {error_message}\n"
279
+ f"{guidance}\n"
280
+ "Current text:\n"
281
+ f"---\n{current_text}\n---"
282
+ )
283
+
284
+
285
+ def _build_no_tool_calls_message(*, assistant_message: str, current_text: str) -> str:
286
+ guidance = (
287
+ "Use the tools to edit the text. "
288
+ "Call str_replace to insert markup, view to inspect, and done when finished."
289
+ )
290
+ message = "Your last response did not include any tool calls."
291
+ if assistant_message.strip():
292
+ message = f"{message}\nAssistant message: {assistant_message}"
293
+ return f"{message}\n" f"{guidance}\n" "Current text:\n" f"---\n{current_text}\n---"
294
+
295
+
296
def request_confirmation(
    *,
    result: ToolLoopResult,
    text: str,
    client: LlmClientConfig,
    system_prompt: str,
    prompt_template: str,
    max_rounds: int,
    max_edits_per_round: int,
    apply_str_replace: Callable[[str, str, str], str],
    confirmation_message: str,
    validate_text: Optional[Callable[[str], Sequence[str]]] = None,
    build_retry_message: Optional[Callable[[Sequence[str], str], str]] = None,
) -> ToolLoopResult:
    """
    Continue a tool loop with a confirmation message appended to the conversation history.

    The history from ``result`` is copied, the confirmation is added as a new user
    message, and :func:`run_tool_loop` is run again on that history. This keeps the
    model's prior tool calls while giving it a chance to confirm an empty/ambiguous result.

    :param result: Result of the previous tool loop run whose history is continued.
    :type result: ToolLoopResult
    :param text: Text handed to the continued loop as its starting text.
    :type text: str
    :param client: LLM client configuration.
    :type client: biblicus.ai.models.LlmClientConfig
    :param system_prompt: System prompt, forwarded unchanged to the tool loop.
    :type system_prompt: str
    :param prompt_template: User prompt template, forwarded unchanged to the tool loop.
    :type prompt_template: str
    :param max_rounds: Maximum number of rounds for the continued loop.
    :type max_rounds: int
    :param max_edits_per_round: Maximum edits per round.
    :type max_edits_per_round: int
    :param apply_str_replace: Replacement function for str_replace edits.
    :type apply_str_replace: Callable[[str, str, str], str]
    :param confirmation_message: User message asking the model to confirm or revise.
    :type confirmation_message: str
    :param validate_text: Optional validation callback returning error messages.
    :type validate_text: Callable[[str], Sequence[str]] or None
    :param build_retry_message: Optional retry message builder.
    :type build_retry_message: Callable[[Sequence[str], str], str] or None
    :return: Result of the continued tool loop.
    :rtype: ToolLoopResult
    """
    # Build a new list so the history stored on ``result`` is left untouched.
    continued_history = [
        *result.messages,
        {"role": "user", "content": confirmation_message},
    ]
    loop_arguments = {
        "text": text,
        "client": client,
        "system_prompt": system_prompt,
        "prompt_template": prompt_template,
        "max_rounds": max_rounds,
        "max_edits_per_round": max_edits_per_round,
        "apply_str_replace": apply_str_replace,
        "validate_text": validate_text,
        "build_retry_message": build_retry_message,
        "messages": continued_history,
    }
    return run_tool_loop(**loop_arguments)
330
+
331
+
332
+ def _render_template(template: str, *, text: str, text_length: int, error: str) -> str:
333
+ rendered = template.replace("{text_length}", str(text_length)).replace("{error}", error)
334
+ return rendered.replace("{text}", text)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: biblicus
3
- Version: 0.14.0
3
+ Version: 0.15.1
4
4
  Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
5
5
  License: MIT
6
6
  Requires-Python: >=3.9
@@ -9,6 +9,8 @@ License-File: LICENSE
9
9
  Requires-Dist: pydantic>=2.0
10
10
  Requires-Dist: PyYAML>=6.0
11
11
  Requires-Dist: pypdf>=4.0
12
+ Requires-Dist: Jinja2>=3.1
13
+ Requires-Dist: dotyaml>=0.1.3
12
14
  Provides-Extra: dev
13
15
  Requires-Dist: behave>=1.2.6; extra == "dev"
14
16
  Requires-Dist: coverage[toml]>=7.0; extra == "dev"
@@ -18,6 +20,9 @@ Requires-Dist: sphinx_rtd_theme>=2.0; extra == "dev"
18
20
  Requires-Dist: ruff>=0.4.0; extra == "dev"
19
21
  Requires-Dist: black>=24.0; extra == "dev"
20
22
  Requires-Dist: python-semantic-release>=9.0.0; extra == "dev"
23
+ Provides-Extra: dspy
24
+ Requires-Dist: dspy>=2.5; extra == "dspy"
25
+ Requires-Dist: litellm>=1.0; extra == "dspy"
21
26
  Provides-Extra: openai
22
27
  Requires-Dist: openai>=1.0; extra == "openai"
23
28
  Provides-Extra: unstructured
@@ -40,6 +45,8 @@ Provides-Extra: docling-mlx
40
45
  Requires-Dist: docling[mlx-vlm]>=2.0.0; extra == "docling-mlx"
41
46
  Provides-Extra: topic-modeling
42
47
  Requires-Dist: bertopic>=0.15.0; extra == "topic-modeling"
48
+ Provides-Extra: markov-analysis
49
+ Requires-Dist: hmmlearn>=0.3.0; extra == "markov-analysis"
43
50
  Provides-Extra: datasets
44
51
  Requires-Dist: datasets>=2.18.0; extra == "datasets"
45
52
  Dynamic: license-file
@@ -50,18 +57,33 @@ Dynamic: license-file
50
57
  ![Coverage][coverage-badge]
51
58
  ![Documentation][documentation-badge]
52
59
 
53
- Make your documents usable by your assistant, then decide later how you will search and retrieve them.
54
-
60
+ <p>
61
+ <img
62
+ src="docs/_static/Biblicus-logo.png"
63
+ alt="Biblicus logo"
64
+ align="right"
65
+ width="216"
66
+ />
67
+ Make your documents usable by your assistant, then decide later how you will search and retrieve them.
68
+ </p>
55
69
  If you are building an assistant in Python, you probably have material you want it to use: notes, documents, web pages, and reference files. A common approach is retrieval augmented generation, where a system retrieves relevant material and uses it as evidence when generating a response.
56
70
 
57
71
  The first practical problem is not retrieval. It is collection and care. You need a stable place to put raw items, you need a small amount of metadata so you can find them again, and you need a way to evolve your retrieval approach over time without rewriting ingestion.
58
72
 
59
- This library gives you a corpus, which is a normal folder on disk. It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw corpus.
73
+ Biblicus gives you a normal folder on disk to manage. In Biblicus documentation, that managed folder is called a *corpus* (plural: *corpora*). It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw files.
60
74
 
61
75
  It can be used alongside LangGraph, Tactus, Pydantic AI, any agent framework, or your own setup. Use it from Python or from the command line interface.
62
76
 
63
77
  See [retrieval augmented generation overview] for a short introduction to the idea.
64
78
 
79
+ ## Analysis highlights
80
+
81
+ - `biblicus analyze markov` learns a directed, weighted state transition graph over segmented text.
82
+ - YAML recipes support cascading composition plus dotted `--config key=value` overrides.
83
+ - Text extract splits long texts with an LLM by inserting XML tags in-place for structured spans.
84
+ - See `docs/MARKOV_ANALYSIS.md` for Markov analysis details and runnable demos.
85
+ - See `docs/TEXT_EXTRACT.md` for the text extract utility and examples.
86
+
65
87
  ## Start with a knowledge base
66
88
 
67
89
  If you just want to hand a folder to your assistant and move on, use the high-level knowledge base interface. The folder can be nothing more than a handful of plain text files. You are not choosing a retrieval strategy yet. You are just collecting.
@@ -106,7 +128,7 @@ Think in three stages.
106
128
 
107
129
  If you learn a few project words, the rest of the system becomes predictable.
108
130
 
109
- - Corpus is the folder that holds raw items and their metadata.
131
+ - Corpus is the managed folder that holds raw items and their metadata.
110
132
  - Item is the raw bytes plus optional metadata and source information.
111
133
  - Catalog is the rebuildable index of the corpus.
112
134
  - Extraction run is a recorded extraction build that produces text artifacts.
@@ -161,28 +183,28 @@ sequenceDiagram
161
183
  This repository is a working Python package. Install it into a virtual environment from the repository root.
162
184
 
163
185
  ```
164
- python3 -m pip install -e .
186
+ python -m pip install -e .
165
187
  ```
166
188
 
167
189
  After the first release, you can install it from Python Package Index.
168
190
 
169
191
  ```
170
- python3 -m pip install biblicus
192
+ python -m pip install biblicus
171
193
  ```
172
194
 
173
195
  ### Optional extras
174
196
 
175
197
  Some extractors are optional so the base install stays small.
176
198
 
177
- - Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
178
- - Advanced optical character recognition with PaddleOCR: `python3 -m pip install "biblicus[paddleocr]"`
179
- - Document understanding with Docling VLM: `python3 -m pip install "biblicus[docling]"`
180
- - Document understanding with Docling VLM and MLX acceleration: `python3 -m pip install "biblicus[docling-mlx]"`
181
- - Speech to text transcription with OpenAI: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
182
- - Speech to text transcription with Deepgram: `python3 -m pip install "biblicus[deepgram]"` (requires a Deepgram API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
183
- - Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
184
- - MarkItDown document conversion (requires Python 3.10 or higher): `python3 -m pip install "biblicus[markitdown]"`
185
- - Topic modeling analysis with BERTopic: `python3 -m pip install "biblicus[topic-modeling]"`
199
+ - Optical character recognition for images: `python -m pip install "biblicus[ocr]"`
200
+ - Advanced optical character recognition with PaddleOCR: `python -m pip install "biblicus[paddleocr]"`
201
+ - Document understanding with Docling VLM: `python -m pip install "biblicus[docling]"`
202
+ - Document understanding with Docling VLM and MLX acceleration: `python -m pip install "biblicus[docling-mlx]"`
203
+ - Speech to text transcription with OpenAI: `python -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
204
+ - Speech to text transcription with Deepgram: `python -m pip install "biblicus[deepgram]"` (requires a Deepgram API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
205
+ - Broad document parsing fallback: `python -m pip install "biblicus[unstructured]"`
206
+ - MarkItDown document conversion (requires Python 3.10 or higher): `python -m pip install "biblicus[markitdown]"`
207
+ - Topic modeling analysis with BERTopic: `python -m pip install "biblicus[topic-modeling]"`
186
208
 
187
209
  ## Quick start
188
210
 
@@ -200,16 +222,49 @@ biblicus build --corpus corpora/example --backend scan
200
222
  biblicus query --corpus corpora/example --query "note"
201
223
  ```
202
224
 
203
- If you want to turn a website section into corpus items, crawl a root web address while restricting the crawl to an allowed prefix:
225
+ ## Web Ingestion
226
+
227
+ Biblicus supports ingesting content directly from the web using two approaches.
228
+
229
+ ### Ingest from URLs
204
230
 
231
+ Ingest individual documents or web pages from URLs. The `ingest` command automatically detects content types including PDF, HTML, Markdown, images, and audio:
232
+
233
+ ```bash
234
+ # Ingest a document from a URL
235
+ biblicus ingest https://example.com/document.pdf --tags "research"
236
+
237
+ # Ingest a web page
238
+ biblicus ingest https://example.com/article.html --tags "article"
239
+
240
+ # Ingest with a corpus path specified
241
+ biblicus ingest --corpus corpora/example https://docs.example.com/guide.md --tags "documentation"
205
242
  ```
206
- biblicus crawl --corpus corpora/example \\
207
- --root-url https://example.com/docs/index.html \\
208
- --allowed-prefix https://example.com/docs/ \\
209
- --max-items 50 \\
210
- --tag crawled
243
+
244
+ ### Crawl Websites
245
+
246
+ Crawl entire website sections with automatic link discovery. The crawler follows links within the allowed prefix and stores discovered content:
247
+
248
+ ```bash
249
+ # Crawl a documentation site
250
+ biblicus crawl \
251
+ --corpus corpora/example \
252
+ --root-url https://docs.example.com/ \
253
+ --allowed-prefix https://docs.example.com/ \
254
+ --max-items 100 \
255
+ --tags "documentation"
256
+
257
+ # Crawl a specific blog category
258
+ biblicus crawl \
259
+ --corpus corpora/example \
260
+ --root-url https://blog.example.com/category/tutorials/ \
261
+ --allowed-prefix https://blog.example.com/category/tutorials/ \
262
+ --max-items 50 \
263
+ --tags "tutorials,blog"
211
264
  ```
212
265
 
266
+ The `--allowed-prefix` parameter restricts the crawler to only follow links that start with the specified URL prefix, preventing it from crawling outside the intended scope. The crawler respects `.biblicusignore` rules and stores items under `raw/imports/crawl/` in your corpus.
267
+
213
268
  ## End-to-end example: lower-level control
214
269
 
215
270
  The command-line interface returns JavaScript Object Notation by default. This makes it easy to use Biblicus in scripts and to treat retrieval as a deterministic, testable step.
@@ -490,7 +545,7 @@ Three backends are included.
490
545
 
491
546
  - `scan` is a minimal baseline that scans raw items directly.
492
547
  - `sqlite-full-text-search` is a practical baseline that builds a full text search index in SQLite.
493
- - `vector` is a deterministic term-frequency vector baseline with cosine similarity scoring.
548
+ - `tf-vector` is a deterministic term-frequency vector baseline with cosine similarity scoring.
494
549
 
495
550
  For detailed documentation including configuration options, performance characteristics, and usage examples, see the [Backend Reference][backend-reference].
496
551
 
@@ -540,6 +595,21 @@ For detailed documentation on all extractors, see the [Extractor Reference][extr
540
595
  For extraction evaluation workflows, dataset formats, and report interpretation, see
541
596
  `docs/EXTRACTION_EVALUATION.md`.
542
597
 
598
+ ## Text extract utility
599
+
600
+ Text extract is a reusable analysis utility that lets a model insert XML tags into a long text without re-emitting the
601
+ entire document. It returns structured spans and the marked-up text, and it is used as a segmentation option in Markov
602
+ analysis.
603
+
604
+ See `docs/TEXT_EXTRACT.md` for the utility API and examples, and `docs/MARKOV_ANALYSIS.md` for the Markov integration.
605
+
606
+ ## Text slice utility
607
+
608
+ Text slice is a reusable analysis utility that lets a model insert `<slice/>` markers into a long text without
609
+ re-emitting the entire document. It returns ordered slices and the marked-up text for auditing and reuse.
610
+
611
+ See `docs/TEXT_SLICE.md` for the utility API and examples.
612
+
543
613
  ## Topic modeling analysis
544
614
 
545
615
  Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
@@ -594,7 +664,7 @@ AG News integration runs require `biblicus[datasets]` in addition to `biblicus[t
594
664
  For a repeatable, real-world integration run that downloads AG News and executes topic modeling, use:
595
665
 
596
666
  ```
597
- python3 scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
667
+ python scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
598
668
  ```
599
669
 
600
670
  See `docs/TOPIC_MODELING.md` for parameter examples and per-topic output behavior.
@@ -608,13 +678,13 @@ Use `scripts/download_pdf_samples.py` to download a small Portable Document Form
608
678
  ## Tests and coverage
609
679
 
610
680
  ```
611
- python3 scripts/test.py
681
+ python scripts/test.py
612
682
  ```
613
683
 
614
684
  To include integration scenarios that download public test data at runtime, run this command.
615
685
 
616
686
  ```
617
- python3 scripts/test.py --integration
687
+ python scripts/test.py --integration
618
688
  ```
619
689
 
620
690
  ## Releases
@@ -632,13 +702,13 @@ Reference documentation is generated from Sphinx style docstrings.
632
702
  Install development dependencies:
633
703
 
634
704
  ```
635
- python3 -m pip install -e ".[dev]"
705
+ python -m pip install -e ".[dev]"
636
706
  ```
637
707
 
638
708
  Build the documentation:
639
709
 
640
710
  ```
641
- python3 -m sphinx -b html docs docs/_build/html
711
+ python -m sphinx -b html docs docs/_build/html
642
712
  ```
643
713
 
644
714
  ## License
@@ -1,6 +1,6 @@
1
- biblicus/__init__.py,sha256=S14cBWrRwFcNLNmhl2viTvy2CXIfK8-vcBixsh5y49U,496
1
+ biblicus/__init__.py,sha256=X0FEFK03CEs4FUqCxWrZmSV1tPd6p5IEd3GgelqqXqk,496
2
2
  biblicus/__main__.py,sha256=ipfkUoTlocVnrQDM69C7TeBqQxmHVeiWMRaT3G9rtnk,117
3
- biblicus/cli.py,sha256=cMoirLFPhTwftNuqaadajCcRUEz_FBaLkupjVxpAxO8,38403
3
+ biblicus/cli.py,sha256=APxBNLztfGTVGgxt5qgCofYti2AsGGeZQ7SB9tkneXQ,41760
4
4
  biblicus/constants.py,sha256=gAlEVJhxdFj-eWWJrlYbP7H1X3c5gwhrIBq9NQ1Vq_E,371
5
5
  biblicus/context.py,sha256=iXRFGpf_5YDPsDsm_iTK6nCvtUWDoYVI7op-l2QU3uA,10189
6
6
  biblicus/corpus.py,sha256=qSDnYJXhWlF2p_BbFLl6xtI53lIIPxwyKLLGLC432Sg,55612
@@ -18,27 +18,32 @@ biblicus/ignore.py,sha256=fyjt34E6tWNNrm1FseOhgH2MgryyVBQVzxhKL5s4aio,1800
18
18
  biblicus/inference.py,sha256=_k00AIPoXD2lruiTB-JUagtY4f_WKcdzA3axwiq1tck,3512
19
19
  biblicus/knowledge_base.py,sha256=JmlJw8WD_fgstuq1PyWVzU9kzvVzyv7_xOvhS70xwUw,6654
20
20
  biblicus/models.py,sha256=r28O6cg3d1bjJnKqpLieVLTgtXTfzb_60wMORvVuDN0,15846
21
+ biblicus/recipes.py,sha256=rqU66QnjOup6O8Y9Yq7XszmpoM0Pyrjw3RrfdnlVqgE,4210
21
22
  biblicus/retrieval.py,sha256=A1SI4WK5cX-WbtN6FJ0QQxqlEOtQhddLrL0LZIuoTC4,4180
22
23
  biblicus/sources.py,sha256=EFy8-rQNLsyzz-98mH-z8gEHMYbqigcNFKLaR92KfDE,7241
23
24
  biblicus/time.py,sha256=3BSKOSo7R10K-0Dzrbdtl3fh5_yShTYqfdlKvvdkx7M,485
24
25
  biblicus/uris.py,sha256=xXD77lqsT9NxbyzI1spX9Y5a3-U6sLYMnpeSAV7g-nM,2013
25
26
  biblicus/user_config.py,sha256=UXUYBNUN4FR37ggZGJG1wv3K8XzsMR8pXW1T18lrivw,6495
26
- biblicus/_vendor/dotyaml/__init__.py,sha256=WAWdbFNFqO5cJPthxA8Kx-L76Bh07sKMosUxC_3o9qA,375
27
+ biblicus/_vendor/dotyaml/__init__.py,sha256=OVv6IsuCvsjaUznLzuit4UbSLVg4TiTVm9cOPY1Y2Cs,409
27
28
  biblicus/_vendor/dotyaml/interpolation.py,sha256=FVUkdQr_KbXjoFPvGTv6I5v0X5iZkJe5yhZtYKRbYzI,1991
28
- biblicus/_vendor/dotyaml/loader.py,sha256=zy_zinR5fiatmRyZSiELHv1vVz1Y2eRSboSf_x3kfi4,5623
29
+ biblicus/_vendor/dotyaml/loader.py,sha256=vFfnhbvHYYyOKzl5iq2FH97GSHH2GvEHmGiPnE0g0kA,6954
29
30
  biblicus/_vendor/dotyaml/transformer.py,sha256=RWNrm_KAsanG409HEIWquTH9i_jz-ZFK9fM86emXeF4,3724
30
- biblicus/analysis/__init__.py,sha256=Z4Wb4d-EoUuGHkcfRm9ILuZ8vr9FBqRxC0u1i6Fp_0w,1288
31
+ biblicus/ai/__init__.py,sha256=HY8PKhqRLIDYJYlL9A2JjqKxQaujITNLYgIytNUhnrU,1161
32
+ biblicus/ai/embeddings.py,sha256=n2xlonZOHcmDrP1XMhGcja5Hzr8r87PF-IecH-Yhu98,3703
33
+ biblicus/ai/llm.py,sha256=g724_UAxmicB_W-Z7Uu9SRsI9-aVNZUlYIjvnlE17VE,4712
34
+ biblicus/ai/models.py,sha256=6newnT0NJf3uf9FvWXVC-9Gkk5xRB-PjXDZpeBHA04Y,7857
35
+ biblicus/analysis/__init__.py,sha256=I4LqxfKPKF3DEVmAyagQ8J1RN-ia3fyfKJ9frCllZQE,1385
31
36
  biblicus/analysis/base.py,sha256=gB4ilvyMpiWU1m_ydy2dIHGP96ZFIFvVUL9iVDZKPJM,1265
32
- biblicus/analysis/llm.py,sha256=VjkZDKauHCDfj-TP-bTbI6a9WAXEIDe8bEiwErPx9xc,3309
33
- biblicus/analysis/models.py,sha256=LuR52w27JRzV-Mr-WAOduZrBOCTrp5uYkMc46QHTRrI,27300
37
+ biblicus/analysis/markov.py,sha256=XiNbfg8lmKWXCzQJhZtZxghT7gBPh2o6xA3D1-tMQqs,61762
38
+ biblicus/analysis/models.py,sha256=FnUAO6n1yjagYlhe2ocRtfb3IZT3W03rrEZ9LdCx7Kc,56214
34
39
  biblicus/analysis/profiling.py,sha256=v2B4Tn9WiXRRP_wIADBPRQVKkMc92KXCas7OBa7n0LU,10670
35
40
  biblicus/analysis/schema.py,sha256=MCiAQJmijVk8iM8rOUYbzyaDwsMR-Oo86iZU5NCbDMM,435
36
- biblicus/analysis/topic_modeling.py,sha256=ZGXvm2MyU6plxz2FE1RQU-3bra6QZ-t8EJj8kG1TW0M,19438
41
+ biblicus/analysis/topic_modeling.py,sha256=mNBiRMpY5Jtyz8Aj-WXYY8guEghx9jozTfgveinJLoc,22135
37
42
  biblicus/backends/__init__.py,sha256=3HJY0oMm8pFFVGC4Z-dlPRHhIPVDdUzsa4IMjKP_9dI,1378
38
43
  biblicus/backends/base.py,sha256=Erfj9dXg0nkRKnEcNjHR9_0Ddb2B1NvbmRksVm_g1dU,1776
39
44
  biblicus/backends/hybrid.py,sha256=FjCzrXdicbM-qVDM-_AHYTT8Gzvem1eZJaQUVKk_tMg,10753
40
45
  biblicus/backends/scan.py,sha256=hdNnQWqi5IH6j95w30BZHxLJ0W9PTaOkqfWJuxCCEMI,12478
41
- biblicus/backends/sqlite_full_text_search.py,sha256=VAn4fDdfiaS1Rn6zHlYz3E10_3vMU9P94QU8cL0l8Mk,24466
46
+ biblicus/backends/sqlite_full_text_search.py,sha256=z0WsURTj4w8Y6SPa6k4bV9wdze-HwNWgxj5qcab0TUE,24480
42
47
  biblicus/backends/vector.py,sha256=3RdxSBPb1kOX4Sfd4d1qXFW9ecuiRvGpOHadLCbeh1g,15183
43
48
  biblicus/extractors/__init__.py,sha256=ci3oldbdQZ8meAfHccM48CqQtZsPSRg3HkPrBSZF15M,2673
44
49
  biblicus/extractors/base.py,sha256=ka-nz_1zHPr4TS9sU4JfOoY-PJh7lbHPBOEBrbQFGSc,2171
@@ -58,9 +63,19 @@ biblicus/extractors/select_override.py,sha256=gSpffFmn1ux9pGtFvHD5Uu_LO8TmmJC4L_
58
63
  biblicus/extractors/select_smart_override.py,sha256=-sLMnNoeXbCB3dO9zflQq324eHuLbd6hpveSwduXP-U,6763
59
64
  biblicus/extractors/select_text.py,sha256=w0ATmDy3tWWbOObzW87jGZuHbgXllUhotX5XyySLs-o,3395
60
65
  biblicus/extractors/unstructured_text.py,sha256=l2S_wD_htu7ZHoJQNQtP-kGlEgOeKV_w2IzAC93lePE,3564
61
- biblicus-0.14.0.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
62
- biblicus-0.14.0.dist-info/METADATA,sha256=caVN-IcUlwaRRdid40XN6FBcdJmFkMBldyW-ySf-6pk,28088
63
- biblicus-0.14.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
64
- biblicus-0.14.0.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
65
- biblicus-0.14.0.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
66
- biblicus-0.14.0.dist-info/RECORD,,
66
+ biblicus/text/__init__.py,sha256=MiaGAY7xWlUCeBzDzNz6pJnSMiU_Ge5EmlSiEzhqTRo,947
67
+ biblicus/text/annotate.py,sha256=asmpj3_s_t8hl6stEg99apmqxAhDTkoPzHhZNggYE3Y,8355
68
+ biblicus/text/extract.py,sha256=pdnUiZWtfCUj7kZK5zhd-tjqokgmhYYheWhyN3iShRU,7669
69
+ biblicus/text/link.py,sha256=Xl0yxD1rvbRJRVdWG_ZP6zgmbpgWSJYcUcNM06-OUWU,20077
70
+ biblicus/text/markup.py,sha256=8jj9aX03HiZTOWdPs_VC4JLpQ7TlPHgGuXj_QUQIHVw,6265
71
+ biblicus/text/models.py,sha256=REp6RowUWFdV-6y437JENP7XtGKt57BOvVtF91KmUqI,10853
72
+ biblicus/text/prompts.py,sha256=Z5fSsy1Xzr0rCI0WZ3djiEQlbRDncyNBQ7_ZoWVPL4g,6704
73
+ biblicus/text/redact.py,sha256=tkDRmA0VvOZwMryEmBPLEHf3Z6VHJkkaWjBaNIMyGZ0,8415
74
+ biblicus/text/slice.py,sha256=dlHxGO8c5P8BszXGwlNQoQ-cyWjJf6PfS1LUBJXXGEE,5762
75
+ biblicus/text/tool_loop.py,sha256=w1PGLBvIemOdi6l0ArdYDVL7zgx-RC76bBOO0PKqpt0,11831
76
+ biblicus-0.15.1.dist-info/licenses/LICENSE,sha256=lw44GXFG_Q0fS8m5VoEvv_xtdBXK26pBcbSPUCXee_Q,1078
77
+ biblicus-0.15.1.dist-info/METADATA,sha256=TujS_sOnPhf7nGhulkFJJh50nskTWtjcNLv-E5LLles,30906
78
+ biblicus-0.15.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
79
+ biblicus-0.15.1.dist-info/entry_points.txt,sha256=BZmO4H8Uz00fyi1RAFryOCGfZgX7eHWkY2NE-G54U5A,47
80
+ biblicus-0.15.1.dist-info/top_level.txt,sha256=sUD_XVZwDxZ29-FBv1MknTGh4mgDXznGuP28KJY_WKc,9
81
+ biblicus-0.15.1.dist-info/RECORD,,