biblicus 0.6.0__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {biblicus-0.6.0/src/biblicus.egg-info → biblicus-0.7.0}/PKG-INFO +20 -3
- {biblicus-0.6.0 → biblicus-0.7.0}/README.md +17 -2
- {biblicus-0.6.0 → biblicus-0.7.0}/docs/DEMOS.md +19 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/docs/EXTRACTION.md +21 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/docs/FEATURE_INDEX.md +2 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/docs/ROADMAP.md +15 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/environment.py +26 -0
- biblicus-0.7.0/features/markitdown_extractor.feature +99 -0
- biblicus-0.7.0/features/steps/markitdown_steps.py +173 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/pyproject.toml +5 -1
- {biblicus-0.6.0 → biblicus-0.7.0}/scripts/test.py +15 -4
- biblicus-0.7.0/scripts/wikipedia_rag_demo.py +212 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/__init__.py +1 -1
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/extractors/__init__.py +2 -0
- biblicus-0.7.0/src/biblicus/extractors/markitdown_text.py +128 -0
- {biblicus-0.6.0 → biblicus-0.7.0/src/biblicus.egg-info}/PKG-INFO +20 -3
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus.egg-info/SOURCES.txt +4 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus.egg-info/requires.txt +5 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/LICENSE +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/MANIFEST.in +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/THIRD_PARTY_NOTICES.md +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/datasets/wikipedia_mini.json +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/docs/ARCHITECTURE.md +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/docs/BACKENDS.md +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/docs/CONTEXT_PACK.md +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/docs/CORPUS.md +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/docs/CORPUS_DESIGN.md +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/docs/KNOWLEDGE_BASE.md +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/docs/TESTING.md +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/docs/USER_CONFIGURATION.md +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/docs/api.rst +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/docs/conf.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/docs/index.rst +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/backend_validation.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/biblicus_corpus.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/cli_entrypoint.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/cli_parsing.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/content_sniffing.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/context_pack.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/context_pack_cli.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/corpus_edge_cases.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/corpus_identity.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/corpus_purge.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/crawl.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/error_cases.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/evaluation.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/evidence_processing.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/extraction_error_handling.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/extraction_run_lifecycle.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/extraction_selection.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/extraction_selection_longest.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/extractor_pipeline.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/extractor_validation.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/frontmatter.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/hook_config_validation.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/hook_error_handling.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/import_tree.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/ingest_sources.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/integration_audio_samples.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/integration_image_samples.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/integration_mixed_corpus.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/integration_mixed_extraction.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/integration_ocr_image_extraction.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/integration_pdf_retrieval.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/integration_pdf_samples.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/integration_unstructured_extraction.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/integration_wikipedia.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/knowledge_base.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/lifecycle_hooks.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/model_validation.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/ocr_extractor.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/pdf_text_extraction.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/python_api.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/python_hook_logging.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/query_processing.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/retrieval_budget.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/retrieval_scan.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/retrieval_sqlite_full_text_search.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/retrieval_uses_extraction_run.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/retrieval_utilities.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/source_loading.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/steps/backend_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/steps/cli_parsing_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/steps/cli_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/steps/context_pack_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/steps/crawl_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/steps/evidence_processing_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/steps/extraction_run_lifecycle_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/steps/extraction_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/steps/extractor_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/steps/frontmatter_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/steps/knowledge_base_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/steps/model_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/steps/openai_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/steps/pdf_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/steps/python_api_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/steps/rapidocr_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/steps/retrieval_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/steps/stt_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/steps/unstructured_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/steps/user_config_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/streaming_ingest.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/stt_extractor.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/text_extraction_runs.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/token_budget.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/unstructured_extractor.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/features/user_config.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/scripts/download_audio_samples.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/scripts/download_image_samples.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/scripts/download_mixed_samples.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/scripts/download_pdf_samples.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/scripts/download_wikipedia.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/scripts/readme_end_to_end_demo.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/setup.cfg +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/__main__.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/_vendor/dotyaml/__init__.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/_vendor/dotyaml/interpolation.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/_vendor/dotyaml/loader.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/_vendor/dotyaml/transformer.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/backends/__init__.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/backends/base.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/backends/scan.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/backends/sqlite_full_text_search.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/cli.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/constants.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/context.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/corpus.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/crawl.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/errors.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/evaluation.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/evidence_processing.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/extraction.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/extractors/base.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/extractors/metadata_text.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/extractors/openai_stt.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/extractors/pass_through_text.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/extractors/pdf_text.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/extractors/pipeline.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/extractors/rapidocr_text.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/extractors/select_longest_text.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/extractors/select_text.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/extractors/unstructured_text.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/frontmatter.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/hook_logging.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/hook_manager.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/hooks.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/ignore.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/knowledge_base.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/models.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/retrieval.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/sources.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/time.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/uris.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus/user_config.py +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus.egg-info/dependency_links.txt +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus.egg-info/entry_points.txt +0 -0
- {biblicus-0.6.0 → biblicus-0.7.0}/src/biblicus.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: biblicus
|
|
3
|
-
Version: 0.6.0
|
|
3
|
+
Version: 0.7.0
|
|
4
4
|
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
5
|
License: MIT
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -25,6 +25,8 @@ Requires-Dist: unstructured>=0.12.0; extra == "unstructured"
|
|
|
25
25
|
Requires-Dist: python-docx>=1.1.0; extra == "unstructured"
|
|
26
26
|
Provides-Extra: ocr
|
|
27
27
|
Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == "ocr"
|
|
28
|
+
Provides-Extra: markitdown
|
|
29
|
+
Requires-Dist: markitdown[all]>=0.1.0; python_version >= "3.10" and extra == "markitdown"
|
|
28
30
|
Dynamic: license-file
|
|
29
31
|
|
|
30
32
|
# Biblicus
|
|
@@ -67,7 +69,7 @@ If you want to run a real, executable version of this story, use `scripts/readme
|
|
|
67
69
|
This simplified sequence diagram shows the same idea at a high level.
|
|
68
70
|
|
|
69
71
|
```mermaid
|
|
70
|
-
%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
72
|
+
%%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
71
73
|
sequenceDiagram
|
|
72
74
|
participant App as Your assistant code
|
|
73
75
|
participant KB as Knowledge base
|
|
@@ -106,7 +108,7 @@ In a coding assistant, retrieval is often triggered by what the user is doing ri
|
|
|
106
108
|
This diagram shows two sequential Biblicus calls. They are shown separately to make the boundaries explicit: retrieval returns evidence, and context pack building consumes evidence.
|
|
107
109
|
|
|
108
110
|
```mermaid
|
|
109
|
-
%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
111
|
+
%%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
110
112
|
sequenceDiagram
|
|
111
113
|
participant User
|
|
112
114
|
participant App as Your assistant code
|
|
@@ -160,6 +162,7 @@ Some extractors are optional so the base install stays small.
|
|
|
160
162
|
- Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
|
|
161
163
|
- Speech to text transcription: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
|
|
162
164
|
- Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
|
|
165
|
+
- MarkItDown document conversion (requires Python 3.10 or higher): `python3 -m pip install "biblicus[markitdown]"`
|
|
163
166
|
|
|
164
167
|
## Quick start
|
|
165
168
|
|
|
@@ -467,6 +470,20 @@ Two backends are included.
|
|
|
467
470
|
- `scan` is a minimal baseline that scans raw items directly.
|
|
468
471
|
- `sqlite-full-text-search` is a practical baseline that builds a full text search index in Sqlite.
|
|
469
472
|
|
|
473
|
+
## Extraction backends
|
|
474
|
+
|
|
475
|
+
These extractors are built in. Optional ones require extra dependencies.
|
|
476
|
+
|
|
477
|
+
- `pass-through-text` reads text items and strips Markdown front matter.
|
|
478
|
+
- `metadata-text` turns catalog metadata into a small text artifact.
|
|
479
|
+
- `pdf-text` extracts text from Portable Document Format items with `pypdf`.
|
|
480
|
+
- `select-text` chooses one prior extraction result in a pipeline.
|
|
481
|
+
- `select-longest-text` chooses the longest prior extraction result.
|
|
482
|
+
- `ocr-rapidocr` does optical character recognition on images (optional).
|
|
483
|
+
- `stt-openai` performs speech to text on audio (optional).
|
|
484
|
+
- `unstructured` provides broad document parsing (optional).
|
|
485
|
+
- `markitdown` converts many formats into Markdown-like text (optional).
|
|
486
|
+
|
|
470
487
|
## Integration corpus and evaluation dataset
|
|
471
488
|
|
|
472
489
|
Use `scripts/download_wikipedia.py` to download a small integration corpus from Wikipedia when running tests or demos. The repository does not include that content.
|
|
@@ -38,7 +38,7 @@ If you want to run a real, executable version of this story, use `scripts/readme
|
|
|
38
38
|
This simplified sequence diagram shows the same idea at a high level.
|
|
39
39
|
|
|
40
40
|
```mermaid
|
|
41
|
-
%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
41
|
+
%%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
42
42
|
sequenceDiagram
|
|
43
43
|
participant App as Your assistant code
|
|
44
44
|
participant KB as Knowledge base
|
|
@@ -77,7 +77,7 @@ In a coding assistant, retrieval is often triggered by what the user is doing ri
|
|
|
77
77
|
This diagram shows two sequential Biblicus calls. They are shown separately to make the boundaries explicit: retrieval returns evidence, and context pack building consumes evidence.
|
|
78
78
|
|
|
79
79
|
```mermaid
|
|
80
|
-
%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
80
|
+
%%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
81
81
|
sequenceDiagram
|
|
82
82
|
participant User
|
|
83
83
|
participant App as Your assistant code
|
|
@@ -131,6 +131,7 @@ Some extractors are optional so the base install stays small.
|
|
|
131
131
|
- Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
|
|
132
132
|
- Speech to text transcription: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
|
|
133
133
|
- Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
|
|
134
|
+
- MarkItDown document conversion (requires Python 3.10 or higher): `python3 -m pip install "biblicus[markitdown]"`
|
|
134
135
|
|
|
135
136
|
## Quick start
|
|
136
137
|
|
|
@@ -438,6 +439,20 @@ Two backends are included.
|
|
|
438
439
|
- `scan` is a minimal baseline that scans raw items directly.
|
|
439
440
|
- `sqlite-full-text-search` is a practical baseline that builds a full text search index in Sqlite.
|
|
440
441
|
|
|
442
|
+
## Extraction backends
|
|
443
|
+
|
|
444
|
+
These extractors are built in. Optional ones require extra dependencies.
|
|
445
|
+
|
|
446
|
+
- `pass-through-text` reads text items and strips Markdown front matter.
|
|
447
|
+
- `metadata-text` turns catalog metadata into a small text artifact.
|
|
448
|
+
- `pdf-text` extracts text from Portable Document Format items with `pypdf`.
|
|
449
|
+
- `select-text` chooses one prior extraction result in a pipeline.
|
|
450
|
+
- `select-longest-text` chooses the longest prior extraction result.
|
|
451
|
+
- `ocr-rapidocr` does optical character recognition on images (optional).
|
|
452
|
+
- `stt-openai` performs speech to text on audio (optional).
|
|
453
|
+
- `unstructured` provides broad document parsing (optional).
|
|
454
|
+
- `markitdown` converts many formats into Markdown-like text (optional).
|
|
455
|
+
|
|
441
456
|
## Integration corpus and evaluation dataset
|
|
442
457
|
|
|
443
458
|
Use `scripts/download_wikipedia.py` to download a small integration corpus from Wikipedia when running tests or demos. The repository does not include that content.
|
|
@@ -221,6 +221,25 @@ python3 -m biblicus build --corpus corpora/pdf_samples --backend sqlite-full-tex
|
|
|
221
221
|
python3 -m biblicus query --corpus corpora/pdf_samples --query "Dummy PDF file"
|
|
222
222
|
```
|
|
223
223
|
|
|
224
|
+
### Wikipedia retrieval demo (Python)
|
|
225
|
+
|
|
226
|
+
This example downloads a few Wikipedia summaries about retrieval and knowledge bases, builds an extraction run, creates a local full text index, and returns evidence plus a context pack.
|
|
227
|
+
|
|
228
|
+
```
|
|
229
|
+
rm -rf corpora/wikipedia_rag_demo
|
|
230
|
+
python3 scripts/wikipedia_rag_demo.py --corpus corpora/wikipedia_rag_demo --force
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
### MarkItDown extraction demo (Python 3.10+)
|
|
234
|
+
|
|
235
|
+
MarkItDown requires Python 3.10 or higher. This example uses the `py311` conda environment to run the extractor over the mixed sample corpus.
|
|
236
|
+
|
|
237
|
+
```
|
|
238
|
+
conda run -n py311 python -m pip install -e . "markitdown[all]"
|
|
239
|
+
conda run -n py311 python scripts/download_mixed_samples.py --corpus corpora/markitdown_demo_py311 --force
|
|
240
|
+
conda run -n py311 python -m biblicus extract build --corpus corpora/markitdown_demo_py311 --step markitdown
|
|
241
|
+
```
|
|
242
|
+
|
|
224
243
|
### Mixed modality integration corpus
|
|
225
244
|
|
|
226
245
|
This example assembles a tiny mixed corpus with a Markdown note, a Hypertext Markup Language page, an image, a Portable Document Format file with extractable text, and a generated Portable Document Format file with no extractable text.
|
|
@@ -71,6 +71,27 @@ To install:
|
|
|
71
71
|
python3 -m pip install "biblicus[unstructured]"
|
|
72
72
|
```
|
|
73
73
|
|
|
74
|
+
`markitdown`
|
|
75
|
+
|
|
76
|
+
- Converts common document formats into Markdown-like text
|
|
77
|
+
- Backed by the optional `markitdown` dependency
|
|
78
|
+
- Requires Python 3.10 or higher
|
|
79
|
+
- Skips items that are already text so the pass-through extractor remains the canonical choice for text items
|
|
80
|
+
- This means it will not process `text/html` or other text media types unless that policy changes
|
|
81
|
+
|
|
82
|
+
To install:
|
|
83
|
+
|
|
84
|
+
```
|
|
85
|
+
python3 -m pip install "biblicus[markitdown]"
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Example:
|
|
89
|
+
|
|
90
|
+
```
|
|
91
|
+
python3 -m biblicus extract build --corpus corpora/extraction-demo \
|
|
92
|
+
--step markitdown
|
|
93
|
+
```
|
|
94
|
+
|
|
74
95
|
`ocr-rapidocr`
|
|
75
96
|
|
|
76
97
|
- Optical character recognition for image items
|
|
@@ -123,6 +123,7 @@ What it does:
|
|
|
123
123
|
- Includes a Portable Document Format text extractor plugin.
|
|
124
124
|
- Includes a speech to text extractor plugin for audio items.
|
|
125
125
|
- Includes a selection extractor step for choosing extracted text within a pipeline.
|
|
126
|
+
- Includes a MarkItDown extractor plugin for document conversion.
|
|
126
127
|
|
|
127
128
|
Documentation:
|
|
128
129
|
|
|
@@ -139,6 +140,7 @@ Behavior specifications:
|
|
|
139
140
|
- `features/ocr_extractor.feature`
|
|
140
141
|
- `features/stt_extractor.feature`
|
|
141
142
|
- `features/unstructured_extractor.feature`
|
|
143
|
+
- `features/markitdown_extractor.feature`
|
|
142
144
|
- `features/integration_unstructured_extraction.feature`
|
|
143
145
|
|
|
144
146
|
Primary implementation:
|
|
@@ -124,6 +124,21 @@ Acceptance checks:
|
|
|
124
124
|
|
|
125
125
|
These are valuable, but intentionally not the near-term focus while retrieval becomes practical end to end.
|
|
126
126
|
|
|
127
|
+
### In-memory corpus for ephemeral workflows
|
|
128
|
+
|
|
129
|
+
Goal: allow programmatic, temporary corpora that live in memory for short-lived agents or tests.
|
|
130
|
+
|
|
131
|
+
Deliverables:
|
|
132
|
+
|
|
133
|
+
- A memory-backed corpus implementation that supports the same ingestion and catalog APIs.
|
|
134
|
+
- A serialization option for snapshots so ephemeral corpora can be persisted when needed.
|
|
135
|
+
- Documentation that explains tradeoffs versus file-based corpora.
|
|
136
|
+
|
|
137
|
+
Acceptance checks:
|
|
138
|
+
|
|
139
|
+
- Behavior specifications cover ingestion, listing, and reindexing in memory.
|
|
140
|
+
- Retrieval and extraction can operate on the in-memory corpus without special casing.
|
|
141
|
+
|
|
127
142
|
### Extractor datasets and evaluation harness
|
|
128
143
|
|
|
129
144
|
Goal: compare extraction approaches in a way that is measurable, repeatable, and useful for practical engineering decisions.
|
|
@@ -134,6 +134,32 @@ def after_scenario(context, scenario) -> None:
|
|
|
134
134
|
sys.modules.pop(name, None)
|
|
135
135
|
context._fake_rapidocr_unavailable_installed = False
|
|
136
136
|
context._fake_rapidocr_unavailable_original_modules = {}
|
|
137
|
+
if getattr(context, "_fake_markitdown_installed", False):
|
|
138
|
+
original_modules = getattr(context, "_fake_markitdown_original_modules", {})
|
|
139
|
+
for name in [
|
|
140
|
+
"markitdown",
|
|
141
|
+
]:
|
|
142
|
+
if name in original_modules:
|
|
143
|
+
sys.modules[name] = original_modules[name]
|
|
144
|
+
else:
|
|
145
|
+
sys.modules.pop(name, None)
|
|
146
|
+
context._fake_markitdown_installed = False
|
|
147
|
+
context._fake_markitdown_original_modules = {}
|
|
148
|
+
if getattr(context, "_fake_markitdown_unavailable_installed", False):
|
|
149
|
+
original_modules = getattr(context, "_fake_markitdown_unavailable_original_modules", {})
|
|
150
|
+
for name in [
|
|
151
|
+
"markitdown",
|
|
152
|
+
]:
|
|
153
|
+
if name in original_modules:
|
|
154
|
+
sys.modules[name] = original_modules[name]
|
|
155
|
+
else:
|
|
156
|
+
sys.modules.pop(name, None)
|
|
157
|
+
context._fake_markitdown_unavailable_installed = False
|
|
158
|
+
context._fake_markitdown_unavailable_original_modules = {}
|
|
159
|
+
original_sys_version_info = getattr(context, "_original_sys_version_info", None)
|
|
160
|
+
if original_sys_version_info is not None:
|
|
161
|
+
sys.version_info = original_sys_version_info
|
|
162
|
+
context._original_sys_version_info = None
|
|
137
163
|
if hasattr(context, "_tmp"):
|
|
138
164
|
context._tmp.cleanup()
|
|
139
165
|
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
Feature: MarkItDown extractor plugin
|
|
2
|
+
The MarkItDown extractor converts non-text items into Markdown-like text as an optional dependency.
|
|
3
|
+
|
|
4
|
+
Scenario: MarkItDown extractor requires an optional dependency
|
|
5
|
+
Given I initialized a corpus at "corpus"
|
|
6
|
+
And the MarkItDown dependency is unavailable
|
|
7
|
+
And a Portable Document Format file "hello.pdf" exists with text "Hello"
|
|
8
|
+
When I ingest the file "hello.pdf" into corpus "corpus"
|
|
9
|
+
And I attempt to build a "markitdown" extraction run in corpus "corpus"
|
|
10
|
+
Then the command fails with exit code 2
|
|
11
|
+
And standard error includes "biblicus[markitdown]"
|
|
12
|
+
|
|
13
|
+
Scenario: MarkItDown extractor rejects unsupported Python versions
|
|
14
|
+
Given I initialized a corpus at "corpus"
|
|
15
|
+
And a fake MarkItDown library is available but marked as real
|
|
16
|
+
And a Portable Document Format file "hello.pdf" exists with text "Hello"
|
|
17
|
+
When I ingest the file "hello.pdf" into corpus "corpus"
|
|
18
|
+
And I attempt to build a "markitdown" extraction run in corpus "corpus"
|
|
19
|
+
Then the command fails with exit code 2
|
|
20
|
+
And standard error includes "Python 3.10"
|
|
21
|
+
|
|
22
|
+
Scenario: MarkItDown extractor skips text items
|
|
23
|
+
Given I initialized a corpus at "corpus"
|
|
24
|
+
And a fake MarkItDown library is available
|
|
25
|
+
When I ingest the text "alpha" with title "Alpha" and tags "a" into corpus "corpus"
|
|
26
|
+
And I build a "markitdown" extraction run in corpus "corpus"
|
|
27
|
+
Then the extraction run does not include extracted text for the last ingested item
|
|
28
|
+
|
|
29
|
+
Scenario: MarkItDown extractor produces extracted text for a non-text item
|
|
30
|
+
Given I initialized a corpus at "corpus"
|
|
31
|
+
And a fake MarkItDown library is available that returns text "Extracted by MarkItDown" for filename "doc.pdf"
|
|
32
|
+
And a binary file "doc.pdf" exists
|
|
33
|
+
When I ingest the file "doc.pdf" into corpus "corpus"
|
|
34
|
+
And I build a "markitdown" extraction run in corpus "corpus"
|
|
35
|
+
Then the extraction run includes extracted text for the last ingested item
|
|
36
|
+
And the extracted text for the last ingested item equals "Extracted by MarkItDown"
|
|
37
|
+
And the extraction run item provenance uses extractor "markitdown"
|
|
38
|
+
|
|
39
|
+
Scenario: MarkItDown extractor records empty output when it cannot extract text
|
|
40
|
+
Given I initialized a corpus at "corpus"
|
|
41
|
+
And a fake MarkItDown library is available that returns empty output for filename "empty.pdf"
|
|
42
|
+
And a binary file "empty.pdf" exists
|
|
43
|
+
When I ingest the file "empty.pdf" into corpus "corpus"
|
|
44
|
+
And I build a "markitdown" extraction run in corpus "corpus"
|
|
45
|
+
Then the extraction run includes extracted text for the last ingested item
|
|
46
|
+
And the extracted text for the last ingested item is empty
|
|
47
|
+
And the extraction run stats include extracted_empty_items 1
|
|
48
|
+
|
|
49
|
+
Scenario: MarkItDown extractor records empty output when conversion returns None
|
|
50
|
+
Given I initialized a corpus at "corpus"
|
|
51
|
+
And a fake MarkItDown library is available that returns None for filename "none.pdf"
|
|
52
|
+
And a binary file "none.pdf" exists
|
|
53
|
+
When I ingest the file "none.pdf" into corpus "corpus"
|
|
54
|
+
And I build a "markitdown" extraction run in corpus "corpus"
|
|
55
|
+
Then the extraction run includes extracted text for the last ingested item
|
|
56
|
+
And the extracted text for the last ingested item is empty
|
|
57
|
+
And the extraction run stats include extracted_empty_items 1
|
|
58
|
+
|
|
59
|
+
Scenario: MarkItDown extractor accepts string results
|
|
60
|
+
Given I initialized a corpus at "corpus"
|
|
61
|
+
And a fake MarkItDown library is available that returns a string for filename "string.pdf"
|
|
62
|
+
And a binary file "string.pdf" exists
|
|
63
|
+
When I ingest the file "string.pdf" into corpus "corpus"
|
|
64
|
+
And I build a "markitdown" extraction run in corpus "corpus"
|
|
65
|
+
Then the extraction run includes extracted text for the last ingested item
|
|
66
|
+
And the extracted text for the last ingested item equals "Extracted by MarkItDown"
|
|
67
|
+
And the extraction run item provenance uses extractor "markitdown"
|
|
68
|
+
|
|
69
|
+
Scenario: MarkItDown extractor records empty output for non-text conversion output
|
|
70
|
+
Given I initialized a corpus at "corpus"
|
|
71
|
+
And a fake MarkItDown library is available that returns non-text output for filename "nonstr.pdf"
|
|
72
|
+
And a binary file "nonstr.pdf" exists
|
|
73
|
+
When I ingest the file "nonstr.pdf" into corpus "corpus"
|
|
74
|
+
And I build a "markitdown" extraction run in corpus "corpus"
|
|
75
|
+
Then the extracted text for the last ingested item is empty
|
|
76
|
+
And the extraction run stats include extracted_empty_items 1
|
|
77
|
+
|
|
78
|
+
Scenario: MarkItDown extractor ignores whitespace output
|
|
79
|
+
Given I initialized a corpus at "corpus"
|
|
80
|
+
And a fake MarkItDown library is available that returns whitespace output for filename "whitespace.pdf"
|
|
81
|
+
And a binary file "whitespace.pdf" exists
|
|
82
|
+
When I ingest the file "whitespace.pdf" into corpus "corpus"
|
|
83
|
+
And I build a "markitdown" extraction run in corpus "corpus"
|
|
84
|
+
Then the extracted text for the last ingested item is empty
|
|
85
|
+
And the extraction run stats include extracted_empty_items 1
|
|
86
|
+
|
|
87
|
+
Scenario: MarkItDown extractor records per-item errors and continues
|
|
88
|
+
Given I initialized a corpus at "corpus"
|
|
89
|
+
And a fake MarkItDown library is available that raises a RuntimeError for filename "boom.pdf"
|
|
90
|
+
And a binary file "boom.pdf" exists
|
|
91
|
+
And a fake MarkItDown library is available that returns text "ok" for filename "ok.pdf"
|
|
92
|
+
And a binary file "ok.pdf" exists
|
|
93
|
+
When I ingest the file "boom.pdf" into corpus "corpus"
|
|
94
|
+
And I ingest the file "ok.pdf" into corpus "corpus"
|
|
95
|
+
And I build a "markitdown" extraction run in corpus "corpus"
|
|
96
|
+
Then the extracted text for the last ingested item equals "ok"
|
|
97
|
+
And the extraction run includes an errored result for the first ingested item
|
|
98
|
+
And the extraction run error type for the first ingested item equals "RuntimeError"
|
|
99
|
+
And the extraction run stats include errored_items 1
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
import types
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Dict, Optional
|
|
7
|
+
|
|
8
|
+
from behave import given
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class _FakeMarkItDownBehavior:
    """
    Scripted response of the fake MarkItDown converter for a single filename.

    :ivar mode: Name of the scripted response. The fake ``convert`` in this module
        recognizes ``"text"``, ``"empty"``, ``"none"``, ``"string"``, ``"nonstring"``,
        ``"whitespace"`` and ``"error"``.
    :ivar text: Text payload used by the ``"text"`` and ``"string"`` modes; ``None``
        for the other modes.
    """

    # Selects which branch of the fake ``convert`` runs for the file.
    mode: str
    # Only read by the modes that hand text back to the caller.
    text: Optional[str] = None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _ensure_fake_markitdown_behaviors(context) -> Dict[str, _FakeMarkItDownBehavior]:
    """
    Return the per-filename behavior registry stored on the context.

    The registry is created and attached to the context the first time it is
    requested (or when the stored value is ``None``).

    :param context: Behave context; any object that supports attribute assignment.
    :return: The mapping held in ``context.fake_markitdown_behaviors``.
    :rtype: Dict[str, _FakeMarkItDownBehavior]
    """
    registry = getattr(context, "fake_markitdown_behaviors", None)
    if registry is not None:
        return registry

    # First use: attach a fresh registry so later steps share the same mapping.
    registry = {}
    context.fake_markitdown_behaviors = registry
    return registry
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _install_fake_markitdown_module(context) -> None:
    """
    Install a scripted stand-in for the ``markitdown`` package into ``sys.modules``.

    The stand-in exposes a ``MarkItDown`` class whose ``convert`` looks up the behavior
    registered for the converted file's name and responds accordingly. The module is
    tagged with ``__biblicus_fake__ = True``. The call does nothing when the context is
    already flagged as having the fake installed.

    :param context: Behave context that receives the install flag, the replaced
        module entries, and the behavior registry.
    :return: None.
    :rtype: None
    """
    import os

    if getattr(context, "_fake_markitdown_installed", False):
        return

    # Keep whatever was registered before the fake takes its place.
    # NOTE(review): presumably restored by scenario teardown -- not visible here.
    original_modules: Dict[str, object] = {
        name: sys.modules[name] for name in ("markitdown",) if name in sys.modules
    }

    behaviors = _ensure_fake_markitdown_behaviors(context)

    class _ConversionResult:
        """Minimal conversion result exposing only ``text_content``."""

        def __init__(self, text_content: object) -> None:
            self.text_content = text_content

    class MarkItDown:
        """Fake converter that answers from the behavior registry."""

        def __init__(self, *, enable_plugins: bool = False) -> None:
            self.enable_plugins = enable_plugins

        def convert(self, filename: str) -> object:
            """
            Return the scripted response registered for the file's name.

            :param filename: Path of the file being converted (string or path-like).
            :return: A result object, a bare string, or ``None``, depending on the mode.
            :rtype: object
            :raises RuntimeError: When the registered mode is ``"error"``.
            """
            # fspath accepts str and path-like objects; basename honors the
            # platform's separators instead of assuming "/".
            base_name = os.path.basename(os.fspath(filename))
            # Only the part after the first "--" is looked up; names without
            # "--" are used unchanged (split returns the whole name).
            normalized_name = base_name.split("--", 1)[-1]
            behavior = behaviors.get(normalized_name)
            if behavior is None:
                return _ConversionResult("")

            mode = behavior.mode
            if mode == "error":
                raise RuntimeError("fake markitdown error")
            if mode == "none":
                return None
            if mode == "string":
                # Deliberately a bare string rather than a result object.
                return behavior.text or ""
            if mode == "nonstring":
                return _ConversionResult(123)
            if mode == "whitespace":
                return _ConversionResult(" ")
            if mode == "text":
                return _ConversionResult(behavior.text or "")
            # "empty" and any unrecognized mode yield an empty result.
            return _ConversionResult("")

    markitdown_module = types.ModuleType("markitdown")
    markitdown_module.MarkItDown = MarkItDown
    markitdown_module.__biblicus_fake__ = True
    sys.modules["markitdown"] = markitdown_module

    context._fake_markitdown_installed = True
    context._fake_markitdown_original_modules = original_modules
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _install_markitdown_unavailable_module(context) -> None:
    """
    Replace ``sys.modules["markitdown"]`` with an empty module.

    The replacement module defines no ``MarkItDown`` attribute. Any previously
    registered entry is stored on the context, and the call does nothing when the
    context is already flagged as having the replacement installed.

    :param context: Behave context that receives the install flag and the replaced
        module entries.
    :return: None.
    :rtype: None
    """
    if getattr(context, "_fake_markitdown_unavailable_installed", False):
        return

    saved: Dict[str, object] = {}
    try:
        saved["markitdown"] = sys.modules["markitdown"]
    except KeyError:
        # Nothing was registered under that name, so there is nothing to keep.
        pass

    sys.modules["markitdown"] = types.ModuleType("markitdown")

    context._fake_markitdown_unavailable_installed = True
    context._fake_markitdown_unavailable_original_modules = saved
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@given("a fake MarkItDown library is available")
def step_fake_markitdown_available(context) -> None:
    """
    Install the fake ``markitdown`` module without registering any behavior.

    :param context: Behave context.
    :return: None.
    :rtype: None
    """
    _install_fake_markitdown_module(context)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@given(
    'a fake MarkItDown library is available that returns text "{text}" for filename "{filename}"'
)
def step_fake_markitdown_returns_text(context, text: str, filename: str) -> None:
    """
    Register the ``"text"`` behavior for a filename on the fake converter.

    :param context: Behave context.
    :param text: Text stored on the registered behavior.
    :param filename: File name the behavior is registered under.
    :return: None.
    :rtype: None
    """
    _install_fake_markitdown_module(context)
    scripted = _FakeMarkItDownBehavior(mode="text", text=text)
    _ensure_fake_markitdown_behaviors(context)[filename] = scripted
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@given(
    'a fake MarkItDown library is available that returns empty output for filename "{filename}"'
)
def step_fake_markitdown_returns_empty(context, filename: str) -> None:
    """
    Register the ``"empty"`` behavior for a filename on the fake converter.

    :param context: Behave context.
    :param filename: File name the behavior is registered under.
    :return: None.
    :rtype: None
    """
    _install_fake_markitdown_module(context)
    scripted = _FakeMarkItDownBehavior(mode="empty", text=None)
    _ensure_fake_markitdown_behaviors(context)[filename] = scripted
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
@given('a fake MarkItDown library is available that returns None for filename "{filename}"')
def step_fake_markitdown_returns_none(context, filename: str) -> None:
    """
    Register the ``"none"`` behavior for a filename on the fake converter.

    :param context: Behave context.
    :param filename: File name the behavior is registered under.
    :return: None.
    :rtype: None
    """
    _install_fake_markitdown_module(context)
    scripted = _FakeMarkItDownBehavior(mode="none", text=None)
    _ensure_fake_markitdown_behaviors(context)[filename] = scripted
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@given('a fake MarkItDown library is available that returns a string for filename "{filename}"')
def step_fake_markitdown_returns_string(context, filename: str) -> None:
    """
    Register the ``"string"`` behavior, with a fixed text, for a filename.

    :param context: Behave context.
    :param filename: File name the behavior is registered under.
    :return: None.
    :rtype: None
    """
    _install_fake_markitdown_module(context)
    scripted = _FakeMarkItDownBehavior(mode="string", text="Extracted by MarkItDown")
    _ensure_fake_markitdown_behaviors(context)[filename] = scripted
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
@given(
    'a fake MarkItDown library is available that returns non-text output for filename "{filename}"'
)
def step_fake_markitdown_returns_nonstring(context, filename: str) -> None:
    """
    Register the ``"nonstring"`` behavior for a filename on the fake converter.

    :param context: Behave context.
    :param filename: File name the behavior is registered under.
    :return: None.
    :rtype: None
    """
    _install_fake_markitdown_module(context)
    scripted = _FakeMarkItDownBehavior(mode="nonstring", text=None)
    _ensure_fake_markitdown_behaviors(context)[filename] = scripted
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
@given(
    'a fake MarkItDown library is available that raises a RuntimeError for filename "{filename}"'
)
def step_fake_markitdown_raises_error(context, filename: str) -> None:
    """
    Register the ``"error"`` behavior for a filename on the fake converter.

    :param context: Behave context.
    :param filename: File name the behavior is registered under.
    :return: None.
    :rtype: None
    """
    _install_fake_markitdown_module(context)
    scripted = _FakeMarkItDownBehavior(mode="error", text=None)
    _ensure_fake_markitdown_behaviors(context)[filename] = scripted
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
@given(
    'a fake MarkItDown library is available that returns whitespace output for filename "{filename}"'
)
def step_fake_markitdown_returns_whitespace(context, filename: str) -> None:
    """
    Register the ``"whitespace"`` behavior for a filename on the fake converter.

    :param context: Behave context.
    :param filename: File name the behavior is registered under.
    :return: None.
    :rtype: None
    """
    _install_fake_markitdown_module(context)
    scripted = _FakeMarkItDownBehavior(mode="whitespace", text=None)
    _ensure_fake_markitdown_behaviors(context)[filename] = scripted
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
@given("the MarkItDown dependency is unavailable")
def step_markitdown_dependency_unavailable(context) -> None:
    """
    Swap the ``markitdown`` entry in ``sys.modules`` for an empty module.

    :param context: Behave context.
    :return: None.
    :rtype: None
    """
    _install_markitdown_unavailable_module(context)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
@given("a fake MarkItDown library is available but marked as real")
def step_fake_markitdown_marked_real(context) -> None:
    """
    Install the fake module, clear its fake marker, and pin the reported Python version.

    After this step ``markitdown.__biblicus_fake__`` is ``False`` and
    ``sys.version_info`` is replaced with a plain ``(3, 9, 0, "final", 0)`` tuple.
    The previous ``sys.version_info`` is stored on the context the first time the
    step runs.

    :param context: Behave context.
    :return: None.
    :rtype: None
    """
    _install_fake_markitdown_module(context)
    markitdown_module = sys.modules.get("markitdown")
    if markitdown_module is not None:
        markitdown_module.__biblicus_fake__ = False
    # NOTE(review): nesting reconstructed from a flattened view -- confirm that the
    # two statements below sit at function level rather than inside the guard above.
    if not hasattr(context, "_original_sys_version_info"):
        # Saved only once so a repeated step cannot overwrite the real value.
        context._original_sys_version_info = sys.version_info
    # NOTE(review): a plain tuple has no .major/.minor attributes; presumably the code
    # under test only compares tuples, and teardown restores the saved value -- verify.
    sys.version_info = (3, 9, 0, "final", 0)
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "biblicus"
|
|
7
|
-
version = "0.6.0"
|
|
7
|
+
version = "0.7.0"
|
|
8
8
|
description = "Command line interface and Python library for corpus ingestion, retrieval, and evaluation."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -36,6 +36,9 @@ unstructured = [
|
|
|
36
36
|
ocr = [
|
|
37
37
|
"rapidocr-onnxruntime>=1.3.0",
|
|
38
38
|
]
|
|
39
|
+
# Optional MarkItDown extractor dependency. The environment marker limits
# installation to Python 3.10 and newer; on older interpreters the extra
# installs nothing.
markitdown = [
  "markitdown[all]>=0.1.0; python_version>='3.10'",
]
|
|
39
42
|
|
|
40
43
|
[project.scripts]
|
|
41
44
|
biblicus = "biblicus.cli:main"
|
|
@@ -54,6 +57,7 @@ omit = ["*/biblicus/_vendor/*"]
|
|
|
54
57
|
[tool.coverage.report]
|
|
55
58
|
show_missing = true
|
|
56
59
|
skip_covered = false
|
|
60
|
+
fail_under = 100
|
|
57
61
|
exclude_lines = [
|
|
58
62
|
"pragma: no cover",
|
|
59
63
|
"if __name__ == .__main__.:",
|
|
@@ -64,6 +64,8 @@ def main() -> int:
|
|
|
64
64
|
Scenarios that require the optional Unstructured dependency are tagged ``@unstructured``
|
|
65
65
|
and are excluded unless you also pass ``--unstructured``.
|
|
66
66
|
|
|
67
|
+
The coverage report enforces the configured minimum coverage threshold.
|
|
68
|
+
|
|
67
69
|
:return: Exit code.
|
|
68
70
|
:rtype: int
|
|
69
71
|
"""
|
|
@@ -100,12 +102,21 @@ def main() -> int:
|
|
|
100
102
|
behave_args.extend(["--tags", "~@ocr"])
|
|
101
103
|
if args.integration and not args.unstructured:
|
|
102
104
|
behave_args.extend(["--tags", "~@unstructured"])
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
105
|
+
behave_exit_code = _run(
|
|
106
|
+
[sys.executable, "-m", "coverage", "run", "-m", "behave", *behave_args],
|
|
107
|
+
env=env,
|
|
108
|
+
)
|
|
109
|
+
coverage_report_exit_code = _run(
|
|
110
|
+
[sys.executable, "-m", "coverage", "report", "-m"],
|
|
111
|
+
env=env,
|
|
112
|
+
)
|
|
113
|
+
coverage_html_exit_code = _run(
|
|
114
|
+
[sys.executable, "-m", "coverage", "html", "-d", str(htmlcov_dir)],
|
|
115
|
+
env=env,
|
|
116
|
+
)
|
|
106
117
|
|
|
107
118
|
print(f"Coverage report in Hypertext Markup Language: {htmlcov_dir / 'index.html'}")
|
|
108
|
-
return int(
|
|
119
|
+
return int(max(behave_exit_code, coverage_report_exit_code, coverage_html_exit_code))
|
|
109
120
|
|
|
110
121
|
|
|
111
122
|
if __name__ == "__main__":
|