perag 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. perag-0.1.0/.gitignore +221 -0
  2. perag-0.1.0/.idea/.gitignore +3 -0
  3. perag-0.1.0/.idea/encodings.xml +4 -0
  4. perag-0.1.0/.idea/inspectionProfiles/Project_Default.xml +13 -0
  5. perag-0.1.0/.idea/inspectionProfiles/profiles_settings.xml +6 -0
  6. perag-0.1.0/.idea/misc.xml +7 -0
  7. perag-0.1.0/.idea/modules.xml +8 -0
  8. perag-0.1.0/.idea/perag.iml +8 -0
  9. perag-0.1.0/.idea/vcs.xml +6 -0
  10. perag-0.1.0/.idea/workspace.xml +82 -0
  11. perag-0.1.0/.python-version +1 -0
  12. perag-0.1.0/CLAUDE.md +312 -0
  13. perag-0.1.0/LICENSE +201 -0
  14. perag-0.1.0/PKG-INFO +19 -0
  15. perag-0.1.0/chunkers/__init__.py +0 -0
  16. perag-0.1.0/chunkers/base.py +10 -0
  17. perag-0.1.0/chunkers/docx.py +31 -0
  18. perag-0.1.0/chunkers/markdown.py +57 -0
  19. perag-0.1.0/chunkers/pdf.py +50 -0
  20. perag-0.1.0/chunkers/registry.py +26 -0
  21. perag-0.1.0/chunkers/text.py +48 -0
  22. perag-0.1.0/config.example.toml +10 -0
  23. perag-0.1.0/db/__init__.py +0 -0
  24. perag-0.1.0/db/search.py +38 -0
  25. perag-0.1.0/db/store.py +109 -0
  26. perag-0.1.0/docs/chunking.md +23 -0
  27. perag-0.1.0/docs/embedders.md +35 -0
  28. perag-0.1.0/docs/pipeline.md +46 -0
  29. perag-0.1.0/embedders/__init__.py +0 -0
  30. perag-0.1.0/embedders/base.py +17 -0
  31. perag-0.1.0/embedders/local.py +27 -0
  32. perag-0.1.0/embedders/ollama.py +29 -0
  33. perag-0.1.0/embedders/openai.py +30 -0
  34. perag-0.1.0/embedders/registry.py +18 -0
  35. perag-0.1.0/perag/__init__.py +0 -0
  36. perag-0.1.0/perag/cli.py +189 -0
  37. perag-0.1.0/perag/config.py +63 -0
  38. perag-0.1.0/perag/schema.py +36 -0
  39. perag-0.1.0/pyproject.toml +40 -0
  40. perag-0.1.0/release.sh +20 -0
  41. perag-0.1.0/skills/SKILL.md +120 -0
  42. perag-0.1.0/tests/__init__.py +0 -0
  43. perag-0.1.0/tests/fixtures/sample.md +17 -0
  44. perag-0.1.0/tests/fixtures/sample.txt +15 -0
  45. perag-0.1.0/tests/test_chunkers.py +53 -0
  46. perag-0.1.0/tests/test_db.py +76 -0
  47. perag-0.1.0/tests/test_embedders.py +64 -0
  48. perag-0.1.0/tests/test_pipeline.py +33 -0
  49. perag-0.1.0/uv.lock +1752 -0
perag-0.1.0/.gitignore ADDED
@@ -0,0 +1,221 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ # Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ # poetry.lock
109
+ # poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ # pdm.lock
116
+ # pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ # pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # Redis
135
+ *.rdb
136
+ *.aof
137
+ *.pid
138
+
139
+ # RabbitMQ
140
+ mnesia/
141
+ rabbitmq/
142
+ rabbitmq-data/
143
+
144
+ # ActiveMQ
145
+ activemq-data/
146
+
147
+ # SageMath parsed files
148
+ *.sage.py
149
+
150
+ # Environments
151
+ .env
152
+ .envrc
153
+ .venv
154
+ env/
155
+ venv/
156
+ ENV/
157
+ env.bak/
158
+ venv.bak/
159
+
160
+ # Spyder project settings
161
+ .spyderproject
162
+ .spyproject
163
+
164
+ # Rope project settings
165
+ .ropeproject
166
+
167
+ # mkdocs documentation
168
+ /site
169
+
170
+ # mypy
171
+ .mypy_cache/
172
+ .dmypy.json
173
+ dmypy.json
174
+
175
+ # Pyre type checker
176
+ .pyre/
177
+
178
+ # pytype static type analyzer
179
+ .pytype/
180
+
181
+ # Cython debug symbols
182
+ cython_debug/
183
+
184
+ # PyCharm
185
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
186
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
187
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
188
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
189
+ # .idea/
190
+
191
+ # Abstra
192
+ # Abstra is an AI-powered process automation framework.
193
+ # Ignore directories containing user credentials, local state, and settings.
194
+ # Learn more at https://abstra.io/docs
195
+ .abstra/
196
+
197
+ # Visual Studio Code
198
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
199
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
200
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
201
+ # you could uncomment the following to ignore the entire vscode folder
202
+ # .vscode/
203
+ # Temporary file for partial code execution
204
+ tempCodeRunnerFile.py
205
+
206
+ # Ruff stuff:
207
+ .ruff_cache/
208
+
209
+ # PyPI configuration file
210
+ .pypirc
211
+
212
+ # Marimo
213
+ marimo/_static/
214
+ marimo/_lsp/
215
+ __marimo__/
216
+
217
+ # Streamlit
218
+ .streamlit/secrets.toml
219
+ /.claude/
220
+ /.perag/
221
+ /.venv/
@@ -0,0 +1,3 @@
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
@@ -0,0 +1,4 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="Encoding" addBOMForNewFiles="with NO BOM" />
4
+ </project>
@@ -0,0 +1,13 @@
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="PyStubPackagesAdvertiser" enabled="true" level="WARNING" enabled_by_default="true">
5
+ <option name="ignoredPackages">
6
+ <list>
7
+ <option value="pyspark-stubs==3.0.0.dev8" />
8
+ </list>
9
+ </option>
10
+ </inspection_tool>
11
+ <inspection_tool class="ReassignedToPlainText" enabled="false" level="WARNING" enabled_by_default="false" />
12
+ </profile>
13
+ </component>
@@ -0,0 +1,6 @@
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
@@ -0,0 +1,7 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="Black">
4
+ <option name="sdkName" value="Python 3.13 virtualenv at ~/github/pet/.venv" />
5
+ </component>
6
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.13 virtualenv at ~/github/pet/.venv" project-jdk-type="Python SDK" />
7
+ </project>
@@ -0,0 +1,8 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/perag.iml" filepath="$PROJECT_DIR$/.idea/perag.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
@@ -0,0 +1,8 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="jdk" jdkName="Python 3.13 virtualenv at ~/github/pet/.venv" jdkType="Python SDK" />
6
+ <orderEntry type="sourceFolder" forTests="false" />
7
+ </component>
8
+ </module>
@@ -0,0 +1,6 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
@@ -0,0 +1,82 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="AutoImportSettings">
4
+ <option name="autoReloadType" value="SELECTIVE" />
5
+ </component>
6
+ <component name="ChangeListManager">
7
+ <list default="true" id="662c9670-5acf-45d8-9761-ba46d1f48197" name="Changes" comment="">
8
+ <change afterPath="$PROJECT_DIR$/.python-version" afterDir="false" />
9
+ <change afterPath="$PROJECT_DIR$/CLAUDE.md" afterDir="false" />
10
+ <change afterPath="$PROJECT_DIR$/chunkers/__init__.py" afterDir="false" />
11
+ <change afterPath="$PROJECT_DIR$/chunkers/base.py" afterDir="false" />
12
+ <change afterPath="$PROJECT_DIR$/chunkers/docx.py" afterDir="false" />
13
+ <change afterPath="$PROJECT_DIR$/chunkers/markdown.py" afterDir="false" />
14
+ <change afterPath="$PROJECT_DIR$/chunkers/pdf.py" afterDir="false" />
15
+ <change afterPath="$PROJECT_DIR$/chunkers/registry.py" afterDir="false" />
16
+ <change afterPath="$PROJECT_DIR$/chunkers/text.py" afterDir="false" />
17
+ <change afterPath="$PROJECT_DIR$/config.example.toml" afterDir="false" />
18
+ <change afterPath="$PROJECT_DIR$/db/__init__.py" afterDir="false" />
19
+ <change afterPath="$PROJECT_DIR$/db/search.py" afterDir="false" />
20
+ <change afterPath="$PROJECT_DIR$/db/store.py" afterDir="false" />
21
+ <change afterPath="$PROJECT_DIR$/docs/chunking.md" afterDir="false" />
22
+ <change afterPath="$PROJECT_DIR$/docs/embedders.md" afterDir="false" />
23
+ <change afterPath="$PROJECT_DIR$/docs/pipeline.md" afterDir="false" />
24
+ <change afterPath="$PROJECT_DIR$/embedders/__init__.py" afterDir="false" />
25
+ <change afterPath="$PROJECT_DIR$/embedders/base.py" afterDir="false" />
26
+ <change afterPath="$PROJECT_DIR$/embedders/local.py" afterDir="false" />
27
+ <change afterPath="$PROJECT_DIR$/embedders/ollama.py" afterDir="false" />
28
+ <change afterPath="$PROJECT_DIR$/embedders/openai.py" afterDir="false" />
29
+ <change afterPath="$PROJECT_DIR$/embedders/registry.py" afterDir="false" />
30
+ <change afterPath="$PROJECT_DIR$/perag/__init__.py" afterDir="false" />
31
+ <change afterPath="$PROJECT_DIR$/perag/cli.py" afterDir="false" />
32
+ <change afterPath="$PROJECT_DIR$/perag/config.py" afterDir="false" />
33
+ <change afterPath="$PROJECT_DIR$/perag/schema.py" afterDir="false" />
34
+ <change afterPath="$PROJECT_DIR$/pyproject.toml" afterDir="false" />
35
+ <change afterPath="$PROJECT_DIR$/skills/SKILL.md" afterDir="false" />
36
+ <change afterPath="$PROJECT_DIR$/uv.lock" afterDir="false" />
37
+ <change beforePath="$PROJECT_DIR$/.gitignore" beforeDir="false" afterPath="$PROJECT_DIR$/.gitignore" afterDir="false" />
38
+ </list>
39
+ <option name="SHOW_DIALOG" value="false" />
40
+ <option name="HIGHLIGHT_CONFLICTS" value="true" />
41
+ <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
42
+ <option name="LAST_RESOLUTION" value="IGNORE" />
43
+ </component>
44
+ <component name="Git.Settings">
45
+ <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
46
+ </component>
47
+ <component name="ProjectColorInfo"><![CDATA[{
48
+ "associatedIndex": 3
49
+ }]]></component>
50
+ <component name="ProjectId" id="3EDonFdWD89jDVglvrZjum7PGi0" />
51
+ <component name="ProjectViewState">
52
+ <option name="hideEmptyMiddlePackages" value="true" />
53
+ <option name="showLibraryContents" value="true" />
54
+ </component>
55
+ <component name="PropertiesComponent"><![CDATA[{
56
+ "keyToString": {
57
+ "ModuleVcsDetector.initialDetectionPerformed": "true",
58
+ "RunOnceActivity.ShowReadmeOnStart": "true",
59
+ "git-widget-placeholder": "main",
60
+ "junie.onboarding.icon.badge.shown": "true",
61
+ "last_opened_file_path": "/Users/verhasp/github/perag",
62
+ "settings.editor.selected.configurable": "preferences.pluginManager"
63
+ }
64
+ }]]></component>
65
+ <component name="SharedIndexes">
66
+ <attachedChunks>
67
+ <set>
68
+ <option value="bundled-python-sdk-9f8e2b94138c-36ea0e71a18c-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-251.26094.141" />
69
+ </set>
70
+ </attachedChunks>
71
+ </component>
72
+ <component name="TaskManager">
73
+ <task active="true" id="Default" summary="Default task">
74
+ <changelist id="662c9670-5acf-45d8-9761-ba46d1f48197" name="Changes" comment="" />
75
+ <created>1779722665327</created>
76
+ <option name="number" value="Default" />
77
+ <option name="presentableId" value="Default" />
78
+ <updated>1779722665327</updated>
79
+ </task>
80
+ <servers />
81
+ </component>
82
+ </project>
@@ -0,0 +1 @@
1
+ 3.14
perag-0.1.0/CLAUDE.md ADDED
@@ -0,0 +1,312 @@
1
+ # Perago — Personal RAG Toolkit
2
+
3
+ ## Project Overview
4
+
5
+ **Perago** (Latin: *to carry through to completion*) is a personal productivity tool for
6
+ non-developers who work with textual documents (PDF, Word, Markdown, plain text). It
7
+ provides a local, private RAG (Retrieval-Augmented Generation) pipeline that enriches
8
+ prompts with relevant context retrieved from a personal document collection.
9
+
10
+ The command-line tool is called `perag`. It is intentionally not a service — no server,
11
+ no daemon, no cloud dependency. It runs locally, stores data locally, and is invoked
12
+ from the command line or by Claude Code via a SKILL.md.
13
+
14
+ ---
15
+
16
+ ## Repository Structure
17
+
18
+ ```
19
+ perago/
20
+ ├── CLAUDE.md # This file
21
+ ├── README.md # User-facing documentation
22
+ ├── pyproject.toml # Root package definition (if monorepo build)
23
+ ├── config.example.toml # Example configuration file
24
+
25
+ ├── perag/ # Main CLI package (entry point: perag <subcommand>)
26
+ │ ├── __init__.py
27
+ │ ├── cli.py # Subcommand dispatcher (init/chunk/embed/ingest/query)
28
+ │ ├── config.py # Config loading from config.toml
29
+ │ └── schema.py # Shared JSON chunk schema (dataclass/TypedDict)
30
+
31
+ ├── chunkers/ # Format-aware chunkers (one module per format)
32
+ │ ├── __init__.py
33
+ │ ├── base.py # Abstract base class: Chunker.chunk(path) -> [Chunk]
34
+ │ ├── pdf.py # PDF chunking via pdfplumber
35
+ │ ├── docx.py # Word chunking via python-docx
36
+ │ ├── markdown.py # Markdown chunking via markdown-it-py
37
+ │ ├── text.py # Plain text / paragraph-aware fallback
38
+ │ └── registry.py # Maps file extension -> Chunker class
39
+
40
+ ├── embedders/ # Embedding providers (one module per provider)
41
+ │ ├── __init__.py
42
+ │ ├── base.py # Abstract base class: Embedder.embed([str]) -> [[float]]
43
+ │ ├── ollama.py # Ollama HTTP API
44
+ │ ├── openai.py # OpenAI embeddings API
45
+ │ ├── local.py # sentence-transformers (fully local, no API key)
46
+ │ └── registry.py # Maps provider name -> Embedder class
47
+
48
+ ├── db/ # sqlite-vec database layer
49
+ │ ├── __init__.py
50
+ │ ├── store.py # Schema init, upsert, meta table management
51
+ │ └── search.py # ANN query, returns top-k chunks
52
+
53
+ ├── tests/
54
+ │ ├── fixtures/ # Sample PDF, DOCX, MD, TXT files for testing
55
+ │ ├── test_chunkers.py
56
+ │ ├── test_embedders.py
57
+ │ ├── test_db.py
58
+ │ └── test_pipeline.py # End-to-end: chunk -> embed -> ingest -> query
59
+
60
+ ├── skills/
61
+ │ └── SKILL.md # Claude Code skill: how to use perag from Claude Code
62
+
63
+ └── docs/
64
+ ├── chunking.md # How chunking works per format
65
+ ├── embedders.md # Supported embedding providers and configuration
66
+ └── pipeline.md # Full pipeline walkthrough
67
+ ```
68
+
69
+ ---
70
+
71
+ ## Subcommand Design
72
+
73
+ The pipeline subcommands (`chunk`, `embed`, `ingest`) exchange JSON over stdin/stdout, making the pipeline composable:
74
+
75
+ ```bash
76
+ # Full pipeline (piped)
77
+ perag chunk document.pdf | perag embed | perag ingest
78
+
79
+ # Full pipeline (with intermediate files for inspection/debugging)
80
+ perag chunk document.pdf > chunks.json
81
+ perag embed < chunks.json > chunks_embedded.json
82
+ perag ingest < chunks_embedded.json
83
+
84
+ # Query
85
+ perag query "what are the termination conditions?"
86
+ ```
87
+
88
+ ### `perag chunk <file>`
89
+ - Detects format from extension
90
+ - Dispatches to the appropriate chunker in `chunkers/`
91
+ - Outputs a JSON array of Chunk objects to stdout
92
+
93
+ ### `perag embed`
94
+ - Reads JSON array of Chunk objects from stdin
95
+ - Calls the configured embedding provider in batches
96
+ - Outputs the same JSON array with `vector` field added
97
+
98
+ ### `perag ingest`
99
+ - Reads JSON array of embedded Chunk objects from stdin
100
+ - Writes to the sqlite-vec database
101
+ - Enforces vector dimension, model name, and provider consistency via the `meta` table
102
+ - Upserts by `id` (re-ingesting an updated document replaces existing chunks)
103
+
104
+ ### `perag init`
105
+ - Creates `.perag/` in the current directory
106
+ - Writes a minimal `config.toml` that inherits from `~/.perag/config.toml` when that global file exists
107
+ - Adds `.perag/perag.db` to `.gitignore` if a `.gitignore` is present
108
+ - Safe to re-run — never overwrites an existing config
109
+
110
+ ### `perag query "<text>"`
111
+ - Embeds the query text using the configured provider
112
+ - Performs ANN search against the sqlite-vec database
113
+ - Outputs top-k chunks as plain text (suitable for Claude Code context injection)
114
+ - `--json` flag outputs structured JSON instead
115
+
116
+ ---
117
+
118
+ ## JSON Chunk Schema
119
+
120
+ Every chunk flowing through the pipeline conforms to this schema:
121
+
122
+ ```json
123
+ {
124
+ "id": "contracts/nda_2024.pdf::chunk::7",
125
+ "source": "contracts/nda_2024.pdf",
126
+ "content": "The agreement shall terminate upon 30 days written notice...",
127
+ "metadata": {
128
+ "format": "pdf",
129
+ "page": 3,
130
+ "section": "Termination"
131
+ },
132
+ "embedding_model": "nomic-embed-text",
133
+ "embedding_provider": "ollama",
134
+ "vector": [0.021, -0.134, 0.087, "..."]
135
+ }
136
+ ```
137
+
138
+ After `perag chunk`, the fields `embedding_model`, `embedding_provider`, and `vector`
139
+ are all `null`. After `perag embed` all three are populated by the embedder with its
140
+ own identity. The `metadata` fields are format-specific and optional for downstream
141
+ consumers.
142
+
143
+ ### Embedder behaviour with pre-embedded chunks
144
+
145
+ `perag embed` inspects `embedding_model` on every incoming chunk before deciding what
146
+ to do:
147
+
148
+ | `embedding_model` in chunk | Matches current config? | Action |
149
+ |---|---|---|
150
+ | `null` | — | Embed, populate all three fields |
151
+ | set | yes | Skip — pass through unchanged |
152
+ | set | no | Re-embed, overwrite vector and embedding fields |
153
+
154
+ This means re-running `perag embed` after changing providers is safe and correct —
155
+ leftover JSON files from a previous run are detected and re-embedded automatically.
156
+
157
+ ### Ingestor validation
158
+
159
+ The ingestor enforces consistency between the chunk and the database before writing:
160
+
161
+ | Chunk state | Action |
162
+ |---|---|
163
+ | `vector` is `null` | Hard error: *"chunks have no vectors — run `perag embed` first"* |
164
+ | `embedding_model` and `embedding_provider` both match `meta` table | Ingest (upsert by `id`) |
165
+ | `embedding_model` or `embedding_provider` differs from `meta` table | Hard error: *"embedding model mismatch — re-run `perag embed` or rebuild the database"* |
166
+
167
+ ---
168
+
169
+ ## Database Design
170
+
171
+ The database is a single SQLite file (`perag.db`) located in `.perag/` in the current
172
+ directory; if no local `.perag/` exists, it falls back to `~/.perag/perag.db`.
173
+
174
+ Tables:
175
+ - **`chunks`**: id, source, content, metadata (JSON), vector (sqlite-vec column)
176
+ - **`meta`**: embedding model name, embedding provider, vector dimensions, creation timestamp
177
+
178
+ On first `perag ingest`, the meta table is written. On subsequent ingests, both
179
+ `embedding_model` and `embedding_provider` are validated against the meta table — a
180
+ mismatch on either is a hard error with a clear message directing the user to re-embed
181
+ or rebuild the database.
182
+
183
+ ---
184
+
185
+ ## Configuration
186
+
187
+ Perago uses a local-first lookup strategy, the same pattern as `.git` and `.claude`.
188
+ The tool always checks the current directory first and falls back to the user-level
189
+ global config.
190
+
191
+ ### Lookup order
192
+
193
+ ```
194
+ ./.perag/config.toml # project-local config (may be committed)
195
+ ./.perag/perag.db # project-local database
196
+ ~/.perag/config.toml # user-level defaults (fallback)
197
+ ~/.perag/perag.db # user-level database (fallback)
198
+ ```
199
+
200
+ A researcher with three document collections simply has three directories, each with
201
+ its own `.perag/`. The `cd` is the context switch — no flags, no project names.
202
+
203
+ The global `~/.perag/config.toml` holds the user's preferred embedding provider and
204
+ model so every new project inherits sensible defaults without repeating configuration.
205
+ A project-local config only needs to override what differs from the global defaults.
206
+
207
+ ### `.gitignore` recommendation
208
+
209
+ ```
210
+ # large, machine-generated — never commit
+ .perag/perag.db
211
+ # optional: drop this entry and commit the file if you want to share project config
+ .perag/config.toml
212
+ ```
213
+
214
+ ### Config file format
215
+
216
+ `~/.perag/config.toml` (user-level defaults):
217
+
218
+ ```toml
219
+ [embedding]
220
+ provider = "ollama" # ollama | openai | local
221
+ model = "nomic-embed-text"
222
+ url = "http://localhost:11434" # only for ollama
223
+ # api_key = "sk-..." # only for openai
224
+ batch_size = 32
225
+
226
+ [query]
227
+ top_k = 5
228
+ output = "text" # text | json
229
+ ```
230
+
231
+ `./.perag/config.toml` (project-local override — only specify what differs):
232
+
233
+ ```toml
234
+ [embedding]
235
+ model = "mxbai-embed-large" # override model for this project only
236
+ ```
237
+
238
+ ---
239
+
240
+ ## Development Conventions
241
+
242
+ ### Python version
243
+ Python 3.11+. No older versions. Use `match` statements freely.
244
+
245
+ ### Dependencies
246
+ Managed with `uv`. Lock file committed. No unpinned dependencies in production code.
247
+
248
+ ```
249
+ pdfplumber # PDF parsing
250
+ python-docx # Word parsing
251
+ markdown-it-py # Markdown parsing
252
+ sqlite-vec # Vector search SQLite extension
253
+ sentence-transformers # Local embeddings (optional)
254
+ httpx # HTTP client for Ollama/OpenAI
255
+ tomllib # Config parsing (stdlib in 3.11+)
256
+ typer # CLI framework
257
+ rich # Terminal output formatting
258
+ ```
259
+
260
+ ### Code style
261
+ - `ruff` for linting and formatting
262
+ - Type annotations on all public functions
263
+ - Docstrings on all public classes and methods
264
+ - No global state — everything flows through config and explicit arguments
265
+
266
+ ### Testing
267
+ - `pytest` with fixtures in `tests/fixtures/`
268
+ - Each chunker must have at least one real-file test (not mocked)
269
+ - The embedder tests mock the HTTP/model calls — no network in CI
270
+ - One end-to-end pipeline test using the `local` embedder (no API key needed)
271
+
272
+ ### Adding a new chunker
273
+ 1. Create `chunkers/<format>.py` implementing `base.Chunker`
274
+ 2. Register it in `chunkers/registry.py`
275
+ 3. Add at least one fixture file to `tests/fixtures/`
276
+ 4. Add tests in `tests/test_chunkers.py`
277
+ 5. Document the chunking strategy in `docs/chunking.md`
278
+
279
+ ### Adding a new embedder
280
+ 1. Create `embedders/<provider>.py` implementing `base.Embedder`
281
+ 2. Register it in `embedders/registry.py`
282
+ 3. Add config documentation in `docs/embedders.md`
283
+
284
+ ---
285
+
286
+ ## Claude Code Integration (SKILL.md)
287
+
288
+ The `skills/SKILL.md` file teaches Claude Code how to use `perag` as a context
289
+ enrichment tool. The typical workflow Claude Code should follow:
290
+
291
+ 1. When given a task that might benefit from document context, run:
292
+ `perag query "<relevant aspect of the task>"`
293
+ 2. Prepend the output to the working context before responding
294
+ 3. When new documents are provided, run the full pipeline:
295
+ `perag chunk <file> | perag embed | perag ingest`
296
+
297
+ The SKILL.md lives inside the repo so it is versioned alongside the tool itself.
298
+ Users copy or symlink it into their Claude Code skills directory.
299
+
300
+ ---
301
+
302
+ ## Non-Goals
303
+
304
+ - No web UI
305
+ - No REST API or daemon mode
306
+ - No multi-user support
307
+ - No cloud sync
308
+ - No support for source code files (use RustRAG or similar for that)
309
+ - No streaming ingestion of live data sources
310
+
311
+ This is a tool for a person with a folder of documents who wants to ask questions
312
+ across them. It should stay that simple.