perag 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- perag-0.1.0/.gitignore +221 -0
- perag-0.1.0/.idea/.gitignore +3 -0
- perag-0.1.0/.idea/encodings.xml +4 -0
- perag-0.1.0/.idea/inspectionProfiles/Project_Default.xml +13 -0
- perag-0.1.0/.idea/inspectionProfiles/profiles_settings.xml +6 -0
- perag-0.1.0/.idea/misc.xml +7 -0
- perag-0.1.0/.idea/modules.xml +8 -0
- perag-0.1.0/.idea/perag.iml +8 -0
- perag-0.1.0/.idea/vcs.xml +6 -0
- perag-0.1.0/.idea/workspace.xml +82 -0
- perag-0.1.0/.python-version +1 -0
- perag-0.1.0/CLAUDE.md +312 -0
- perag-0.1.0/LICENSE +201 -0
- perag-0.1.0/PKG-INFO +19 -0
- perag-0.1.0/chunkers/__init__.py +0 -0
- perag-0.1.0/chunkers/base.py +10 -0
- perag-0.1.0/chunkers/docx.py +31 -0
- perag-0.1.0/chunkers/markdown.py +57 -0
- perag-0.1.0/chunkers/pdf.py +50 -0
- perag-0.1.0/chunkers/registry.py +26 -0
- perag-0.1.0/chunkers/text.py +48 -0
- perag-0.1.0/config.example.toml +10 -0
- perag-0.1.0/db/__init__.py +0 -0
- perag-0.1.0/db/search.py +38 -0
- perag-0.1.0/db/store.py +109 -0
- perag-0.1.0/docs/chunking.md +23 -0
- perag-0.1.0/docs/embedders.md +35 -0
- perag-0.1.0/docs/pipeline.md +46 -0
- perag-0.1.0/embedders/__init__.py +0 -0
- perag-0.1.0/embedders/base.py +17 -0
- perag-0.1.0/embedders/local.py +27 -0
- perag-0.1.0/embedders/ollama.py +29 -0
- perag-0.1.0/embedders/openai.py +30 -0
- perag-0.1.0/embedders/registry.py +18 -0
- perag-0.1.0/perag/__init__.py +0 -0
- perag-0.1.0/perag/cli.py +189 -0
- perag-0.1.0/perag/config.py +63 -0
- perag-0.1.0/perag/schema.py +36 -0
- perag-0.1.0/pyproject.toml +40 -0
- perag-0.1.0/release.sh +20 -0
- perag-0.1.0/skills/SKILL.md +120 -0
- perag-0.1.0/tests/__init__.py +0 -0
- perag-0.1.0/tests/fixtures/sample.md +17 -0
- perag-0.1.0/tests/fixtures/sample.txt +15 -0
- perag-0.1.0/tests/test_chunkers.py +53 -0
- perag-0.1.0/tests/test_db.py +76 -0
- perag-0.1.0/tests/test_embedders.py +64 -0
- perag-0.1.0/tests/test_pipeline.py +33 -0
- perag-0.1.0/uv.lock +1752 -0
perag-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
# Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
# uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
# poetry.lock
|
|
109
|
+
# poetry.toml
|
|
110
|
+
|
|
111
|
+
# pdm
|
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
115
|
+
# pdm.lock
|
|
116
|
+
# pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# pixi
|
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
122
|
+
# pixi.lock
|
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
125
|
+
.pixi
|
|
126
|
+
|
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
128
|
+
__pypackages__/
|
|
129
|
+
|
|
130
|
+
# Celery stuff
|
|
131
|
+
celerybeat-schedule
|
|
132
|
+
celerybeat.pid
|
|
133
|
+
|
|
134
|
+
# Redis
|
|
135
|
+
*.rdb
|
|
136
|
+
*.aof
|
|
137
|
+
*.pid
|
|
138
|
+
|
|
139
|
+
# RabbitMQ
|
|
140
|
+
mnesia/
|
|
141
|
+
rabbitmq/
|
|
142
|
+
rabbitmq-data/
|
|
143
|
+
|
|
144
|
+
# ActiveMQ
|
|
145
|
+
activemq-data/
|
|
146
|
+
|
|
147
|
+
# SageMath parsed files
|
|
148
|
+
*.sage.py
|
|
149
|
+
|
|
150
|
+
# Environments
|
|
151
|
+
.env
|
|
152
|
+
.envrc
|
|
153
|
+
.venv
|
|
154
|
+
env/
|
|
155
|
+
venv/
|
|
156
|
+
ENV/
|
|
157
|
+
env.bak/
|
|
158
|
+
venv.bak/
|
|
159
|
+
|
|
160
|
+
# Spyder project settings
|
|
161
|
+
.spyderproject
|
|
162
|
+
.spyproject
|
|
163
|
+
|
|
164
|
+
# Rope project settings
|
|
165
|
+
.ropeproject
|
|
166
|
+
|
|
167
|
+
# mkdocs documentation
|
|
168
|
+
/site
|
|
169
|
+
|
|
170
|
+
# mypy
|
|
171
|
+
.mypy_cache/
|
|
172
|
+
.dmypy.json
|
|
173
|
+
dmypy.json
|
|
174
|
+
|
|
175
|
+
# Pyre type checker
|
|
176
|
+
.pyre/
|
|
177
|
+
|
|
178
|
+
# pytype static type analyzer
|
|
179
|
+
.pytype/
|
|
180
|
+
|
|
181
|
+
# Cython debug symbols
|
|
182
|
+
cython_debug/
|
|
183
|
+
|
|
184
|
+
# PyCharm
|
|
185
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
186
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
187
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
188
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
189
|
+
# .idea/
|
|
190
|
+
|
|
191
|
+
# Abstra
|
|
192
|
+
# Abstra is an AI-powered process automation framework.
|
|
193
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
194
|
+
# Learn more at https://abstra.io/docs
|
|
195
|
+
.abstra/
|
|
196
|
+
|
|
197
|
+
# Visual Studio Code
|
|
198
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
199
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
200
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
201
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
202
|
+
# .vscode/
|
|
203
|
+
# Temporary file for partial code execution
|
|
204
|
+
tempCodeRunnerFile.py
|
|
205
|
+
|
|
206
|
+
# Ruff stuff:
|
|
207
|
+
.ruff_cache/
|
|
208
|
+
|
|
209
|
+
# PyPI configuration file
|
|
210
|
+
.pypirc
|
|
211
|
+
|
|
212
|
+
# Marimo
|
|
213
|
+
marimo/_static/
|
|
214
|
+
marimo/_lsp/
|
|
215
|
+
__marimo__/
|
|
216
|
+
|
|
217
|
+
# Streamlit
|
|
218
|
+
.streamlit/secrets.toml
|
|
219
|
+
/.claude/
|
|
220
|
+
/.perag/
|
|
221
|
+
/.venv/
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
<component name="InspectionProjectProfileManager">
|
|
2
|
+
<profile version="1.0">
|
|
3
|
+
<option name="myName" value="Project Default" />
|
|
4
|
+
<inspection_tool class="PyStubPackagesAdvertiser" enabled="true" level="WARNING" enabled_by_default="true">
|
|
5
|
+
<option name="ignoredPackages">
|
|
6
|
+
<list>
|
|
7
|
+
<option value="pyspark-stubs==3.0.0.dev8" />
|
|
8
|
+
</list>
|
|
9
|
+
</option>
|
|
10
|
+
</inspection_tool>
|
|
11
|
+
<inspection_tool class="ReassignedToPlainText" enabled="false" level="WARNING" enabled_by_default="false" />
|
|
12
|
+
</profile>
|
|
13
|
+
</component>
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
2
|
+
<project version="4">
|
|
3
|
+
<component name="Black">
|
|
4
|
+
<option name="sdkName" value="Python 3.13 virtualenv at ~/github/pet/.venv" />
|
|
5
|
+
</component>
|
|
6
|
+
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.13 virtualenv at ~/github/pet/.venv" project-jdk-type="Python SDK" />
|
|
7
|
+
</project>
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
2
|
+
<module type="PYTHON_MODULE" version="4">
|
|
3
|
+
<component name="NewModuleRootManager">
|
|
4
|
+
<content url="file://$MODULE_DIR$" />
|
|
5
|
+
<orderEntry type="jdk" jdkName="Python 3.13 virtualenv at ~/github/pet/.venv" jdkType="Python SDK" />
|
|
6
|
+
<orderEntry type="sourceFolder" forTests="false" />
|
|
7
|
+
</component>
|
|
8
|
+
</module>
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
2
|
+
<project version="4">
|
|
3
|
+
<component name="AutoImportSettings">
|
|
4
|
+
<option name="autoReloadType" value="SELECTIVE" />
|
|
5
|
+
</component>
|
|
6
|
+
<component name="ChangeListManager">
|
|
7
|
+
<list default="true" id="662c9670-5acf-45d8-9761-ba46d1f48197" name="Changes" comment="">
|
|
8
|
+
<change afterPath="$PROJECT_DIR$/.python-version" afterDir="false" />
|
|
9
|
+
<change afterPath="$PROJECT_DIR$/CLAUDE.md" afterDir="false" />
|
|
10
|
+
<change afterPath="$PROJECT_DIR$/chunkers/__init__.py" afterDir="false" />
|
|
11
|
+
<change afterPath="$PROJECT_DIR$/chunkers/base.py" afterDir="false" />
|
|
12
|
+
<change afterPath="$PROJECT_DIR$/chunkers/docx.py" afterDir="false" />
|
|
13
|
+
<change afterPath="$PROJECT_DIR$/chunkers/markdown.py" afterDir="false" />
|
|
14
|
+
<change afterPath="$PROJECT_DIR$/chunkers/pdf.py" afterDir="false" />
|
|
15
|
+
<change afterPath="$PROJECT_DIR$/chunkers/registry.py" afterDir="false" />
|
|
16
|
+
<change afterPath="$PROJECT_DIR$/chunkers/text.py" afterDir="false" />
|
|
17
|
+
<change afterPath="$PROJECT_DIR$/config.example.toml" afterDir="false" />
|
|
18
|
+
<change afterPath="$PROJECT_DIR$/db/__init__.py" afterDir="false" />
|
|
19
|
+
<change afterPath="$PROJECT_DIR$/db/search.py" afterDir="false" />
|
|
20
|
+
<change afterPath="$PROJECT_DIR$/db/store.py" afterDir="false" />
|
|
21
|
+
<change afterPath="$PROJECT_DIR$/docs/chunking.md" afterDir="false" />
|
|
22
|
+
<change afterPath="$PROJECT_DIR$/docs/embedders.md" afterDir="false" />
|
|
23
|
+
<change afterPath="$PROJECT_DIR$/docs/pipeline.md" afterDir="false" />
|
|
24
|
+
<change afterPath="$PROJECT_DIR$/embedders/__init__.py" afterDir="false" />
|
|
25
|
+
<change afterPath="$PROJECT_DIR$/embedders/base.py" afterDir="false" />
|
|
26
|
+
<change afterPath="$PROJECT_DIR$/embedders/local.py" afterDir="false" />
|
|
27
|
+
<change afterPath="$PROJECT_DIR$/embedders/ollama.py" afterDir="false" />
|
|
28
|
+
<change afterPath="$PROJECT_DIR$/embedders/openai.py" afterDir="false" />
|
|
29
|
+
<change afterPath="$PROJECT_DIR$/embedders/registry.py" afterDir="false" />
|
|
30
|
+
<change afterPath="$PROJECT_DIR$/perag/__init__.py" afterDir="false" />
|
|
31
|
+
<change afterPath="$PROJECT_DIR$/perag/cli.py" afterDir="false" />
|
|
32
|
+
<change afterPath="$PROJECT_DIR$/perag/config.py" afterDir="false" />
|
|
33
|
+
<change afterPath="$PROJECT_DIR$/perag/schema.py" afterDir="false" />
|
|
34
|
+
<change afterPath="$PROJECT_DIR$/pyproject.toml" afterDir="false" />
|
|
35
|
+
<change afterPath="$PROJECT_DIR$/skills/SKILL.md" afterDir="false" />
|
|
36
|
+
<change afterPath="$PROJECT_DIR$/uv.lock" afterDir="false" />
|
|
37
|
+
<change beforePath="$PROJECT_DIR$/.gitignore" beforeDir="false" afterPath="$PROJECT_DIR$/.gitignore" afterDir="false" />
|
|
38
|
+
</list>
|
|
39
|
+
<option name="SHOW_DIALOG" value="false" />
|
|
40
|
+
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
|
41
|
+
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
|
42
|
+
<option name="LAST_RESOLUTION" value="IGNORE" />
|
|
43
|
+
</component>
|
|
44
|
+
<component name="Git.Settings">
|
|
45
|
+
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
|
|
46
|
+
</component>
|
|
47
|
+
<component name="ProjectColorInfo"><![CDATA[{
|
|
48
|
+
"associatedIndex": 3
|
|
49
|
+
}]]></component>
|
|
50
|
+
<component name="ProjectId" id="3EDonFdWD89jDVglvrZjum7PGi0" />
|
|
51
|
+
<component name="ProjectViewState">
|
|
52
|
+
<option name="hideEmptyMiddlePackages" value="true" />
|
|
53
|
+
<option name="showLibraryContents" value="true" />
|
|
54
|
+
</component>
|
|
55
|
+
<component name="PropertiesComponent"><![CDATA[{
|
|
56
|
+
"keyToString": {
|
|
57
|
+
"ModuleVcsDetector.initialDetectionPerformed": "true",
|
|
58
|
+
"RunOnceActivity.ShowReadmeOnStart": "true",
|
|
59
|
+
"git-widget-placeholder": "main",
|
|
60
|
+
"junie.onboarding.icon.badge.shown": "true",
|
|
61
|
+
"last_opened_file_path": "/Users/verhasp/github/perag",
|
|
62
|
+
"settings.editor.selected.configurable": "preferences.pluginManager"
|
|
63
|
+
}
|
|
64
|
+
}]]></component>
|
|
65
|
+
<component name="SharedIndexes">
|
|
66
|
+
<attachedChunks>
|
|
67
|
+
<set>
|
|
68
|
+
<option value="bundled-python-sdk-9f8e2b94138c-36ea0e71a18c-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-251.26094.141" />
|
|
69
|
+
</set>
|
|
70
|
+
</attachedChunks>
|
|
71
|
+
</component>
|
|
72
|
+
<component name="TaskManager">
|
|
73
|
+
<task active="true" id="Default" summary="Default task">
|
|
74
|
+
<changelist id="662c9670-5acf-45d8-9761-ba46d1f48197" name="Changes" comment="" />
|
|
75
|
+
<created>1779722665327</created>
|
|
76
|
+
<option name="number" value="Default" />
|
|
77
|
+
<option name="presentableId" value="Default" />
|
|
78
|
+
<updated>1779722665327</updated>
|
|
79
|
+
</task>
|
|
80
|
+
<servers />
|
|
81
|
+
</component>
|
|
82
|
+
</project>
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.14
|
perag-0.1.0/CLAUDE.md
ADDED
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
# Perago — Personal RAG Toolkit
|
|
2
|
+
|
|
3
|
+
## Project Overview
|
|
4
|
+
|
|
5
|
+
**Perago** (Latin: *to carry through to completion*) is a personal productivity tool for
|
|
6
|
+
non-developers who work with textual documents (PDF, Word, Markdown, plain text). It
|
|
7
|
+
provides a local, private RAG (Retrieval-Augmented Generation) pipeline that enriches
|
|
8
|
+
prompts with relevant context retrieved from a personal document collection.
|
|
9
|
+
|
|
10
|
+
The command-line tool is called `perag`. It is intentionally not a service — no server,
|
|
11
|
+
no daemon, no cloud dependency. It runs locally, stores data locally, and is invoked
|
|
12
|
+
from the command line or by Claude Code via a SKILL.md.
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## Repository Structure
|
|
17
|
+
|
|
18
|
+
```
|
|
19
|
+
perago/
|
|
20
|
+
├── CLAUDE.md # This file
|
|
21
|
+
├── README.md # User-facing documentation
|
|
22
|
+
├── pyproject.toml # Root package definition (if monorepo build)
|
|
23
|
+
├── config.example.toml # Example configuration file
|
|
24
|
+
│
|
|
25
|
+
├── perag/ # Main CLI package (entry point: perag <subcommand>)
|
|
26
|
+
│ ├── __init__.py
|
|
27
|
+
│ ├── cli.py # Subcommand dispatcher (init/chunk/embed/ingest/query)
|
|
28
|
+
│ ├── config.py # Config loading from config.toml
|
|
29
|
+
│ └── schema.py # Shared JSON chunk schema (dataclass/TypedDict)
|
|
30
|
+
│
|
|
31
|
+
├── chunkers/ # Format-aware chunkers (one module per format)
|
|
32
|
+
│ ├── __init__.py
|
|
33
|
+
│ ├── base.py # Abstract base class: Chunker.chunk(path) -> [Chunk]
|
|
34
|
+
│ ├── pdf.py # PDF chunking via pdfplumber
|
|
35
|
+
│ ├── docx.py # Word chunking via python-docx
|
|
36
|
+
│ ├── markdown.py # Markdown chunking via markdown-it-py
|
|
37
|
+
│ ├── text.py # Plain text / paragraph-aware fallback
|
|
38
|
+
│ └── registry.py # Maps file extension -> Chunker class
|
|
39
|
+
│
|
|
40
|
+
├── embedders/ # Embedding providers (one module per provider)
|
|
41
|
+
│ ├── __init__.py
|
|
42
|
+
│ ├── base.py # Abstract base class: Embedder.embed([str]) -> [[float]]
|
|
43
|
+
│ ├── ollama.py # Ollama HTTP API
|
|
44
|
+
│ ├── openai.py # OpenAI embeddings API
|
|
45
|
+
│ ├── local.py # sentence-transformers (fully local, no API key)
|
|
46
|
+
│ └── registry.py # Maps provider name -> Embedder class
|
|
47
|
+
│
|
|
48
|
+
├── db/ # sqlite-vec database layer
|
|
49
|
+
│ ├── __init__.py
|
|
50
|
+
│ ├── store.py # Schema init, upsert, meta table management
|
|
51
|
+
│ └── search.py # ANN query, returns top-k chunks
|
|
52
|
+
│
|
|
53
|
+
├── tests/
|
|
54
|
+
│ ├── fixtures/ # Sample PDF, DOCX, MD, TXT files for testing
|
|
55
|
+
│ ├── test_chunkers.py
|
|
56
|
+
│ ├── test_embedders.py
|
|
57
|
+
│ ├── test_db.py
|
|
58
|
+
│ └── test_pipeline.py # End-to-end: chunk -> embed -> ingest -> query
|
|
59
|
+
│
|
|
60
|
+
├── skills/
|
|
61
|
+
│ └── SKILL.md # Claude Code skill: how to use perag from Claude Code
|
|
62
|
+
│
|
|
63
|
+
└── docs/
|
|
64
|
+
├── chunking.md # How chunking works per format
|
|
65
|
+
├── embedders.md # Supported embedding providers and configuration
|
|
66
|
+
└── pipeline.md # Full pipeline walkthrough
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## Subcommand Design
|
|
72
|
+
|
|
73
|
+
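The pipeline subcommands (`chunk`, `embed`, `ingest`) exchange JSON over stdin/stdout, making the pipeline composable (`query` prints plain text by default; `init` only sets up the project directory):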
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
# Full pipeline (piped)
|
|
77
|
+
perag chunk document.pdf | perag embed | perag ingest
|
|
78
|
+
|
|
79
|
+
# Full pipeline (with intermediate files for inspection/debugging)
|
|
80
|
+
perag chunk document.pdf > chunks.json
|
|
81
|
+
perag embed < chunks.json > chunks_embedded.json
|
|
82
|
+
perag ingest < chunks_embedded.json
|
|
83
|
+
|
|
84
|
+
# Query
|
|
85
|
+
perag query "what are the termination conditions?"
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### `perag chunk <file>`
|
|
89
|
+
- Detects format from extension
|
|
90
|
+
- Dispatches to the appropriate chunker in `chunkers/`
|
|
91
|
+
- Outputs a JSON array of Chunk objects to stdout
|
|
92
|
+
|
|
93
|
+
### `perag embed`
|
|
94
|
+
- Reads JSON array of Chunk objects from stdin
|
|
95
|
+
- Calls the configured embedding provider in batches
|
|
96
|
+
- Outputs the same JSON array with `vector` field added
|
|
97
|
+
|
|
98
|
+
### `perag ingest`
|
|
99
|
+
- Reads JSON array of embedded Chunk objects from stdin
|
|
100
|
+
- Writes to the sqlite-vec database
|
|
101
|
+
- Enforces dimension and model name consistency via the `meta` table
|
|
102
|
+
- Upserts by `id` (re-ingesting an updated document replaces existing chunks)
|
|
103
|
+
|
|
104
|
+
### `perag init`
|
|
105
|
+
- Creates `.perag/` in the current directory
|
|
106
|
+
- Writes a minimal `.perag/config.toml`, which inherits defaults from `~/.perag/config.toml` when that user-level file exists
|
|
107
|
+
- Adds `.perag/perag.db` to `.gitignore` if a `.gitignore` is present
|
|
108
|
+
- Safe to re-run — never overwrites an existing config
|
|
109
|
+
|
|
110
|
+
### `perag query "<text>"`
|
|
111
|
+
- Embeds the query text using the configured provider
|
|
112
|
+
- Performs ANN search against the sqlite-vec database
|
|
113
|
+
- Outputs top-k chunks as plain text (suitable for Claude Code context injection)
|
|
114
|
+
- `--json` flag outputs structured JSON instead
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## JSON Chunk Schema
|
|
119
|
+
|
|
120
|
+
Every chunk flowing through the pipeline conforms to this schema:
|
|
121
|
+
|
|
122
|
+
```json
|
|
123
|
+
{
|
|
124
|
+
"id": "contracts/nda_2024.pdf::chunk::7",
|
|
125
|
+
"source": "contracts/nda_2024.pdf",
|
|
126
|
+
"content": "The agreement shall terminate upon 30 days written notice...",
|
|
127
|
+
"metadata": {
|
|
128
|
+
"format": "pdf",
|
|
129
|
+
"page": 3,
|
|
130
|
+
"section": "Termination"
|
|
131
|
+
},
|
|
132
|
+
"embedding_model": "nomic-embed-text",
|
|
133
|
+
"embedding_provider": "ollama",
|
|
134
|
+
"vector": [0.021, -0.134, 0.087, "..."]
|
|
135
|
+
}
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
After `perag chunk`, the fields `embedding_model`, `embedding_provider`, and `vector`
|
|
139
|
+
are all `null`. After `perag embed` all three are populated by the embedder with its
|
|
140
|
+
own identity. The `metadata` fields are format-specific and optional for downstream
|
|
141
|
+
consumers.
|
|
142
|
+
|
|
143
|
+
### Embedder behaviour with pre-embedded chunks
|
|
144
|
+
|
|
145
|
+
`perag embed` inspects `embedding_model` on every incoming chunk before deciding what
|
|
146
|
+
to do:
|
|
147
|
+
|
|
148
|
+
| `embedding_model` in chunk | Matches current config? | Action |
|
|
149
|
+
|---|---|---|
|
|
150
|
+
| `null` | — | Embed, populate all three fields |
|
|
151
|
+
| set | yes | Skip — pass through unchanged |
|
|
152
|
+
| set | no | Re-embed, overwrite vector and embedding fields |
|
|
153
|
+
|
|
154
|
+
This means re-running `perag embed` after changing providers is safe and correct —
|
|
155
|
+
leftover JSON files from a previous run are detected and re-embedded automatically.
|
|
156
|
+
|
|
157
|
+
### Ingestor validation
|
|
158
|
+
|
|
159
|
+
The ingestor enforces consistency between the chunk and the database before writing:
|
|
160
|
+
|
|
161
|
+
| Chunk state | Action |
|
|
162
|
+
|---|---|
|
|
163
|
+
| `vector` is `null` | Hard error: *"chunks have no vectors — run `perag embed` first"* |
|
|
164
|
+
| `embedding_model` and `embedding_provider` match `meta` table | Ingest (upsert by `id`) |
|
|
165
|
+
| `embedding_model` or `embedding_provider` differs from `meta` table | Hard error: *"embedding model mismatch — re-run `perag embed` or rebuild the database"* |
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
## Database Design
|
|
170
|
+
|
|
171
|
+
A single SQLite file (`perag.db`) located in `.perag/` in the current directory, or
|
|
172
|
+
falling back to `~/.perag/perag.db` if no local `.perag/` exists.
|
|
173
|
+
|
|
174
|
+
Tables:
|
|
175
|
+
- **`chunks`**: id, source, content, metadata (JSON), vector (sqlite-vec column)
|
|
176
|
+
- **`meta`**: embedding model name, embedding provider, vector dimensions, creation timestamp
|
|
177
|
+
|
|
178
|
+
On first `perag ingest`, the meta table is written. On subsequent ingests, both
|
|
179
|
+
`embedding_model` and `embedding_provider` are validated against the meta table — a
|
|
180
|
+
mismatch on either is a hard error with a clear message directing the user to re-embed
|
|
181
|
+
or rebuild the database.
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## Configuration
|
|
186
|
+
|
|
187
|
+
Perago uses a local-first lookup strategy, the same pattern as `.git` and `.claude`.
|
|
188
|
+
The tool always checks the current directory first and falls back to the user-level
|
|
189
|
+
global config.
|
|
190
|
+
|
|
191
|
+
### Lookup order
|
|
192
|
+
|
|
193
|
+
```
|
|
194
|
+
./.perag/config.toml # project-local config (may be committed)
|
|
195
|
+
./.perag/perag.db # project-local database
|
|
196
|
+
~/.perag/config.toml # user-level defaults (fallback)
|
|
197
|
+
~/.perag/perag.db # user-level database (fallback)
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
A researcher with three document collections simply has three directories, each with
|
|
201
|
+
its own `.perag/`. The `cd` is the context switch — no flags, no project names.
|
|
202
|
+
|
|
203
|
+
The global `~/.perag/config.toml` holds the user's preferred embedding provider and
|
|
204
|
+
model so every new project inherits sensible defaults without repeating configuration.
|
|
205
|
+
A project-local config only needs to override what differs from the global defaults.
|
|
206
|
+
|
|
207
|
+
### `.gitignore` recommendation
|
|
208
|
+
|
|
209
|
+
```
|
|
210
|
+
.perag/perag.db # large, machine-generated — never commit
|
|
211
|
+
.perag/config.toml # optional: commit if you want to share project config
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
### Config file format
|
|
215
|
+
|
|
216
|
+
`~/.perag/config.toml` (user-level defaults):
|
|
217
|
+
|
|
218
|
+
```toml
|
|
219
|
+
[embedding]
|
|
220
|
+
provider = "ollama" # ollama | openai | local
|
|
221
|
+
model = "nomic-embed-text"
|
|
222
|
+
url = "http://localhost:11434" # only for ollama
|
|
223
|
+
# api_key = "sk-..." # only for openai
|
|
224
|
+
batch_size = 32
|
|
225
|
+
|
|
226
|
+
[query]
|
|
227
|
+
top_k = 5
|
|
228
|
+
output = "text" # text | json
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
`./.perag/config.toml` (project-local override — only specify what differs):
|
|
232
|
+
|
|
233
|
+
```toml
|
|
234
|
+
[embedding]
|
|
235
|
+
model = "mxbai-embed-large" # override model for this project only
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
---
|
|
239
|
+
|
|
240
|
+
## Development Conventions
|
|
241
|
+
|
|
242
|
+
### Python version
|
|
243
|
+
Python 3.11+. No older versions. Use `match` statements freely.
|
|
244
|
+
|
|
245
|
+
### Dependencies
|
|
246
|
+
Managed with `uv`. Lock file committed. No unpinned dependencies in production code.
|
|
247
|
+
|
|
248
|
+
```
|
|
249
|
+
pdfplumber # PDF parsing
|
|
250
|
+
python-docx # Word parsing
|
|
251
|
+
markdown-it-py # Markdown parsing
|
|
252
|
+
sqlite-vec # Vector search SQLite extension
|
|
253
|
+
sentence-transformers # Local embeddings (optional)
|
|
254
|
+
httpx # HTTP client for Ollama/OpenAI
|
|
255
|
+
tomllib # Config parsing (stdlib in 3.11+)
|
|
256
|
+
typer # CLI framework
|
|
257
|
+
rich # Terminal output formatting
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
### Code style
|
|
261
|
+
- `ruff` for linting and formatting
|
|
262
|
+
- Type annotations on all public functions
|
|
263
|
+
- Docstrings on all public classes and methods
|
|
264
|
+
- No global state — everything flows through config and explicit arguments
|
|
265
|
+
|
|
266
|
+
### Testing
|
|
267
|
+
- `pytest` with fixtures in `tests/fixtures/`
|
|
268
|
+
- Each chunker must have at least one real-file test (not mocked)
|
|
269
|
+
- The embedder tests mock the HTTP/model calls — no network in CI
|
|
270
|
+
- One end-to-end pipeline test using the `local` embedder (no API key needed)
|
|
271
|
+
|
|
272
|
+
### Adding a new chunker
|
|
273
|
+
1. Create `chunkers/<format>.py` implementing `base.Chunker`
|
|
274
|
+
2. Register it in `chunkers/registry.py`
|
|
275
|
+
3. Add at least one fixture file to `tests/fixtures/`
|
|
276
|
+
4. Add tests in `tests/test_chunkers.py`
|
|
277
|
+
5. Document the chunking strategy in `docs/chunking.md`
|
|
278
|
+
|
|
279
|
+
### Adding a new embedder
|
|
280
|
+
1. Create `embedders/<provider>.py` implementing `base.Embedder`
|
|
281
|
+
2. Register it in `embedders/registry.py`
|
|
282
|
+
3. Add config documentation in `docs/embedders.md`
|
|
283
|
+
|
|
284
|
+
---
|
|
285
|
+
|
|
286
|
+
## Claude Code Integration (SKILL.md)
|
|
287
|
+
|
|
288
|
+
The `skills/SKILL.md` file teaches Claude Code how to use `perag` as a context
|
|
289
|
+
enrichment tool. The typical workflow Claude Code should follow:
|
|
290
|
+
|
|
291
|
+
1. When given a task that might benefit from document context, run:
|
|
292
|
+
`perag query "<relevant aspect of the task>"`
|
|
293
|
+
2. Prepend the output to the working context before responding
|
|
294
|
+
3. When new documents are provided, run the full pipeline:
|
|
295
|
+
`perag chunk <file> | perag embed | perag ingest`
|
|
296
|
+
|
|
297
|
+
The SKILL.md lives inside the repo so it is versioned alongside the tool itself.
|
|
298
|
+
Users copy or symlink it into their Claude Code skills directory.
|
|
299
|
+
|
|
300
|
+
---
|
|
301
|
+
|
|
302
|
+
## Non-Goals
|
|
303
|
+
|
|
304
|
+
- No web UI
|
|
305
|
+
- No REST API or daemon mode
|
|
306
|
+
- No multi-user support
|
|
307
|
+
- No cloud sync
|
|
308
|
+
- No support for source code files (use RustRAG or similar for that)
|
|
309
|
+
- No streaming ingestion of live data sources
|
|
310
|
+
|
|
311
|
+
This is a tool for a person with a folder of documents who wants to ask questions
|
|
312
|
+
across them. It should stay that simple.
|