matrx-rag 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrx_rag-0.1.0/.gitignore +257 -0
- matrx_rag-0.1.0/CLAUDE.md +136 -0
- matrx_rag-0.1.0/PKG-INFO +71 -0
- matrx_rag-0.1.0/README.md +34 -0
- matrx_rag-0.1.0/matrx_rag/__init__.py +115 -0
- matrx_rag-0.1.0/matrx_rag/_ext.py +1167 -0
- matrx_rag-0.1.0/matrx_rag/_observability.py +872 -0
- matrx_rag-0.1.0/matrx_rag/api/__init__.py +61 -0
- matrx_rag-0.1.0/matrx_rag/api/router_rag.py +357 -0
- matrx_rag-0.1.0/matrx_rag/awareness.py +479 -0
- matrx_rag-0.1.0/matrx_rag/chunking.py +214 -0
- matrx_rag-0.1.0/matrx_rag/code_chunker.py +778 -0
- matrx_rag-0.1.0/matrx_rag/data_stores.py +453 -0
- matrx_rag-0.1.0/matrx_rag/db/__init__.py +238 -0
- matrx_rag-0.1.0/matrx_rag/db/helpers/auto_config_rag.py +38 -0
- matrx_rag-0.1.0/matrx_rag/db/managers/__init__.py +6 -0
- matrx_rag-0.1.0/matrx_rag/db/managers/rag/__init__.py +14 -0
- matrx_rag-0.1.0/matrx_rag/db/managers/rag/data_store_members.py +204 -0
- matrx_rag-0.1.0/matrx_rag/db/managers/rag/data_stores.py +198 -0
- matrx_rag-0.1.0/matrx_rag/db/managers/rag/embedding_cache.py +192 -0
- matrx_rag-0.1.0/matrx_rag/db/managers/rag/embeddings_oai_3_small_1536.py +204 -0
- matrx_rag-0.1.0/matrx_rag/db/managers/rag/embeddings_voyage_code_3_1024.py +204 -0
- matrx_rag-0.1.0/matrx_rag/db/managers/rag/kg_chunk_entities.py +228 -0
- matrx_rag-0.1.0/matrx_rag/db/managers/rag/kg_chunks.py +270 -0
- matrx_rag-0.1.0/matrx_rag/db/managers/rag/kg_clusters.py +210 -0
- matrx_rag-0.1.0/matrx_rag/db/managers/rag/kg_edges.py +222 -0
- matrx_rag-0.1.0/matrx_rag/db/managers/rag/kg_entities.py +222 -0
- matrx_rag-0.1.0/matrx_rag/db/managers/rag/kg_entity_aliases.py +210 -0
- matrx_rag-0.1.0/matrx_rag/db/managers/rag/library_docs.py +192 -0
- matrx_rag-0.1.0/matrx_rag/db/managers/rag/retrieval_audit.py +204 -0
- matrx_rag-0.1.0/matrx_rag/db/models.py +380 -0
- matrx_rag-0.1.0/matrx_rag/db/models_rag.py +523 -0
- matrx_rag-0.1.0/matrx_rag/embeddings.py +437 -0
- matrx_rag-0.1.0/matrx_rag/eval/__init__.py +46 -0
- matrx_rag-0.1.0/matrx_rag/eval/golden_starter.json +70 -0
- matrx_rag-0.1.0/matrx_rag/eval/ner_golden_starter.json +86 -0
- matrx_rag-0.1.0/matrx_rag/eval/ner_runner.py +395 -0
- matrx_rag-0.1.0/matrx_rag/eval/runner.py +345 -0
- matrx_rag-0.1.0/matrx_rag/extraction_indexing.py +654 -0
- matrx_rag-0.1.0/matrx_rag/grounding.py +347 -0
- matrx_rag-0.1.0/matrx_rag/image_pipeline.py +399 -0
- matrx_rag-0.1.0/matrx_rag/ingestion.py +1677 -0
- matrx_rag-0.1.0/matrx_rag/kg_matching.py +397 -0
- matrx_rag-0.1.0/matrx_rag/library.py +262 -0
- matrx_rag-0.1.0/matrx_rag/llm.py +453 -0
- matrx_rag-0.1.0/matrx_rag/migrations/0002_rag_organization_retrofit.sql +181 -0
- matrx_rag-0.1.0/matrx_rag/migrations/0003_rag_schema.sql +452 -0
- matrx_rag-0.1.0/matrx_rag/migrations/0004_rag_per_source_acl.sql +184 -0
- matrx_rag-0.1.0/matrx_rag/migrations/0005_rag_library.sql +159 -0
- matrx_rag-0.1.0/matrx_rag/migrations/0006_kg_chunks_extraction_provenance.sql +114 -0
- matrx_rag-0.1.0/matrx_rag/migrations/0008_kg_chunks_processed_doc_fk.sql +71 -0
- matrx_rag-0.1.0/matrx_rag/migrations/0009_rag_data_stores.sql +174 -0
- matrx_rag-0.1.0/matrx_rag/migrations/0010_agx_agent_rag_awareness.sql +126 -0
- matrx_rag-0.1.0/matrx_rag/migrations/README.md +12 -0
- matrx_rag-0.1.0/matrx_rag/migrations/kg_001_efficiency_layer.sql +49 -0
- matrx_rag-0.1.0/matrx_rag/migrations/kg_002_universal_ingest.sql +163 -0
- matrx_rag-0.1.0/matrx_rag/migrations/kg_003_seed_ner_agent.sql +108 -0
- matrx_rag-0.1.0/matrx_rag/migrations/kg_004_kg_widening.sql +43 -0
- matrx_rag-0.1.0/matrx_rag/migrations/kg_005_suggestion_ledger.sql +210 -0
- matrx_rag-0.1.0/matrx_rag/migrations/kg_006_ner_completion_marker.sql +39 -0
- matrx_rag-0.1.0/matrx_rag/migrations/kg_007_clusters.sql +99 -0
- matrx_rag-0.1.0/matrx_rag/migrations/kg_007_co_occurs_unique.sql +57 -0
- matrx_rag-0.1.0/matrx_rag/migrations/kg_007_seed_cluster_namer_agent.sql +95 -0
- matrx_rag-0.1.0/matrx_rag/migrations/kg_008_seed_gliner_extraction_models.sql +99 -0
- matrx_rag-0.1.0/matrx_rag/migrations/kg_009_seed_gliner_ner_agent.sql +109 -0
- matrx_rag-0.1.0/matrx_rag/migrations/kg_012_entity_aliases.sql +58 -0
- matrx_rag-0.1.0/matrx_rag/migrations/kg_013_drop_parent_chunk_ner.sql +66 -0
- matrx_rag-0.1.0/matrx_rag/ner.py +1001 -0
- matrx_rag-0.1.0/matrx_rag/pdf_pipeline.py +1048 -0
- matrx_rag-0.1.0/matrx_rag/processed_doc_lookup.py +110 -0
- matrx_rag-0.1.0/matrx_rag/progress.py +167 -0
- matrx_rag-0.1.0/matrx_rag/repo_ingest.py +576 -0
- matrx_rag-0.1.0/matrx_rag/search.py +1091 -0
- matrx_rag-0.1.0/matrx_rag/sources.py +813 -0
- matrx_rag-0.1.0/matrx_rag/stages.py +1032 -0
- matrx_rag-0.1.0/pyproject.toml +63 -0
- matrx_rag-0.1.0/tests/raw_rag_sql_baseline.json +17 -0
- matrx_rag-0.1.0/tests/test_baseline_behavior.py +596 -0
- matrx_rag-0.1.0/tests/test_chunk_conflict.py +190 -0
- matrx_rag-0.1.0/tests/test_chunk_reuse.py +562 -0
- matrx_rag-0.1.0/tests/test_durable_embedding_writes.py +358 -0
- matrx_rag-0.1.0/tests/test_ext_seams.py +613 -0
- matrx_rag-0.1.0/tests/test_hyde_blend.py +177 -0
- matrx_rag-0.1.0/tests/test_kg_matching.py +244 -0
- matrx_rag-0.1.0/tests/test_ner.py +664 -0
- matrx_rag-0.1.0/tests/test_no_new_raw_rag_sql.py +122 -0
- matrx_rag-0.1.0/tests/test_resolvers_task_project.py +167 -0
- matrx_rag-0.1.0/tests/test_router_rag.py +522 -0
- matrx_rag-0.1.0/tests/test_unsupported_mime.py +157 -0
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
*.pyc
|
|
2
|
+
secrets/
|
|
3
|
+
ignore/
|
|
4
|
+
temp/
|
|
5
|
+
logs/
|
|
6
|
+
todo
|
|
7
|
+
text_notes/
|
|
8
|
+
aidream/secrets/2.env
|
|
9
|
+
automation_matrix/matrix_processing/temp/*
|
|
10
|
+
cd
|
|
11
|
+
# Byte-compiled / optimized / DLL files
|
|
12
|
+
__pycache__/
|
|
13
|
+
*.py[cod]
|
|
14
|
+
*$py.class
|
|
15
|
+
|
|
16
|
+
# C extensions
|
|
17
|
+
*.so
|
|
18
|
+
.venv/
|
|
19
|
+
|
|
20
|
+
# Distribution / packaging
|
|
21
|
+
.Python
|
|
22
|
+
build/
|
|
23
|
+
develop-eggs/
|
|
24
|
+
dist/
|
|
25
|
+
downloads/
|
|
26
|
+
eggs/
|
|
27
|
+
.eggs/
|
|
28
|
+
lib/
|
|
29
|
+
lib64/
|
|
30
|
+
# The blanket lib/ rule above is from the standard Python .gitignore template
|
|
31
|
+
# and was silently swallowing TS source under the SPA `src/lib/` folders.
|
|
32
|
+
# Re-allow them explicitly so frontend builds don't ship without their lib layer.
|
|
33
|
+
!dashboard/src/lib/
|
|
34
|
+
!dashboard/src/lib/**
|
|
35
|
+
!workflow-studio/src/lib/
|
|
36
|
+
!workflow-studio/src/lib/**
|
|
37
|
+
parts/
|
|
38
|
+
sdist/
|
|
39
|
+
var/
|
|
40
|
+
wheels/
|
|
41
|
+
share/python-wheels/
|
|
42
|
+
*.egg-info/
|
|
43
|
+
.installed.cfg
|
|
44
|
+
*.egg
|
|
45
|
+
MANIFEST
|
|
46
|
+
|
|
47
|
+
# PyInstaller
|
|
48
|
+
# Usually these files are written by a python script from a template
|
|
49
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
50
|
+
*.manifest
|
|
51
|
+
*.spec
|
|
52
|
+
|
|
53
|
+
# Installer logs
|
|
54
|
+
pip-log.txt
|
|
55
|
+
pip-delete-this-directory.txt
|
|
56
|
+
|
|
57
|
+
# Unit test / coverage reports
|
|
58
|
+
ai/tests/clean_response.json
|
|
59
|
+
ai/tests/cx_storage_response.json
|
|
60
|
+
ai/tests/execution_test.py
|
|
61
|
+
ai/tests/final_response.json
|
|
62
|
+
htmlcov/
|
|
63
|
+
.tox/
|
|
64
|
+
.nox/
|
|
65
|
+
.coverage
|
|
66
|
+
.coverage.*
|
|
67
|
+
.cache
|
|
68
|
+
nosetests.xml
|
|
69
|
+
coverage.xml
|
|
70
|
+
*.cover
|
|
71
|
+
*.py,cover
|
|
72
|
+
.hypothesis/
|
|
73
|
+
.pytest_cache/
|
|
74
|
+
cover/
|
|
75
|
+
|
|
76
|
+
# Translations
|
|
77
|
+
*.mo
|
|
78
|
+
*.pot
|
|
79
|
+
|
|
80
|
+
# Django stuff:
|
|
81
|
+
*.log
|
|
82
|
+
local_settings.py
|
|
83
|
+
db.sqlite3
|
|
84
|
+
db.sqlite3-journal
|
|
85
|
+
|
|
86
|
+
# Flask stuff:
|
|
87
|
+
instance/
|
|
88
|
+
.webassets-cache
|
|
89
|
+
|
|
90
|
+
# Scrapy stuff:
|
|
91
|
+
.scrapy
|
|
92
|
+
|
|
93
|
+
# Sphinx documentation
|
|
94
|
+
docs/_build/
|
|
95
|
+
|
|
96
|
+
# PyBuilder
|
|
97
|
+
.pybuilder/
|
|
98
|
+
target/
|
|
99
|
+
|
|
100
|
+
# Jupyter Notebook
|
|
101
|
+
.ipynb_checkpoints
|
|
102
|
+
|
|
103
|
+
# IPython
|
|
104
|
+
profile_default/
|
|
105
|
+
ipython_config.py
|
|
106
|
+
|
|
107
|
+
# pyenv
|
|
108
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
109
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
110
|
+
# .python-version
|
|
111
|
+
|
|
112
|
+
# pipenv
|
|
113
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
114
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
115
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
116
|
+
# install all needed dependencies.
|
|
117
|
+
#Pipfile.lock
|
|
118
|
+
|
|
119
|
+
# poetry
|
|
120
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
121
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
122
|
+
# commonly ignored for libraries.
|
|
123
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
124
|
+
|
|
125
|
+
# pdm
|
|
126
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
127
|
+
#pdm.lock
|
|
128
|
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
|
129
|
+
# in version control.
|
|
130
|
+
# https://pdm.fming.dev/#use-with-ide
|
|
131
|
+
.pdm.toml
|
|
132
|
+
|
|
133
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
134
|
+
__pypackages__/
|
|
135
|
+
|
|
136
|
+
# Celery stuff
|
|
137
|
+
celerybeat-schedule
|
|
138
|
+
celerybeat.pid
|
|
139
|
+
|
|
140
|
+
# SageMath parsed files
|
|
141
|
+
*.sage.py
|
|
142
|
+
|
|
143
|
+
# Environments
|
|
144
|
+
.env
|
|
145
|
+
.env_remote
|
|
146
|
+
.venv
|
|
147
|
+
env/
|
|
148
|
+
venv/
|
|
149
|
+
ENV/
|
|
150
|
+
env.bak/
|
|
151
|
+
venv.bak/
|
|
152
|
+
.env.armanonly
|
|
153
|
+
|
|
154
|
+
# Spyder project settings
|
|
155
|
+
.spyderproject
|
|
156
|
+
.spyproject
|
|
157
|
+
|
|
158
|
+
# Rope project settings
|
|
159
|
+
.ropeproject
|
|
160
|
+
|
|
161
|
+
# mkdocs documentation
|
|
162
|
+
/site
|
|
163
|
+
|
|
164
|
+
# mypy
|
|
165
|
+
.mypy_cache/
|
|
166
|
+
.dmypy.json
|
|
167
|
+
dmypy.json
|
|
168
|
+
|
|
169
|
+
# Pyre type checker
|
|
170
|
+
.pyre/
|
|
171
|
+
|
|
172
|
+
# random armani files
|
|
173
|
+
/armani_dev/secrets/
|
|
174
|
+
/armani/
|
|
175
|
+
/_armani/
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
# pytype static type analyzer
|
|
180
|
+
.pytype/
|
|
181
|
+
|
|
182
|
+
# Cython debug symbols
|
|
183
|
+
cython_debug/
|
|
184
|
+
|
|
185
|
+
.idea/
|
|
186
|
+
.vscode/
|
|
187
|
+
/node_modules/
|
|
188
|
+
|
|
189
|
+
dump.rdb
|
|
190
|
+
|
|
191
|
+
frontend/
|
|
192
|
+
|
|
193
|
+
# AME Temp Files and directory structure
|
|
194
|
+
# Ignore all files in the temp directory and its subdirectories
|
|
195
|
+
/temp/**/*
|
|
196
|
+
/tmp/**/*
|
|
197
|
+
|
|
198
|
+
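# Allow .gitkeep files to retain directory structure (NOTE(review): the earlier `temp/` rule excludes the whole directory, so Git cannot re-include files under /temp — confirm intent)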
|
|
199
|
+
!/temp/**/.gitkeep
|
|
200
|
+
!/tmp/**/.gitkeep
|
|
201
|
+
|
|
202
|
+
# Armani
|
|
203
|
+
.history*
|
|
204
|
+
.history/
|
|
205
|
+
local_data/
|
|
206
|
+
local_reports_data/
|
|
207
|
+
webscraper/quick_scrapes/temp/
|
|
208
|
+
automation_matrix/ai_apis/fireworks/_dev/*
|
|
209
|
+
automation_matrix/ai_apis/fireworks/_dev/fireworks_sample.py
|
|
210
|
+
*.pdf
|
|
211
|
+
*.flac
|
|
212
|
+
*.mp3
|
|
213
|
+
*.wav
|
|
214
|
+
miniconda.sh
|
|
215
|
+
/database/python_sql/temp_data/
|
|
216
|
+
.history*
|
|
217
|
+
.history/
|
|
218
|
+
.history/
|
|
219
|
+
|
|
220
|
+
_dev/
|
|
221
|
+
/_dev/
|
|
222
|
+
requirements_filtered.txt
|
|
223
|
+
|
|
224
|
+
# matrx-dev-tools backups
|
|
225
|
+
.env-backups/
|
|
226
|
+
# Matrx Ship config (contains API key)
|
|
227
|
+
.matrx-ship.json
|
|
228
|
+
|
|
229
|
+
# Matrx config (contains API keys)
|
|
230
|
+
.matrx.json
|
|
231
|
+
.matrx-tools.conf
|
|
232
|
+
|
|
233
|
+
# Claude Code local worktrees and per-user settings
|
|
234
|
+
.claude/worktrees/
|
|
235
|
+
.claude/settings.local.json
|
|
236
|
+
|
|
237
|
+
# Append-only snapshots from matrx_utils.update_history (unbounded; do not commit)
|
|
238
|
+
common/utils/data_in_code/data_history.json
|
|
239
|
+
packages/matrx-utils/matrx_utils/data_in_code/data_history.json
|
|
240
|
+
|
|
241
|
+
# Tool-dispatch debug logs — one file per server start, never committed
|
|
242
|
+
.matrx-debug/
|
|
243
|
+
|
|
244
|
+
# macOS Finder metadata
|
|
245
|
+
.DS_Store
|
|
246
|
+
**/.DS_Store
|
|
247
|
+
|
|
248
|
+
# Environment files
|
|
249
|
+
.env
|
|
250
|
+
.env.*
|
|
251
|
+
*.env
|
|
252
|
+
*.env.*
|
|
253
|
+
|
|
254
|
+
# Keep safe templates trackable
|
|
255
|
+
!.env.example
|
|
256
|
+
!.env.sample
|
|
257
|
+
!.env.template
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# matrx-rag — agent guide
|
|
2
|
+
|
|
3
|
+
Multi-tenant RAG extracted from `aidream/services/rag/`. Aidream is the
|
|
4
|
+
reference host, but the package is **standalone-first**: it is installed
|
|
5
|
+
directly for clients against a database with the EXACT schema the hosted
|
|
6
|
+
product uses. **The shared frontend reads that database directly — one
|
|
7
|
+
missing column or relationship and the install is useless.** Every design
|
|
8
|
+
rule below serves that invariant.
|
|
9
|
+
|
|
10
|
+
## Hard rules
|
|
11
|
+
|
|
12
|
+
1. **No imports from `aidream.*`, ever.** Host capabilities arrive through
|
|
13
|
+
`matrx_rag.configure(...)` seams (document_writer, embedding_provider,
|
|
14
|
+
llm_provider, vector_store, ner_extractor, auth_mode, error classes).
|
|
15
|
+
Adding a `from aidream.*` import here is a regression.
|
|
16
|
+
|
|
17
|
+
2. **This package owns the canonical rag.\* model classes.**
|
|
18
|
+
`matrx_rag/db/models_rag.py` (13 models) and the public.\* subset it reads
|
|
19
|
+
(`matrx_rag/db/models.py`) are GENERATED — by the host's
|
|
20
|
+
`python db/generate.py` via the `output:` entries in aidream's
|
|
21
|
+
`db/matrx_orm.yaml`. **Never hand-edit generated files**; change the live
|
|
22
|
+
schema, then regenerate. aidream imports these classes directly
|
|
23
|
+
(`from matrx_rag.db.models_rag import ...`) — it keeps NO copy and NO shim.
|
|
24
|
+
|
|
25
|
+
3. **One pool, decided by binding — the package NEVER opens its own
|
|
26
|
+
connection while hosted.** Models bake `_database = "matrx_rag"`.
|
|
27
|
+
- Hosted: `configure(db_config_name=...)` → `bind_to_host()` registers a
|
|
28
|
+
matrx-orm **name alias** onto the host's already-registered pool and
|
|
29
|
+
registers ONLY the rag.\* models (public.\* names resolve to the HOST's
|
|
30
|
+
classes via `matrx_orm.model_registry` — host always wins).
|
|
31
|
+
- Standalone: `matrx_rag.bootstrap_db()` registers the pool from
|
|
32
|
+
`MATRX_RAG_POSTGRES_*` env vars and registers BOTH model files.
|
|
33
|
+
- `configure(db_models=...)` is an OVERRIDE for a genuinely divergent
|
|
34
|
+
host DB — normal hosts must not pass it.
|
|
35
|
+
|
|
36
|
+
4. **DB access goes through `get_db_model(name)`** (`matrx_rag._ext`) —
|
|
37
|
+
resolution: injected override → `model_registry`. It always resolves once
|
|
38
|
+
configure/bootstrap has run; raising on an unconfigured app is correct.
|
|
39
|
+
`try_get_db_model` returning None means "not configured", not
|
|
40
|
+
"standalone" — the old raw-asyncpg fallback forks are gone; do not
|
|
41
|
+
reintroduce them.
|
|
42
|
+
|
|
43
|
+
5. **Raw SQL is allowed ONLY at the documented exception sites**, each marked
|
|
44
|
+
`# RAW-SQL EXCEPTION: <reason>` in code and ratcheted by
|
|
45
|
+
`tests/test_no_new_raw_rag_sql.py` (lower the baseline in the same change:
|
|
46
|
+
`REWRITE_RAW_RAG_SQL_BASELINE=1 uv run pytest ...`). The permanent set:
|
|
47
|
+
- `ingestion.upsert_chunks` / `stages.run_chunk` /
|
|
48
|
+
`extraction_indexing._upsert_derivative_chunks`: advisory-lock
|
|
49
|
+
transactions — every statement must share the lock-holding connection;
|
|
50
|
+
bulk primitives lack RETURNING ids; the columnar unnest write is
|
|
51
|
+
load-bearing (1.6k-chunk timeout incident).
|
|
52
|
+
- `search._build_visibility_clause` (8-branch OR + correlated EXISTS into
|
|
53
|
+
public ACL tables), `_vector_recall` (dynamic embedding table JOIN +
|
|
54
|
+
pgvector `<=>`), `_lexical_recall` (FTS — no ORM primitive).
|
|
55
|
+
- `ner._bump_confidence_avg` (correlated UPDATE...FROM avg — atomicity).
|
|
56
|
+
- `awareness.py` aggregates; public.\* host-table reads in
|
|
57
|
+
sources/stages/extraction_indexing/pdf_pipeline/library/data_stores
|
|
58
|
+
label-enrich (future DocumentWriter/label-resolver seam work).
|
|
59
|
+
- `db.apply_migration_file` DDL.
|
|
60
|
+
Pushing below this needs matrx-orm primitives (bulk RETURNING,
|
|
61
|
+
connection-scoped QueryBuilder writes, FTS, raw-WHERE fragments) — they
|
|
62
|
+
are tracked as feature requests; don't hand-roll around their absence.
|
|
63
|
+
|
|
64
|
+
6. **Migrations are owned by this package** (`matrx_rag/migrations/`) and
|
|
65
|
+
must reproduce the live rag schema EXACTLY (frontend-identical
|
|
66
|
+
invariant). Idempotent: every CREATE has IF NOT EXISTS, every policy
|
|
67
|
+
DROPs first, FKs into public.\* are catalog-guarded (`to_regclass`).
|
|
68
|
+
They are mirrored in aidream's `db/migrations/` — edit BOTH until the
|
|
69
|
+
package has its own runner. A standalone install targets a full
|
|
70
|
+
Matrx-schema database (the frontend needs all tables); this package
|
|
71
|
+
ships migrations for rag.\* only, models that *work against* public.\*,
|
|
72
|
+
never migrations that create public tables.
|
|
73
|
+
|
|
74
|
+
7. **Aidream sees this package via a re-export shim** at
|
|
75
|
+
`aidream/services/rag/` (models are imported directly from
|
|
76
|
+
`matrx_rag.db.models_rag` — no model shim). Every public symbol must stay
|
|
77
|
+
importable through that path — verified by `aidream/api/tests/`.
|
|
78
|
+
|
|
79
|
+
8. **Behavior parity is the contract.** Conversions must preserve exact
|
|
80
|
+
semantics: `IS NOT DISTINCT FROM` on nullable params → branch
|
|
81
|
+
`field=value` / `field__isnull=True`; never drop organization_id/owner_id
|
|
82
|
+
scoping; server-side `now()` → Python UTC only where the file already
|
|
83
|
+
does so.
|
|
84
|
+
|
|
85
|
+
## What goes where
|
|
86
|
+
|
|
87
|
+
- `matrx_rag/db/` — binding (`bootstrap_db`, `bind_to_host`, `get_pool`,
|
|
88
|
+
`PACKAGE_DB_NAME`) + generated models/managers. `get_models()` returns the
|
|
89
|
+
full name→class map.
|
|
90
|
+
- `matrx_rag/ingestion.py` — `ingest_source`, `upsert_chunks`,
|
|
91
|
+
`materialize_code_graph`. The high-level entry the host calls.
|
|
92
|
+
- `matrx_rag/chunking.py` — recursive text + line-window code chunker.
|
|
93
|
+
- `matrx_rag/code_chunker.py` — tree-sitter symbol chunker.
|
|
94
|
+
- `matrx_rag/embeddings.py` — OpenAI + Voyage adapters with batching.
|
|
95
|
+
- `matrx_rag/search.py` — hybrid retrieval, RRF fusion, MMR, rerank,
|
|
96
|
+
priority-aware ranking.
|
|
97
|
+
- `matrx_rag/pdf_pipeline.py` — page-aware extraction + OCR + LLM cleanup.
|
|
98
|
+
- `matrx_rag/image_pipeline.py` — vision-LLM captions → chunks.
|
|
99
|
+
- `matrx_rag/repo_ingest.py` — git repo walker.
|
|
100
|
+
- `matrx_rag/library.py` — library document workflow.
|
|
101
|
+
- `matrx_rag/data_stores.py` — curated content buckets.
|
|
102
|
+
- `matrx_rag/grounding.py` — answer-faithfulness verifier.
|
|
103
|
+
- `matrx_rag/stages.py` — resumable multi-step orchestrator.
|
|
104
|
+
- `matrx_rag/sources.py` — `resolve(source_kind, source_id)` adapter
|
|
105
|
+
(note / code_file / cld_file / library_doc / transcript / scraped /
|
|
106
|
+
repository / task / project).
|
|
107
|
+
- `matrx_rag/ner.py` — NER → kg_entities widening. `extract_entities(...)`
|
|
108
|
+
dedups typed entities by `(org, kind, lower(name))`, writes
|
|
109
|
+
`kg_chunk_entities` mentions + spans, emits `co_occurs_with` edges,
|
|
110
|
+
maintains `kg_entities.confidence_avg`. The extractor LLM call is
|
|
111
|
+
HOST-INJECTED (`configure(ner_extractor=...)`); no extractor ⇒ clean no-op.
|
|
112
|
+
- `matrx_rag/awareness.py` — agent RAG-awareness fragments.
|
|
113
|
+
- `matrx_rag/llm.py` — cleanup + contextualize prompts.
|
|
114
|
+
- `matrx_rag/processed_doc_lookup.py` — processed-document accessors.
|
|
115
|
+
- `matrx_rag/progress.py` — `ProgressEmitter` wrapper.
|
|
116
|
+
- `matrx_rag/extraction_indexing.py` — agent-extraction → RAG bridge.
|
|
117
|
+
- `matrx_rag/eval/` — retrieval + answer quality eval harness.
|
|
118
|
+
|
|
119
|
+
## Testing
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
uv run pytest packages/matrx-rag/tests # must pass with no aidream on the path
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
`test_no_new_raw_rag_sql.py` is the raw-SQL ratchet — it fails on ANY change
|
|
126
|
+
to the per-file counts (up = regression, down = lower the baseline in the
|
|
127
|
+
same change).
|
|
128
|
+
|
|
129
|
+
## When NOT to use this package
|
|
130
|
+
|
|
131
|
+
- aidream code that ONLY reads `processed_documents` / `processed_document_pages`
|
|
132
|
+
(those tables belong to `aidream.services.documents`, not RAG)
|
|
133
|
+
- File metadata / storage (that's matrx-utils)
|
|
134
|
+
- Anything that doesn't involve chunking + embedding + retrieval
|
|
135
|
+
|
|
136
|
+
— Verified against live code + Supabase project txzxabzwovsujtloxrus, 2026-06-09.
|
matrx_rag-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: matrx-rag
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Multi-tenant RAG: hybrid retrieval, chunking, embeddings, ingestion, PDF + image + repo pipelines, agent-extraction indexing, priority-aware ranking.
|
|
5
|
+
Author-email: Matrx <admin@aimatrx.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: embeddings,matrx,pgvector,rag,retrieval,supabase
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
14
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
15
|
+
Requires-Python: >=3.13
|
|
16
|
+
Requires-Dist: asyncpg>=0.30
|
|
17
|
+
Requires-Dist: cohere>=5.0
|
|
18
|
+
Requires-Dist: langchain-text-splitters>=0.3
|
|
19
|
+
Requires-Dist: matrx-orm>=3.1
|
|
20
|
+
Requires-Dist: matrx-utils
|
|
21
|
+
Requires-Dist: openai>=2.0
|
|
22
|
+
Requires-Dist: pdfplumber>=0.11
|
|
23
|
+
Requires-Dist: pydantic>=2.12
|
|
24
|
+
Requires-Dist: tiktoken>=0.8
|
|
25
|
+
Requires-Dist: voyageai>=0.3
|
|
26
|
+
Provides-Extra: api
|
|
27
|
+
Requires-Dist: fastapi>=0.115; extra == 'api'
|
|
28
|
+
Requires-Dist: matrx-connect; extra == 'api'
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: fastapi>=0.115; extra == 'dev'
|
|
31
|
+
Requires-Dist: httpx>=0.27; extra == 'dev'
|
|
32
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
33
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
34
|
+
Provides-Extra: host
|
|
35
|
+
Requires-Dist: matrx-connect; extra == 'host'
|
|
36
|
+
Description-Content-Type: text/markdown
|
|
37
|
+
|
|
38
|
+
# matrx-rag
|
|
39
|
+
|
|
40
|
+
Multi-tenant RAG: hybrid retrieval, chunking, embeddings, ingestion,
|
|
41
|
+
PDF + image + repo pipelines, agent-extraction indexing, NER/KG writes,
|
|
42
|
+
priority-aware ranking — extracted from `aidream/services/rag/` so any
|
|
43
|
+
host (aidream's cloud server, matrx-local) can consume the same stack.
|
|
44
|
+
|
|
45
|
+
**Documentation hub:** [`docs/rag_and_ner/README.md`](../../docs/rag_and_ner/README.md)
|
|
46
|
+
(Vision in `docs/knowledge/` · code truth in `docs/rag_and_ner/reality/` · backlog in `00_CLEANUP.md`.)
|
|
47
|
+
|
|
48
|
+
## What it does
|
|
49
|
+
|
|
50
|
+
- **Ingestion**: `ingest_source(source_kind, source_id, ...)` — chunk → embed → upsert; NER inline for non-code sources.
|
|
51
|
+
- **PDF / image / code repo** pipelines with provenance.
|
|
52
|
+
- **Hybrid search**: pgvector HNSW + lexical FTS → RRF → optional Cohere → MMR.
|
|
53
|
+
- **Priority-aware ranking**: agent-extracted chunks with non-zero `priority`.
|
|
54
|
+
- **Agent-extraction → RAG**: `extraction_indexing` materializes page-extraction payloads as chunks.
|
|
55
|
+
- **Data stores**: curated buckets with scoped search.
|
|
56
|
+
- **Eval harness**: retrieval + answer quality assessment.
|
|
57
|
+
|
|
58
|
+
## Host integration
|
|
59
|
+
|
|
60
|
+
- **aidream** wires the package in `aidream/package_integration.py::_configure_matrx_rag` (NER extractor, embedding cache, ORM models, search callable).
|
|
61
|
+
- **aidream/services/rag/** is a re-export shim; HTTP routes live in `aidream/api/routers/rag.py`.
|
|
62
|
+
- **Auto-ingest** (gates, budget, scope suggestions) lives in `aidream/services/auto_ingest/` — not in this package.
|
|
63
|
+
|
|
64
|
+
## Schema
|
|
65
|
+
|
|
66
|
+
Migrations: `matrx_rag/migrations/` (mirrored in `db/migrations/` until package owns the runner).
|
|
67
|
+
ORM / pgvector notes: [`docs/rag_and_ner/reality/04_ORM_AND_SCHEMA.md`](../../docs/rag_and_ner/reality/04_ORM_AND_SCHEMA.md).
|
|
68
|
+
|
|
69
|
+
## Package development
|
|
70
|
+
|
|
71
|
+
See [`CLAUDE.md`](CLAUDE.md) for injection seams, test baselines, and the no-`aidream`-imports rule.
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# matrx-rag
|
|
2
|
+
|
|
3
|
+
Multi-tenant RAG: hybrid retrieval, chunking, embeddings, ingestion,
|
|
4
|
+
PDF + image + repo pipelines, agent-extraction indexing, NER/KG writes,
|
|
5
|
+
priority-aware ranking — extracted from `aidream/services/rag/` so any
|
|
6
|
+
host (aidream's cloud server, matrx-local) can consume the same stack.
|
|
7
|
+
|
|
8
|
+
**Documentation hub:** [`docs/rag_and_ner/README.md`](../../docs/rag_and_ner/README.md)
|
|
9
|
+
(Vision in `docs/knowledge/` · code truth in `docs/rag_and_ner/reality/` · backlog in `00_CLEANUP.md`.)
|
|
10
|
+
|
|
11
|
+
## What it does
|
|
12
|
+
|
|
13
|
+
- **Ingestion**: `ingest_source(source_kind, source_id, ...)` — chunk → embed → upsert; NER inline for non-code sources.
|
|
14
|
+
- **PDF / image / code repo** pipelines with provenance.
|
|
15
|
+
- **Hybrid search**: pgvector HNSW + lexical FTS → RRF → optional Cohere → MMR.
|
|
16
|
+
- **Priority-aware ranking**: agent-extracted chunks with non-zero `priority`.
|
|
17
|
+
- **Agent-extraction → RAG**: `extraction_indexing` materializes page-extraction payloads as chunks.
|
|
18
|
+
- **Data stores**: curated buckets with scoped search.
|
|
19
|
+
- **Eval harness**: retrieval + answer quality assessment.
|
|
20
|
+
|
|
21
|
+
## Host integration
|
|
22
|
+
|
|
23
|
+
- **aidream** wires the package in `aidream/package_integration.py::_configure_matrx_rag` (NER extractor, embedding cache, ORM models, search callable).
|
|
24
|
+
- **aidream/services/rag/** is a re-export shim; HTTP routes live in `aidream/api/routers/rag.py`.
|
|
25
|
+
- **Auto-ingest** (gates, budget, scope suggestions) lives in `aidream/services/auto_ingest/` — not in this package.
|
|
26
|
+
|
|
27
|
+
## Schema
|
|
28
|
+
|
|
29
|
+
Migrations: `matrx_rag/migrations/` (mirrored in `db/migrations/` until the package owns the runner).
|
|
30
|
+
ORM / pgvector notes: [`docs/rag_and_ner/reality/04_ORM_AND_SCHEMA.md`](../../docs/rag_and_ner/reality/04_ORM_AND_SCHEMA.md).
|
|
31
|
+
|
|
32
|
+
## Package development
|
|
33
|
+
|
|
34
|
+
See [`CLAUDE.md`](CLAUDE.md) for injection seams, test baselines, and the no-`aidream`-imports rule.
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""matrx-rag — multi-tenant RAG for the matrx ecosystem.
|
|
2
|
+
|
|
3
|
+
Hybrid retrieval (vector + lexical), chunking, embeddings, ingestion,
|
|
4
|
+
PDF + image + repo pipelines, agent-extraction indexing, priority-
|
|
5
|
+
aware ranking. Extracted from ``aidream/services/rag/`` in Phase 2a so
|
|
6
|
+
matrx-local and aidream can share the same code with host-specific
|
|
7
|
+
provider injection.
|
|
8
|
+
|
|
9
|
+
Public API:
|
|
10
|
+
|
|
11
|
+
* :func:`configure` — wire host-supplied seams (call at startup).
|
|
12
|
+
|
|
13
|
+
* :class:`AuthMode` — enum for DB-auth posture (service_role / user_jwt / none).
|
|
14
|
+
|
|
15
|
+
* :class:`DocumentWriter` — protocol for hosts that own the
|
|
16
|
+
processed_documents schema (aidream). matrx-local uses the no-op
|
|
17
|
+
default for local-RAG mode.
|
|
18
|
+
|
|
19
|
+
* The submodules continue to expose their original public surface
|
|
20
|
+
(``matrx_rag.embeddings.embed``, ``matrx_rag.search.search``,
|
|
21
|
+
``matrx_rag.ingestion.ingest_source`` etc.). aidream's existing
|
|
22
|
+
imports keep working via the shim at ``aidream/services/rag/``.
|
|
23
|
+
|
|
24
|
+
Injection seams (all five live as of Phase 2b/iv):
|
|
25
|
+
|
|
26
|
+
* ``DocumentWriter`` — processed_documents persistence (2b/i).
|
|
27
|
+
* ``EmbeddingProvider`` — local vs cloud embeddings (2b/ii).
|
|
28
|
+
* ``LLMProvider`` — cleanup + contextualization model routing (2b/iii).
|
|
29
|
+
* ``VectorStore`` — pgvector vs sqlite-vss vs duckdb (2b/iv).
|
|
30
|
+
* ``AuthMode`` — service-role vs user-JWT (2b/i).
|
|
31
|
+
|
|
32
|
+
Call-site migration of the existing inline asyncpg paths to the
|
|
33
|
+
``VectorStore`` seam is incremental and lands in Phase 2b/iv follow-on
|
|
34
|
+
commits.
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
from __future__ import annotations
|
|
38
|
+
|
|
39
|
+
from matrx_rag._ext import (
|
|
40
|
+
AuthMode,
|
|
41
|
+
DocumentWriter,
|
|
42
|
+
EmbeddingProvider,
|
|
43
|
+
LLMProvider,
|
|
44
|
+
NoOpDocumentWriter,
|
|
45
|
+
RagNotConfiguredError,
|
|
46
|
+
VectorStore,
|
|
47
|
+
configure,
|
|
48
|
+
get_auth_mode,
|
|
49
|
+
get_db_config_name,
|
|
50
|
+
get_db_model,
|
|
51
|
+
get_db_models,
|
|
52
|
+
get_document_writer,
|
|
53
|
+
get_embedding_provider,
|
|
54
|
+
get_ext,
|
|
55
|
+
get_llm_provider,
|
|
56
|
+
get_ner_extractor,
|
|
57
|
+
get_ner_finisher,
|
|
58
|
+
get_vector_store,
|
|
59
|
+
is_configured,
|
|
60
|
+
try_get_db_model,
|
|
61
|
+
)
|
|
62
|
+
from matrx_rag.db import (
|
|
63
|
+
PACKAGE_DB_NAME,
|
|
64
|
+
bind_to_host,
|
|
65
|
+
bootstrap_db,
|
|
66
|
+
get_models,
|
|
67
|
+
)
|
|
68
|
+
from matrx_rag._observability import (
|
|
69
|
+
AICallTally,
|
|
70
|
+
announce_chat_call,
|
|
71
|
+
announce_embedding_call,
|
|
72
|
+
announce_extraction_call,
|
|
73
|
+
print_tally_summary,
|
|
74
|
+
tally_reset,
|
|
75
|
+
tally_snapshot,
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
__version__ = "0.1.0"
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
__all__ = [
|
|
82
|
+
"__version__",
|
|
83
|
+
"AuthMode",
|
|
84
|
+
"DocumentWriter",
|
|
85
|
+
"EmbeddingProvider",
|
|
86
|
+
"LLMProvider",
|
|
87
|
+
"VectorStore",
|
|
88
|
+
"NoOpDocumentWriter",
|
|
89
|
+
"RagNotConfiguredError",
|
|
90
|
+
"configure",
|
|
91
|
+
"is_configured",
|
|
92
|
+
"get_ext",
|
|
93
|
+
"get_document_writer",
|
|
94
|
+
"get_embedding_provider",
|
|
95
|
+
"get_llm_provider",
|
|
96
|
+
"get_ner_extractor",
|
|
97
|
+
"get_ner_finisher",
|
|
98
|
+
"get_vector_store",
|
|
99
|
+
"get_auth_mode",
|
|
100
|
+
"get_db_config_name",
|
|
101
|
+
"get_db_model",
|
|
102
|
+
"get_db_models",
|
|
103
|
+
"try_get_db_model",
|
|
104
|
+
"PACKAGE_DB_NAME",
|
|
105
|
+
"bind_to_host",
|
|
106
|
+
"bootstrap_db",
|
|
107
|
+
"get_models",
|
|
108
|
+
"AICallTally",
|
|
109
|
+
"announce_chat_call",
|
|
110
|
+
"announce_embedding_call",
|
|
111
|
+
"announce_extraction_call",
|
|
112
|
+
"print_tally_summary",
|
|
113
|
+
"tally_reset",
|
|
114
|
+
"tally_snapshot",
|
|
115
|
+
]
|