docforge-cli 0.4.1__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/PKG-INFO +1 -1
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/pyproject.toml +1 -1
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/api.py +55 -23
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/config.py +11 -0
- docforge_cli-0.5.0/src/docforge/sql/migrations/007_add_chunks_text_tsv.sql +16 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge_cli.egg-info/PKG-INFO +1 -1
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge_cli.egg-info/SOURCES.txt +1 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/LICENSE +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/README.md +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/setup.cfg +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/__init__.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/__main__.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/cli.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/crawlers/__init__.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/crawlers/confluence.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/crawlers/git.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/db.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/embedder_api.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/ingest.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/lint.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/mcp_server.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/processors/__init__.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/processors/chunker.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/processors/embedder.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/processors/parser.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/query_log.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/ranking.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/remote_client.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/scripts/__init__.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/scripts/eval_search.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/scripts/latency_report.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/sources.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/sql/migrations/001_add_source_identifier.sql +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/sql/migrations/002_add_status_index.sql +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/sql/migrations/003_add_source_tags.sql +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/sql/migrations/004_add_query_log.sql +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/sql/migrations/005_add_query_log_user_oid.sql +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/sql/migrations/006_add_query_log_request_ms.sql +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/sql/schema.sql +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/templates/docforge.yml +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/templates/docker-compose.yml +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/templates/mcp_client.py +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/templates/sources.yml +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge_cli.egg-info/dependency_links.txt +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge_cli.egg-info/entry_points.txt +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge_cli.egg-info/requires.txt +0 -0
- {docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge_cli.egg-info/top_level.txt +0 -0
|
@@ -209,31 +209,63 @@ async def search(
|
|
|
209
209
|
async with pool.acquire() as conn:
|
|
210
210
|
rows = await conn.fetch(
|
|
211
211
|
"""
|
|
212
|
-
SELECT
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
212
|
+
WITH q_tsq AS (SELECT websearch_to_tsquery($8::regconfig, $2::text) AS q),
|
|
213
|
+
dense AS (
|
|
214
|
+
SELECT id, source_id, text, section_title,
|
|
215
|
+
ROW_NUMBER() OVER (ORDER BY dist) AS rank
|
|
216
|
+
FROM (
|
|
217
|
+
SELECT c.id, c.source_id, c.text, c.section_title,
|
|
218
|
+
c.embedding <=> $1::vector AS dist
|
|
219
|
+
FROM chunks c JOIN sources s ON c.source_id = s.id
|
|
220
|
+
WHERE s.status = 'active'
|
|
221
|
+
ORDER BY c.embedding <=> $1::vector
|
|
222
|
+
LIMIT $3
|
|
223
|
+
) AS t
|
|
224
|
+
),
|
|
225
|
+
sparse AS (
|
|
226
|
+
SELECT id, source_id, text, section_title,
|
|
227
|
+
ROW_NUMBER() OVER (ORDER BY rk DESC) AS rank
|
|
228
|
+
FROM (
|
|
229
|
+
SELECT c.id, c.source_id, c.text, c.section_title,
|
|
230
|
+
ts_rank_cd(c.text_tsv, (SELECT q FROM q_tsq)) AS rk
|
|
231
|
+
FROM chunks c JOIN sources s ON c.source_id = s.id
|
|
232
|
+
WHERE s.status = 'active'
|
|
233
|
+
AND c.text_tsv @@ (SELECT q FROM q_tsq)
|
|
234
|
+
ORDER BY ts_rank_cd(c.text_tsv, (SELECT q FROM q_tsq)) DESC
|
|
235
|
+
LIMIT $3
|
|
236
|
+
) AS t
|
|
237
|
+
),
|
|
238
|
+
fused AS (
|
|
239
|
+
SELECT COALESCE(d.id, sp.id) AS id,
|
|
240
|
+
COALESCE(d.source_id, sp.source_id) AS source_id,
|
|
241
|
+
COALESCE(d.text, sp.text) AS text,
|
|
242
|
+
COALESCE(d.section_title, sp.section_title) AS section_title,
|
|
243
|
+
COALESCE(1.0/($9 + d.rank), 0)
|
|
244
|
+
+ COALESCE(1.0/($9 + sp.rank), 0) AS rrf
|
|
245
|
+
FROM dense d FULL OUTER JOIN sparse sp ON d.id = sp.id
|
|
246
|
+
)
|
|
247
|
+
SELECT f.text, f.section_title,
|
|
248
|
+
s.title AS source_title, s.url AS source_url, s.tags AS source_tags,
|
|
249
|
+
f.rrf AS similarity,
|
|
250
|
+
f.rrf * (1
|
|
251
|
+
+ $4::float * cardinality(
|
|
252
|
+
ARRAY(SELECT unnest(s.tags) INTERSECT SELECT unnest($5::text[]))
|
|
253
|
+
)
|
|
254
|
+
+ $6::float * (CASE WHEN 'org' = ANY(s.tags) THEN 1 ELSE 0 END)
|
|
255
|
+
) AS boosted_score
|
|
256
|
+
FROM fused f JOIN sources s ON f.source_id = s.id
|
|
229
257
|
ORDER BY boosted_score DESC
|
|
230
|
-
LIMIT $
|
|
258
|
+
LIMIT $7
|
|
231
259
|
""",
|
|
232
|
-
np.array(query_vector, dtype=np.float32),
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
settings.
|
|
236
|
-
|
|
260
|
+
np.array(query_vector, dtype=np.float32), # $1
|
|
261
|
+
req.query, # $2
|
|
262
|
+
settings.hybrid_pool_size, # $3
|
|
263
|
+
settings.tag_match_weight, # $4
|
|
264
|
+
user_tags, # $5
|
|
265
|
+
settings.org_tag_weight, # $6
|
|
266
|
+
req.limit, # $7
|
|
267
|
+
settings.fts_language, # $8
|
|
268
|
+
settings.rrf_k, # $9
|
|
237
269
|
)
|
|
238
270
|
except Exception as e:
|
|
239
271
|
logger.error("Database error during search: %s", e)
|
|
@@ -66,6 +66,17 @@ class Settings(BaseSettings):
|
|
|
66
66
|
tag_match_weight: float = 0.1
|
|
67
67
|
org_tag_weight: float = 0.05
|
|
68
68
|
|
|
69
|
+
# Hybrid retrieval (RRF over dense + sparse). rrf_k=60 matches the universal
|
|
70
|
+
# default (Azure AI Search, Elasticsearch, OpenSearch); higher k flattens
|
|
71
|
+
# the rank distribution, lower k sharpens it. hybrid_pool_size is the top-N
|
|
72
|
+
# from each retriever feeding RRF — 4-10x req.limit is the standard rule,
|
|
73
|
+
# and req.limit caps at 50, so 100 meets it up to limit=25 and is still 2x at the cap.
|
|
74
|
+
# fts_language is the Postgres text-search config; switch to 'simple' if
|
|
75
|
+
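# non-English content appears in the corpus — and rebuild chunks.text_tsv to match, since migration 007 hard-codes 'english' for that generated column.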
|
|
76
|
+
rrf_k: int = 60
|
|
77
|
+
hybrid_pool_size: int = 100
|
|
78
|
+
fts_language: str = "english"
|
|
79
|
+
|
|
69
80
|
# Default identity (used as CLI flag defaults when set via env/yml)
|
|
70
81
|
default_user_name: str = ""
|
|
71
82
|
default_team_name: str = ""
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
-- Migration 007: add tsvector column and GIN index for hybrid retrieval.
|
|
2
|
+
--
|
|
3
|
+
-- text_tsv is GENERATED ALWAYS AS STORED, so Postgres backfills existing
|
|
4
|
+
-- rows as part of the ALTER TABLE and auto-populates on every INSERT.
|
|
5
|
+
-- No application changes required for ingest.
|
|
6
|
+
--
|
|
7
|
+
-- The GIN index is built non-concurrently. For the current chunk count
|
|
8
|
+
-- (~tens of thousands) this is sub-second. If chunks grows past ~1M
|
|
9
|
+
-- rows, switch a future migration to CREATE INDEX CONCURRENTLY (which
|
|
10
|
+
-- requires running outside a transaction).
|
|
11
|
+
|
|
12
|
+
ALTER TABLE chunks
|
|
13
|
+
ADD COLUMN IF NOT EXISTS text_tsv tsvector
|
|
14
|
+
GENERATED ALWAYS AS (to_tsvector('english', text)) STORED;
|
|
15
|
+
|
|
16
|
+
CREATE INDEX IF NOT EXISTS chunks_text_tsv_idx ON chunks USING GIN (text_tsv);
|
|
@@ -32,6 +32,7 @@ src/docforge/sql/migrations/003_add_source_tags.sql
|
|
|
32
32
|
src/docforge/sql/migrations/004_add_query_log.sql
|
|
33
33
|
src/docforge/sql/migrations/005_add_query_log_user_oid.sql
|
|
34
34
|
src/docforge/sql/migrations/006_add_query_log_request_ms.sql
|
|
35
|
+
src/docforge/sql/migrations/007_add_chunks_text_tsv.sql
|
|
35
36
|
src/docforge/templates/docforge.yml
|
|
36
37
|
src/docforge/templates/docker-compose.yml
|
|
37
38
|
src/docforge/templates/mcp_client.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/sql/migrations/001_add_source_identifier.sql
RENAMED
|
File without changes
|
{docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/sql/migrations/002_add_status_index.sql
RENAMED
|
File without changes
|
{docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/sql/migrations/003_add_source_tags.sql
RENAMED
|
File without changes
|
|
File without changes
|
{docforge_cli-0.4.1 → docforge_cli-0.5.0}/src/docforge/sql/migrations/005_add_query_log_user_oid.sql
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|