docforge-cli 0.4.1__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/PKG-INFO +1 -1
  2. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/pyproject.toml +1 -1
  3. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/api.py +57 -23
  4. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/config.py +17 -0
  5. docforge_cli-0.5.1/src/docforge/sql/migrations/007_add_chunks_text_tsv.sql +16 -0
  6. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge_cli.egg-info/PKG-INFO +1 -1
  7. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge_cli.egg-info/SOURCES.txt +1 -0
  8. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/LICENSE +0 -0
  9. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/README.md +0 -0
  10. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/setup.cfg +0 -0
  11. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/__init__.py +0 -0
  12. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/__main__.py +0 -0
  13. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/cli.py +0 -0
  14. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/crawlers/__init__.py +0 -0
  15. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/crawlers/confluence.py +0 -0
  16. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/crawlers/git.py +0 -0
  17. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/db.py +0 -0
  18. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/embedder_api.py +0 -0
  19. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/ingest.py +0 -0
  20. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/lint.py +0 -0
  21. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/mcp_server.py +0 -0
  22. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/processors/__init__.py +0 -0
  23. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/processors/chunker.py +0 -0
  24. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/processors/embedder.py +0 -0
  25. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/processors/parser.py +0 -0
  26. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/query_log.py +0 -0
  27. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/ranking.py +0 -0
  28. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/remote_client.py +0 -0
  29. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/scripts/__init__.py +0 -0
  30. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/scripts/eval_search.py +0 -0
  31. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/scripts/latency_report.py +0 -0
  32. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/sources.py +0 -0
  33. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/sql/migrations/001_add_source_identifier.sql +0 -0
  34. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/sql/migrations/002_add_status_index.sql +0 -0
  35. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/sql/migrations/003_add_source_tags.sql +0 -0
  36. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/sql/migrations/004_add_query_log.sql +0 -0
  37. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/sql/migrations/005_add_query_log_user_oid.sql +0 -0
  38. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/sql/migrations/006_add_query_log_request_ms.sql +0 -0
  39. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/sql/schema.sql +0 -0
  40. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/templates/docforge.yml +0 -0
  41. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/templates/docker-compose.yml +0 -0
  42. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/templates/mcp_client.py +0 -0
  43. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge/templates/sources.yml +0 -0
  44. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge_cli.egg-info/dependency_links.txt +0 -0
  45. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge_cli.egg-info/entry_points.txt +0 -0
  46. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge_cli.egg-info/requires.txt +0 -0
  47. {docforge_cli-0.4.1 → docforge_cli-0.5.1}/src/docforge_cli.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docforge-cli
3
- Version: 0.4.1
3
+ Version: 0.5.1
4
4
  Summary: Forge searchable context from Confluence and git repos for AI coding assistants
5
5
  License: MIT
6
6
  Project-URL: Homepage, https://GranatenUdo.github.io/docforge/
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docforge-cli"
7
- version = "0.4.1"
7
+ version = "0.5.1"
8
8
  description = "Forge searchable context from Confluence and git repos for AI coding assistants"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -209,31 +209,65 @@ async def search(
209
209
  async with pool.acquire() as conn:
210
210
  rows = await conn.fetch(
211
211
  """
212
- SELECT
213
- c.text,
214
- c.section_title,
215
- s.title AS source_title,
216
- s.url AS source_url,
217
- s.tags AS source_tags,
218
- 1 - (c.embedding <=> $1::vector) AS similarity,
219
- (1 - (c.embedding <=> $1::vector)) *
220
- (1
221
- + $2::float * cardinality(
222
- ARRAY(SELECT unnest(s.tags) INTERSECT SELECT unnest($3::text[]))
223
- )
224
- + $4::float * (CASE WHEN 'org' = ANY(s.tags) THEN 1 ELSE 0 END)
225
- ) AS boosted_score
226
- FROM chunks c
227
- JOIN sources s ON c.source_id = s.id
228
- WHERE s.status = 'active'
212
+ WITH q_tsq AS (SELECT websearch_to_tsquery($8::regconfig, $2::text) AS q),
213
+ dense AS (
214
+ SELECT id, source_id, text, section_title,
215
+ ROW_NUMBER() OVER (ORDER BY dist) AS rank
216
+ FROM (
217
+ SELECT c.id, c.source_id, c.text, c.section_title,
218
+ c.embedding <=> $1::vector AS dist
219
+ FROM chunks c JOIN sources s ON c.source_id = s.id
220
+ WHERE s.status = 'active'
221
+ ORDER BY c.embedding <=> $1::vector
222
+ LIMIT $3
223
+ ) AS t
224
+ ),
225
+ sparse AS (
226
+ SELECT id, source_id, text, section_title,
227
+ ROW_NUMBER() OVER (ORDER BY rk DESC) AS rank
228
+ FROM (
229
+ SELECT c.id, c.source_id, c.text, c.section_title,
230
+ ts_rank_cd(c.text_tsv, (SELECT q FROM q_tsq)) AS rk
231
+ FROM chunks c JOIN sources s ON c.source_id = s.id
232
+ WHERE s.status = 'active'
233
+ AND c.text_tsv @@ (SELECT q FROM q_tsq)
234
+ ORDER BY ts_rank_cd(c.text_tsv, (SELECT q FROM q_tsq)) DESC
235
+ LIMIT $3
236
+ ) AS t
237
+ ),
238
+ fused AS (
239
+ SELECT COALESCE(d.id, sp.id) AS id,
240
+ COALESCE(d.source_id, sp.source_id) AS source_id,
241
+ COALESCE(d.text, sp.text) AS text,
242
+ COALESCE(d.section_title, sp.section_title) AS section_title,
243
+ COALESCE($10::float / ($9 + d.rank), 0)
244
+ + COALESCE($11::float / ($9 + sp.rank), 0) AS rrf
245
+ FROM dense d FULL OUTER JOIN sparse sp ON d.id = sp.id
246
+ )
247
+ SELECT f.text, f.section_title,
248
+ s.title AS source_title, s.url AS source_url, s.tags AS source_tags,
249
+ f.rrf AS similarity,
250
+ f.rrf * (1
251
+ + $4::float * cardinality(
252
+ ARRAY(SELECT unnest(s.tags) INTERSECT SELECT unnest($5::text[]))
253
+ )
254
+ + $6::float * (CASE WHEN 'org' = ANY(s.tags) THEN 1 ELSE 0 END)
255
+ ) AS boosted_score
256
+ FROM fused f JOIN sources s ON f.source_id = s.id
229
257
  ORDER BY boosted_score DESC
230
- LIMIT $5
258
+ LIMIT $7
231
259
  """,
232
- np.array(query_vector, dtype=np.float32),
233
- settings.tag_match_weight,
234
- user_tags,
235
- settings.org_tag_weight,
236
- req.limit,
260
+ np.array(query_vector, dtype=np.float32), # $1
261
+ req.query, # $2
262
+ settings.hybrid_pool_size, # $3
263
+ settings.tag_match_weight, # $4
264
+ user_tags, # $5
265
+ settings.org_tag_weight, # $6
266
+ req.limit, # $7
267
+ settings.fts_language, # $8
268
+ settings.rrf_k, # $9
269
+ settings.dense_weight, # $10
270
+ settings.sparse_weight, # $11
237
271
  )
238
272
  except Exception as e:
239
273
  logger.error("Database error during search: %s", e)
@@ -66,6 +66,23 @@ class Settings(BaseSettings):
66
66
  tag_match_weight: float = 0.1
67
67
  org_tag_weight: float = 0.05
68
68
 
69
+ # Hybrid retrieval (RRF over dense + sparse). rrf_k=60 matches the common
70
+ # default (Azure AI Search, Elasticsearch, OpenSearch); a higher k flattens
71
+ # the rank distribution, a lower k amplifies top ranks. hybrid_pool_size is the top-N
72
+ # from each retriever feeding RRF — 4-10x req.limit is the standard rule;
73
+ # req.limit caps at 50, so the default of 100 meets that rule for limits up to 25 (it is only 2x at the cap — raise it if large-limit queries under-recall).
74
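+ # fts_language is the Postgres text-search config used to parse the query; switch to 'simple' if
75
+ # non-English content appears in the corpus. NOTE: chunks.text_tsv is generated with a hard-coded 'english' config (migration 007), so the column must be regenerated with the same config or query and index will disagree.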
76
+ rrf_k: int = 60
77
+ hybrid_pool_size: int = 100
78
+ fts_language: str = "english"
79
+
80
+ # Weighted RRF — multipliers on each retriever's reciprocal-rank contribution.
81
+ # Defaults at 1.0 = classic RRF (the v0.5.0 default). Tune via env var
82
+ # (DENSE_WEIGHT / SPARSE_WEIGHT) or docforge.yml; eval-driven.
83
+ dense_weight: float = 1.0
84
+ sparse_weight: float = 1.0
85
+
69
86
  # Default identity (used as CLI flag defaults when set via env/yml)
70
87
  default_user_name: str = ""
71
88
  default_team_name: str = ""
@@ -0,0 +1,16 @@
1
+ -- Migration 007: add tsvector column and GIN index for hybrid retrieval.
2
+ --
3
+ -- text_tsv is a stored generated column (GENERATED ALWAYS AS ... STORED), so Postgres
4
+ -- backfills existing rows as part of the ALTER TABLE and recomputes it on every INSERT and UPDATE.
5
+ -- No application changes required for ingest.
6
+ --
7
+ -- The GIN index is built non-concurrently. For the current chunk count
8
+ -- (~tens of thousands) this is sub-second. If chunks grows past ~1M
9
+ -- rows, switch a future migration to CREATE INDEX CONCURRENTLY (which
10
+ -- requires running outside a transaction).
11
+
12
+ ALTER TABLE chunks
13
+ ADD COLUMN IF NOT EXISTS text_tsv tsvector
14
+ GENERATED ALWAYS AS (to_tsvector('english', text)) STORED;
15
+
16
+ CREATE INDEX IF NOT EXISTS chunks_text_tsv_idx ON chunks USING GIN (text_tsv);
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docforge-cli
3
- Version: 0.4.1
3
+ Version: 0.5.1
4
4
  Summary: Forge searchable context from Confluence and git repos for AI coding assistants
5
5
  License: MIT
6
6
  Project-URL: Homepage, https://GranatenUdo.github.io/docforge/
@@ -32,6 +32,7 @@ src/docforge/sql/migrations/003_add_source_tags.sql
32
32
  src/docforge/sql/migrations/004_add_query_log.sql
33
33
  src/docforge/sql/migrations/005_add_query_log_user_oid.sql
34
34
  src/docforge/sql/migrations/006_add_query_log_request_ms.sql
35
+ src/docforge/sql/migrations/007_add_chunks_text_tsv.sql
35
36
  src/docforge/templates/docforge.yml
36
37
  src/docforge/templates/docker-compose.yml
37
38
  src/docforge/templates/mcp_client.py
File without changes
File without changes
File without changes