docforge-cli 0.5.2__tar.gz → 0.6.0__tar.gz
This diff shows the changes between two publicly available package versions as they were released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/PKG-INFO +1 -1
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/pyproject.toml +1 -1
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/ingest.py +6 -4
- docforge_cli-0.6.0/src/docforge/sql/migrations/008_title_weighted_tsv.sql +42 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge_cli.egg-info/PKG-INFO +1 -1
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge_cli.egg-info/SOURCES.txt +1 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/LICENSE +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/README.md +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/setup.cfg +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/__init__.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/__main__.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/api.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/cli.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/config.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/crawlers/__init__.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/crawlers/confluence.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/crawlers/git.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/db.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/embedder_api.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/lint.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/mcp_server.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/processors/__init__.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/processors/chunker.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/processors/embedder.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/processors/parser.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/query_log.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/ranking.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/remote_client.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/scripts/__init__.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/scripts/eval_search.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/scripts/latency_report.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/sources.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/sql/migrations/001_add_source_identifier.sql +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/sql/migrations/002_add_status_index.sql +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/sql/migrations/003_add_source_tags.sql +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/sql/migrations/004_add_query_log.sql +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/sql/migrations/005_add_query_log_user_oid.sql +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/sql/migrations/006_add_query_log_request_ms.sql +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/sql/migrations/007_add_chunks_text_tsv.sql +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/sql/schema.sql +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/templates/docforge.yml +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/templates/docker-compose.yml +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/templates/mcp_client.py +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/templates/sources.yml +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge_cli.egg-info/dependency_links.txt +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge_cli.egg-info/entry_points.txt +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge_cli.egg-info/requires.txt +0 -0
- {docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge_cli.egg-info/top_level.txt +0 -0
|
@@ -173,14 +173,15 @@ async def _ingest_confluence_source(
|
|
|
173
173
|
await conn.execute(
|
|
174
174
|
"""
|
|
175
175
|
INSERT INTO chunks (source_id, chunk_index, text,
|
|
176
|
-
embedding, section_title)
|
|
177
|
-
VALUES ($1, $2, $3, $4, $5)
|
|
176
|
+
embedding, section_title, title)
|
|
177
|
+
VALUES ($1, $2, $3, $4, $5, $6)
|
|
178
178
|
""",
|
|
179
179
|
source_id,
|
|
180
180
|
chunk.chunk_index,
|
|
181
181
|
chunk.text,
|
|
182
182
|
np.array(embedding, dtype=np.float32),
|
|
183
183
|
chunk.section_title,
|
|
184
|
+
source.title,
|
|
184
185
|
)
|
|
185
186
|
|
|
186
187
|
logger.info("Stored %d chunks for: %s", len(chunks), source.title)
|
|
@@ -263,14 +264,15 @@ async def _ingest_git_source(
|
|
|
263
264
|
await conn.execute(
|
|
264
265
|
"""
|
|
265
266
|
INSERT INTO chunks (source_id, chunk_index, text,
|
|
266
|
-
embedding, section_title)
|
|
267
|
-
VALUES ($1, $2, $3, $4, $5)
|
|
267
|
+
embedding, section_title, title)
|
|
268
|
+
VALUES ($1, $2, $3, $4, $5, $6)
|
|
268
269
|
""",
|
|
269
270
|
source_id,
|
|
270
271
|
chunk.chunk_index,
|
|
271
272
|
chunk.text,
|
|
272
273
|
np.array(embedding, dtype=np.float32),
|
|
273
274
|
chunk.section_title,
|
|
275
|
+
file.title,
|
|
274
276
|
)
|
|
275
277
|
|
|
276
278
|
logger.info("Stored %d chunks for: %s/%s", len(chunks), source.title, file.title)
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
-- Migration 008: weighted text_tsv with title and section_title.
|
|
2
|
+
--
|
|
3
|
+
-- Replaces migration 007's plain to_tsvector('english', text) with a
|
|
4
|
+
-- three-tier weighted variant: title='A', section_title='B', text='D'.
|
|
5
|
+
-- ts_rank_cd respects setweight using its default weights array
|
|
6
|
+
-- {A: 1.0, B: 0.4, C: 0.2, D: 0.1}, so title tokens contribute ~10x a
|
|
7
|
+
-- body token in the cover-density rank (note: this is not true BM25).
|
|
8
|
+
--
|
|
9
|
+
-- Postgres GENERATED ALWAYS expressions cannot be modified in place;
|
|
10
|
+
-- the column is dropped and re-created. Lock window is roughly 15-90s
|
|
11
|
+
-- on ~20k chunks (mostly the ADD COLUMN ... STORED step recomputing
|
|
12
|
+
-- three to_tsvector calls per row under AccessExclusiveLock). Acceptable
|
|
13
|
+
-- for low-volume production; revisit if corpus grows past ~1M chunks.
|
|
14
|
+
--
|
|
15
|
+
-- Idempotency: best-effort via IF [NOT] EXISTS qualifiers. Re-running
|
|
16
|
+
-- causes an unnecessary drop+recreate of text_tsv but doesn't break
|
|
17
|
+
-- anything. The migration runs once per release in practice.
|
|
18
|
+
|
|
19
|
+
-- Step 1: add the title column (idempotent).
|
|
20
|
+
ALTER TABLE chunks ADD COLUMN IF NOT EXISTS title TEXT NOT NULL DEFAULT '';
|
|
21
|
+
|
|
22
|
+
-- Step 2: backfill title from sources via JOIN UPDATE.
|
|
23
|
+
-- Only updates rows where title is still the empty default — protects
|
|
24
|
+
-- against repeat runs that would otherwise rewrite the same data.
|
|
25
|
+
UPDATE chunks
|
|
26
|
+
SET title = s.title
|
|
27
|
+
FROM sources s
|
|
28
|
+
WHERE s.id = chunks.source_id AND chunks.title = '';
|
|
29
|
+
|
|
30
|
+
-- Step 3: drop the v0.5.0 text_tsv (plain to_tsvector('english', text)).
|
|
31
|
+
ALTER TABLE chunks DROP COLUMN IF EXISTS text_tsv;
|
|
32
|
+
|
|
33
|
+
-- Step 4: re-add text_tsv with the three-tier weighted expression.
|
|
34
|
+
ALTER TABLE chunks ADD COLUMN IF NOT EXISTS text_tsv tsvector
|
|
35
|
+
GENERATED ALWAYS AS (
|
|
36
|
+
setweight(to_tsvector('english', title), 'A') ||
|
|
37
|
+
setweight(to_tsvector('english', coalesce(section_title, '')), 'B') ||
|
|
38
|
+
setweight(to_tsvector('english', text), 'D')
|
|
39
|
+
) STORED;
|
|
40
|
+
|
|
41
|
+
-- Step 5: re-create the GIN index (was dropped with the old column).
|
|
42
|
+
CREATE INDEX IF NOT EXISTS chunks_text_tsv_idx ON chunks USING GIN (text_tsv);
|
|
@@ -33,6 +33,7 @@ src/docforge/sql/migrations/004_add_query_log.sql
|
|
|
33
33
|
src/docforge/sql/migrations/005_add_query_log_user_oid.sql
|
|
34
34
|
src/docforge/sql/migrations/006_add_query_log_request_ms.sql
|
|
35
35
|
src/docforge/sql/migrations/007_add_chunks_text_tsv.sql
|
|
36
|
+
src/docforge/sql/migrations/008_title_weighted_tsv.sql
|
|
36
37
|
src/docforge/templates/docforge.yml
|
|
37
38
|
src/docforge/templates/docker-compose.yml
|
|
38
39
|
src/docforge/templates/mcp_client.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/sql/migrations/001_add_source_identifier.sql
RENAMED
|
File without changes
|
{docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/sql/migrations/002_add_status_index.sql
RENAMED
|
File without changes
|
{docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/sql/migrations/003_add_source_tags.sql
RENAMED
|
File without changes
|
|
File without changes
|
{docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/sql/migrations/005_add_query_log_user_oid.sql
RENAMED
|
File without changes
|
|
File without changes
|
{docforge_cli-0.5.2 → docforge_cli-0.6.0}/src/docforge/sql/migrations/007_add_chunks_text_tsv.sql
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|