docforge-cli 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docforge/__init__.py +0 -0
- docforge/__main__.py +5 -0
- docforge/api.py +266 -0
- docforge/cli.py +296 -0
- docforge/config.py +99 -0
- docforge/crawlers/__init__.py +1 -0
- docforge/crawlers/confluence.py +109 -0
- docforge/crawlers/git.py +79 -0
- docforge/db.py +57 -0
- docforge/ingest.py +401 -0
- docforge/lint.py +92 -0
- docforge/mcp_server.py +188 -0
- docforge/processors/__init__.py +1 -0
- docforge/processors/chunker.py +141 -0
- docforge/processors/embedder.py +78 -0
- docforge/processors/parser.py +143 -0
- docforge/query_log.py +45 -0
- docforge/ranking.py +20 -0
- docforge/scripts/__init__.py +1 -0
- docforge/scripts/eval_search.py +226 -0
- docforge/scripts/latency_report.py +142 -0
- docforge/sources.py +46 -0
- docforge/sql/migrations/001_add_source_identifier.sql +3 -0
- docforge/sql/migrations/002_add_status_index.sql +1 -0
- docforge/sql/migrations/003_add_source_tags.sql +4 -0
- docforge/sql/migrations/004_add_query_log.sql +11 -0
- docforge/sql/migrations/005_add_query_log_user_oid.sql +2 -0
- docforge/sql/migrations/006_add_query_log_request_ms.sql +1 -0
- docforge/sql/schema.sql +29 -0
- docforge/templates/docforge.yml +11 -0
- docforge/templates/docker-compose.yml +14 -0
- docforge/templates/mcp_client.py +83 -0
- docforge/templates/sources.yml +21 -0
- docforge_cli-0.2.0.dist-info/METADATA +178 -0
- docforge_cli-0.2.0.dist-info/RECORD +39 -0
- docforge_cli-0.2.0.dist-info/WHEEL +5 -0
- docforge_cli-0.2.0.dist-info/entry_points.txt +2 -0
- docforge_cli-0.2.0.dist-info/licenses/LICENSE +21 -0
- docforge_cli-0.2.0.dist-info/top_level.txt +1 -0
docforge/crawlers/confluence.py
ADDED
@@ -0,0 +1,109 @@
"""Confluence REST API v2 page crawler with retry logic for transient errors."""

from __future__ import annotations

import hashlib
import logging
from dataclasses import dataclass

import httpx

logger = logging.getLogger(__name__)

TRANSIENT_STATUS_CODES = {429, 502, 503, 504}
MAX_RETRIES = 3
BACKOFF_BASE = 2.0


@dataclass
class CrawledPage:
    page_id: str
    title: str
    space_key: str
    html_content: str
    content_hash: str
    version: int
    url: str


async def crawl_page(
    page_id: str,
    *,
    base_url: str,
    email: str,
    api_token: str,
) -> CrawledPage:
    """Fetch a Confluence page via REST API v2 and return its content."""
    api_url = f"{base_url}/wiki/api/v2/pages/{page_id}"
    params = {"body-format": "storage"}
    auth = httpx.BasicAuth(email, api_token)

    async with httpx.AsyncClient(timeout=30.0) as client:
        response = await _request_with_retry(client, api_url, params=params, auth=auth)

    data = response.json()
    html_content = data.get("body", {}).get("storage", {}).get("value", "")
    title = data.get("title", "")
    version = data.get("version", {}).get("number", 0)
    space_id = data.get("spaceId", "")

    content_hash = hashlib.sha256(html_content.encode()).hexdigest()
    page_url = f"{base_url}/wiki/spaces/{space_id}/pages/{page_id}"

    return CrawledPage(
        page_id=page_id,
        title=title,
        space_key=space_id,
        html_content=html_content,
        content_hash=content_hash,
        version=version,
        url=page_url,
    )


async def _request_with_retry(
    client: httpx.AsyncClient,
    url: str,
    *,
    params: dict | None = None,
    auth: httpx.BasicAuth | None = None,
) -> httpx.Response:
    """Make an HTTP GET request with retry logic for transient failures."""
    import asyncio

    for attempt in range(MAX_RETRIES + 1):
        try:
            response = await client.get(url, params=params, auth=auth)

            if response.status_code == 200:
                return response

            if response.status_code in TRANSIENT_STATUS_CODES:
                retry_after = float(response.headers.get("Retry-After", BACKOFF_BASE**attempt))
                logger.warning(
                    "Transient error %d for %s, retrying in %.1fs (attempt %d/%d)",
                    response.status_code,
                    url,
                    retry_after,
                    attempt + 1,
                    MAX_RETRIES,
                )
                await asyncio.sleep(retry_after)
                continue

            # Permanent failure
            response.raise_for_status()

        except httpx.TimeoutException:
            if attempt < MAX_RETRIES:
                wait = BACKOFF_BASE**attempt
                logger.warning("Timeout for %s, retrying in %.1fs", url, wait)
                await asyncio.sleep(wait)
                continue
            raise

    raise httpx.HTTPStatusError(
        f"Max retries exceeded for {url}",
        request=httpx.Request("GET", url),
        response=response,  # type: ignore[possibly-undefined]
    )
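For orientation, a minimal driver script for this crawler could look like the sketch below; the site URL, e-mail, API token, and page id are placeholders, not values from the package.

# Illustrative sketch only; credentials and the page id are placeholders.
import asyncio

from docforge.crawlers.confluence import crawl_page


async def main() -> None:
    page = await crawl_page(
        "123456",
        base_url="https://example.atlassian.net",
        email="user@example.com",
        api_token="<api-token>",
    )
    # CrawledPage carries the storage-format HTML plus a sha256 content hash
    # that the ingest pipeline later uses for change detection.
    print(page.title, page.version, page.content_hash[:12])


asyncio.run(main())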
docforge/crawlers/git.py
ADDED
@@ -0,0 +1,79 @@
"""Crawler for local git repository documentation files.

Reads markdown files (README.md, CLAUDE.md, docs/**/*.md) from a local
git repo directory. No git clone — the repo must already be on disk.
"""

from __future__ import annotations

import hashlib
import logging
from dataclasses import dataclass
from pathlib import Path

logger = logging.getLogger(__name__)


@dataclass
class CrawledFile:
    file_path: str
    title: str
    content: str
    content_hash: str
    repo_path: str


def crawl_repo(
    repo_path: str,
    include_patterns: list[str] | None = None,
) -> list[CrawledFile]:
    """Read documentation files from a local git repository.

    Args:
        repo_path: Absolute path to the repo root (e.g., "E:/MyRepo").
        include_patterns: Glob patterns for files to include.
            Defaults to ["README.md", "CLAUDE.md", "docs/**/*.md"].
    """
    if include_patterns is None:
        include_patterns = ["README.md", "CLAUDE.md", "docs/**/*.md"]

    root = Path(repo_path)
    if not root.is_dir():
        logger.warning("Repo path does not exist: %s", repo_path)
        return []

    results: list[CrawledFile] = []
    seen: set[Path] = set()

    for pattern in include_patterns:
        for file_path in root.glob(pattern):
            if not file_path.is_file():
                continue
            if file_path in seen:
                continue
            seen.add(file_path)

            try:
                content = file_path.read_text(encoding="utf-8")
            except (UnicodeDecodeError, OSError) as e:
                logger.warning("Cannot read %s: %s", file_path, e)
                continue

            if not content.strip():
                continue

            relative = file_path.relative_to(root)
            content_hash = hashlib.sha256(content.encode()).hexdigest()

            results.append(
                CrawledFile(
                    file_path=str(relative),
                    title=str(relative),
                    content=content,
                    content_hash=content_hash,
                    repo_path=str(root),
                )
            )

    logger.info("Found %d files in %s", len(results), repo_path)
    return results
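A quick sketch of driving this crawler directly; the repository path and the extra pattern below are illustrative placeholders.

# Illustrative sketch only; the path and patterns are placeholders.
from docforge.crawlers.git import crawl_repo

files = crawl_repo("/home/me/my-repo", include_patterns=["README.md", "docs/**/*.md"])
for f in files:
    # Each CrawledFile keeps the repo-relative path plus a sha256 of the content,
    # which _ingest_git_source later compares against the stored hash.
    print(f.file_path, f.content_hash[:12])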
docforge/db.py
ADDED
@@ -0,0 +1,57 @@
"""asyncpg connection pool + pgvector registration.

Module-level `_pool` is created lazily on first `get_pool()` call and
shared across all callers. `init_db()` applies the packaged schema.sql
and any migration scripts.
"""

from __future__ import annotations

import asyncpg
from pgvector.asyncpg import register_vector

_pool: asyncpg.Pool | None = None


async def get_pool(database_url: str) -> asyncpg.Pool:
    """Return the module-level asyncpg pool, creating it on first call."""
    global _pool
    if _pool is None:
        _pool = await asyncpg.create_pool(
            database_url,
            min_size=1,
            max_size=5,
            init=_init_connection,
        )
    return _pool


async def _init_connection(conn: asyncpg.Connection) -> None:
    await register_vector(conn)


async def close_pool() -> None:
    """Close and clear the module-level asyncpg pool if it exists."""
    global _pool
    if _pool is not None:
        await _pool.close()
        _pool = None


async def init_db(database_url: str) -> None:
    """Apply schema and migrations from the docforge package."""
    import importlib.resources as resources

    sql_dir = resources.files("docforge") / "sql"
    schema_sql = (sql_dir / "schema.sql").read_text(encoding="utf-8")

    conn = await asyncpg.connect(database_url)
    try:
        await conn.execute(schema_sql)

        migrations_dir = sql_dir / "migrations"
        for migration in sorted(migrations_dir.iterdir()):
            if str(migration).endswith(".sql"):
                await conn.execute(migration.read_text(encoding="utf-8"))
    finally:
        await conn.close()
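A rough usage sketch, assuming a reachable Postgres with the pgvector extension installed; the DSN is a placeholder.

# Illustrative sketch only; the connection string is a placeholder.
import asyncio

from docforge.db import close_pool, get_pool, init_db

DSN = "postgresql://docforge:docforge@localhost:5432/docforge"


async def main() -> None:
    await init_db(DSN)          # apply the packaged schema.sql plus migrations
    pool = await get_pool(DSN)  # lazily created, shared module-level pool
    async with pool.acquire() as conn:
        print(await conn.fetchval("SELECT count(*) FROM sources"))
    await close_pool()


asyncio.run(main())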
docforge/ingest.py
ADDED
@@ -0,0 +1,401 @@
"""Ingest pipeline — crawl → parse → chunk → embed → store.

`ingest_all` loads the sources list and runs the appropriate crawler for
each source type (Confluence page or local git repo). Per-source failures
are logged but do not abort the run.
"""

from __future__ import annotations

import logging
from datetime import datetime, timezone
from typing import Callable

import asyncpg
import numpy as np

from docforge.config import Settings
from docforge.crawlers.confluence import crawl_page
from docforge.crawlers.git import crawl_repo
from docforge.db import get_pool
from docforge.processors.chunker import chunk_sections
from docforge.processors.embedder import Embedder
from docforge.processors.parser import Section, parse_confluence_html
from docforge.sources import (
    ConfluenceSourceConfig,
    GitRepoSourceConfig,
    load_sources,
)

logger = logging.getLogger(__name__)


def _git_source_identifier(repo_path: str, file_path: str) -> str:
    """Canonical identifier for a git-repo source row. Must stay in sync with
    what _ingest_git_source INSERTs and what _purge_orphans matches against."""
    return f"git:{repo_path}:{file_path}"


async def ingest_all(
    settings: Settings,
    *,
    purge_orphans: bool = False,
    confirm: bool = False,
) -> None:
    """Run the full ingest pipeline for all configured sources.

    When purge_orphans=True, after all sources have been ingested, any
    `sources` rows whose identifier is not in the current sources.yml are
    reported (and — if confirm=True — deleted). See _purge_orphans."""
    sources = load_sources(settings.sources_file)
    logger.info("Loaded %d sources from %s", len(sources), settings.sources_file)

    logger.info("Loading embedding model...")
    embedder = Embedder(settings.embedding_model, hf_token=settings.hf_token.get_secret_value())

    pool = await get_pool(settings.database_url)
    tokenizer_fn = embedder.get_tokenizer_fn()

    succeeded = 0
    failed = 0
    failed_names: list[str] = []
    current_identifiers: set[str] = set()

    for source in sources:
        try:
            if isinstance(source, ConfluenceSourceConfig):
                await _ingest_confluence_source(source, settings, pool, embedder, tokenizer_fn)
                current_identifiers.add(source.page_id)
            elif isinstance(source, GitRepoSourceConfig):
                git_identifiers = await _ingest_git_source(source, pool, embedder, tokenizer_fn)
                current_identifiers.update(git_identifiers)
            succeeded += 1
        except Exception:
            failed += 1
            failed_names.append(source.title)
            logger.error("Failed to ingest source: %s", source.title, exc_info=True)

    logger.info(
        "Ingest complete: %d succeeded, %d failed out of %d sources",
        succeeded,
        failed,
        len(sources),
    )
    if failed_names:
        logger.warning("Failed sources: %s", ", ".join(failed_names))

    if purge_orphans:
        # Only purge if ALL sources ingested cleanly. A failed source would
        # leave its identifier out of current_identifiers and get purged as
        # an orphan — data loss. Require zero failures before allowing purge.
        if failed > 0:
            logger.warning(
                "Skipping --purge-orphans: %d source(s) failed to ingest; "
                "their identifiers would be incorrectly classified as orphans.",
                failed,
            )
        else:
            await _purge_orphans(pool, current_identifiers, confirm=confirm)


async def _ingest_confluence_source(
    source: ConfluenceSourceConfig,
    settings: Settings,
    pool: asyncpg.Pool,
    embedder: Embedder,
    tokenizer_fn: Callable[[str], int],
) -> None:
    """Ingest a single Confluence page: crawl, parse HTML, chunk, embed, store."""
    logger.info("Crawling Confluence: %s (page_id=%s)", source.title, source.page_id)

    page = await crawl_page(
        source.page_id,
        base_url=settings.confluence_base_url,
        email=settings.confluence_email,
        api_token=settings.confluence_api_token.get_secret_value(),
    )

    existing_hash = await _get_existing_hash(pool, source.page_id)
    if existing_hash == page.content_hash:
        logger.info("No changes detected for: %s", source.title)
        return

    logger.info("Parsing: %s", source.title)
    sections = parse_confluence_html(page.html_content)
    logger.info("Found %d sections", len(sections))

    chunks = chunk_sections(sections, max_tokens=500, tokenizer_fn=tokenizer_fn)
    logger.info("Created %d chunks", len(chunks))

    if not chunks:
        logger.warning("No chunks produced for: %s", source.title)
        return

    logger.info("Embedding %d chunks...", len(chunks))
    texts = [chunk.text for chunk in chunks]
    embeddings = embedder.embed(texts)

    async with pool.acquire() as conn:
        async with conn.transaction():
            source_id = await conn.fetchval(
                """
                INSERT INTO sources (type, url, title, confluence_page_id,
                                     confluence_space_key, last_crawled_at,
                                     content_hash, status, tags)
                VALUES ($1, $2, $3, $4, $5, $6, $7, 'active', $8)
                ON CONFLICT (confluence_page_id)
                DO UPDATE SET
                    title = EXCLUDED.title,
                    url = EXCLUDED.url,
                    last_crawled_at = EXCLUDED.last_crawled_at,
                    content_hash = EXCLUDED.content_hash,
                    status = 'active',
                    tags = EXCLUDED.tags
                RETURNING id
                """,
                source.type,
                page.url,
                page.title,
                source.page_id,
                source.space_key,
                datetime.now(timezone.utc),
                page.content_hash,
                source.tags,
            )

            await conn.execute("DELETE FROM chunks WHERE source_id = $1", source_id)

            for chunk, embedding in zip(chunks, embeddings):
                await conn.execute(
                    """
                    INSERT INTO chunks (source_id, chunk_index, text,
                                        embedding, section_title)
                    VALUES ($1, $2, $3, $4, $5)
                    """,
                    source_id,
                    chunk.chunk_index,
                    chunk.text,
                    np.array(embedding, dtype=np.float32),
                    chunk.section_title,
                )

    logger.info("Stored %d chunks for: %s", len(chunks), source.title)


async def _ingest_git_source(
    source: GitRepoSourceConfig,
    pool: asyncpg.Pool,
    embedder: Embedder,
    tokenizer_fn: Callable[[str], int],
) -> list[str]:
    """Ingest documentation files from a local git repository.

    Returns the list of source identifiers enumerated from the repo (one per
    file crawled, not only those actually re-embedded). The caller can feed
    this into _purge_orphans without re-walking the filesystem.

    Raises FileNotFoundError if the configured repo path does not exist —
    important because crawl_repo otherwise silently returns [] for a missing
    path. A silent empty walk would let --purge-orphans delete all of the
    repo's historical sources as "orphans" on a transient mount failure."""
    from pathlib import Path

    logger.info("Crawling git repo: %s (%s)", source.title, source.repo_path)

    if not Path(source.repo_path).is_dir():
        raise FileNotFoundError(f"Configured git repo path does not exist: {source.repo_path}")

    files = crawl_repo(source.repo_path, source.include_patterns)
    identifiers = [_git_source_identifier(source.repo_path, f.file_path) for f in files]

    for file in files:
        identifier = _git_source_identifier(source.repo_path, file.file_path)

        existing_hash = await _get_hash_by_identifier(pool, identifier)
        if existing_hash == file.content_hash:
            logger.info("No changes: %s/%s", source.title, file.title)
            continue

        sections = _parse_markdown(file.content)
        chunks = chunk_sections(sections, max_tokens=500, tokenizer_fn=tokenizer_fn)

        if not chunks:
            continue

        logger.info("Embedding %d chunks for %s/%s", len(chunks), source.title, file.title)
        texts = [chunk.text for chunk in chunks]
        embeddings = embedder.embed(texts)

        url = f"file://{source.repo_path}/{file.file_path}"
        async with pool.acquire() as conn:
            async with conn.transaction():
                source_id = await conn.fetchval(
                    """
                    INSERT INTO sources (type, url, title, source_identifier,
                                         last_crawled_at, content_hash, status, tags)
                    VALUES ($1, $2, $3, $4, $5, $6, 'active', $7)
                    ON CONFLICT (source_identifier)
                    WHERE source_identifier IS NOT NULL
                    DO UPDATE SET
                        title = EXCLUDED.title,
                        last_crawled_at = EXCLUDED.last_crawled_at,
                        content_hash = EXCLUDED.content_hash,
                        status = 'active',
                        tags = EXCLUDED.tags
                    RETURNING id
                    """,
                    "git_repo",
                    url,
                    f"{source.title}/{file.title}",
                    identifier,
                    datetime.now(timezone.utc),
                    file.content_hash,
                    source.tags,
                )

                await conn.execute("DELETE FROM chunks WHERE source_id = $1", source_id)

                for chunk, embedding in zip(chunks, embeddings):
                    await conn.execute(
                        """
                        INSERT INTO chunks (source_id, chunk_index, text,
                                            embedding, section_title)
                        VALUES ($1, $2, $3, $4, $5)
                        """,
                        source_id,
                        chunk.chunk_index,
                        chunk.text,
                        np.array(embedding, dtype=np.float32),
                        chunk.section_title,
                    )

        logger.info("Stored %d chunks for: %s/%s", len(chunks), source.title, file.title)

    return identifiers


def _parse_markdown(content: str) -> list[Section]:
    """Parse markdown content into sections by headings."""
    sections: list[Section] = []
    current_title = ""
    current_level = 0
    current_parts: list[str] = []

    for line in content.split("\n"):
        if line.startswith("#"):
            if current_parts:
                text = "\n".join(current_parts).strip()
                if text:
                    sections.append(Section(title=current_title, text=text, level=current_level))
                current_parts = []

            level = len(line) - len(line.lstrip("#"))
            current_title = line.lstrip("#").strip()
            current_level = level
        else:
            current_parts.append(line)

    if current_parts:
        text = "\n".join(current_parts).strip()
        if text:
            sections.append(Section(title=current_title, text=text, level=current_level))

    return sections


async def _get_existing_hash(pool: asyncpg.Pool, page_id: str) -> str | None:
    """Get the content hash of a Confluence source."""
    async with pool.acquire() as conn:
        return await conn.fetchval(
            "SELECT content_hash FROM sources WHERE confluence_page_id = $1",
            page_id,
        )


async def _get_hash_by_identifier(pool: asyncpg.Pool, identifier: str) -> str | None:
    """Get the content hash of a source by its identifier."""
    async with pool.acquire() as conn:
        return await conn.fetchval(
            "SELECT content_hash FROM sources WHERE source_identifier = $1",
            identifier,
        )


async def _purge_orphans(
    pool: asyncpg.Pool,
    current_identifiers: set[str],
    confirm: bool,
) -> tuple[int, int]:
    """Find `sources` rows whose identifier is not in the current sources.yml,
    report them, and (if confirm=True) delete them along with their chunks.

    Identifier format:
      - Confluence: the page_id string (e.g., "5108006937")
      - Git: f"git:{repo_path}:{file_path}"

    Returns (orphan_source_count, orphan_chunk_count). When confirm=False,
    returns the counts of what WOULD be deleted and leaves the DB untouched.

    chunks.source_id has ON DELETE CASCADE, so deleting from sources
    cascades to chunks automatically.
    """
    if not current_identifiers and confirm:
        logger.error(
            "_purge_orphans called with empty current_identifiers and confirm=True. "
            "This would delete every source in the DB. Aborting — this is almost "
            "certainly a caller bug (e.g., load_sources returned empty)."
        )
        return (0, 0)

    async with pool.acquire() as conn:
        async with conn.transaction():
            rows = await conn.fetch(
                """
                SELECT id,
                       title,
                       COALESCE(confluence_page_id, source_identifier) AS identifier
                FROM sources
                WHERE COALESCE(confluence_page_id, source_identifier) IS NOT NULL
                """
            )
            db_identifiers = {r["identifier"]: r for r in rows}
            orphan_ids = [
                r["id"] for ident, r in db_identifiers.items() if ident not in current_identifiers
            ]

            if not orphan_ids:
                logger.info("No orphan sources detected.")
                return (0, 0)

            chunk_count = await conn.fetchval(
                "SELECT count(*) FROM chunks WHERE source_id = ANY($1::uuid[])",
                orphan_ids,
            )

            logger.info(
                "Orphans detected: %d sources / %d chunks not in current sources.yml",
                len(orphan_ids),
                chunk_count,
            )
            for ident, r in db_identifiers.items():
                if ident not in current_identifiers:
                    logger.debug(" orphan: %s (%s)", r["title"], ident)

            if not confirm:
                logger.info(
                    "Would delete %d orphan sources (%d chunks). Re-run with --confirm to execute.",
                    len(orphan_ids),
                    chunk_count,
                )
                return (len(orphan_ids), chunk_count)

            await conn.execute(
                "DELETE FROM sources WHERE id = ANY($1::uuid[])",
                orphan_ids,
            )
            logger.info(
                "Purged %d orphan sources (%d chunks).",
                len(orphan_ids),
                chunk_count,
            )
            return (len(orphan_ids), chunk_count)
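To make the heading-based splitting in _parse_markdown concrete, here is a standalone sketch of the same logic; the local Section dataclass is only a stand-in for docforge.processors.parser.Section, whose definition is not shown in this diff.

# Standalone sketch of the heading-split logic; Section here is a stand-in.
from dataclasses import dataclass


@dataclass
class Section:
    title: str
    text: str
    level: int


def split_markdown(content: str) -> list[Section]:
    sections: list[Section] = []
    parts: list[str] = []
    title, level = "", 0
    for line in content.split("\n"):
        if line.startswith("#"):
            # Flush the text accumulated under the previous heading.
            text = "\n".join(parts).strip()
            if text:
                sections.append(Section(title, text, level))
            parts = []
            level = len(line) - len(line.lstrip("#"))
            title = line.lstrip("#").strip()
        else:
            parts.append(line)
    text = "\n".join(parts).strip()
    if text:
        sections.append(Section(title, text, level))
    return sections


doc = "# Intro\nHello.\n## Usage\nRun it.\n"
print(split_markdown(doc))
# Yields two sections: "Intro" (level 1) with "Hello.", and "Usage" (level 2) with "Run it."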