diffgrab 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffgrab/__init__.py +109 -0
- diffgrab/__main__.py +132 -0
- diffgrab/db.py +167 -0
- diffgrab/differ.py +179 -0
- diffgrab/mcp_server.py +138 -0
- diffgrab/tracker.py +241 -0
- diffgrab/visual.py +181 -0
- diffgrab-0.1.0.dist-info/METADATA +237 -0
- diffgrab-0.1.0.dist-info/RECORD +12 -0
- diffgrab-0.1.0.dist-info/WHEEL +4 -0
- diffgrab-0.1.0.dist-info/entry_points.txt +3 -0
- diffgrab-0.1.0.dist-info/licenses/LICENSE +21 -0
diffgrab/__init__.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""diffgrab — Web page change tracking with structured diffs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from diffgrab.differ import DiffResult
|
|
6
|
+
from diffgrab.tracker import DiffTracker
|
|
7
|
+
|
|
8
|
+
# Names re-exported as the package's public convenience API.
__all__ = ["DiffTracker", "DiffResult", "track", "check", "diff", "history", "untrack"]
# Package version; keep in sync with the distribution metadata.
__version__ = "0.1.0"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
async def track(url: str, interval_hours: int = 24, *, db_path: str = "") -> str:
    """Start tracking *url* for content changes.

    Args:
        url: The page to monitor.
        interval_hours: How often, in hours, the page should be re-checked
            (default: 24).
        db_path: Override for the snapshot database location (optional).

    Returns:
        A status message describing the outcome.
    """
    # Only pass db_path through when the caller supplied one, so the
    # tracker's own default path is used otherwise.
    tracker = DiffTracker(db_path=db_path) if db_path else DiffTracker()
    try:
        return await tracker.track(url, interval_hours)
    finally:
        await tracker.close()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
async def check(url: str | None = None, *, db_path: str = "") -> list[DiffResult]:
    """Re-fetch tracked pages and report what changed.

    Args:
        url: Limit the check to this URL; ``None`` checks every tracked page.
        db_path: Override for the snapshot database location (optional).

    Returns:
        One DiffResult per checked URL.
    """
    # Only pass db_path through when the caller supplied one.
    tracker = DiffTracker(db_path=db_path) if db_path else DiffTracker()
    try:
        return await tracker.check(url)
    finally:
        await tracker.close()
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
async def diff(
    url: str,
    before_id: int | None = None,
    after_id: int | None = None,
    *,
    db_path: str = "",
) -> DiffResult:
    """Produce a structured diff between two stored snapshots of *url*.

    Args:
        url: The URL whose snapshots are compared.
        before_id: Database id of the older snapshot (optional).
        after_id: Database id of the newer snapshot (optional).
        db_path: Override for the snapshot database location (optional).

    Returns:
        A DiffResult describing the changes.
    """
    # Only pass db_path through when the caller supplied one.
    tracker = DiffTracker(db_path=db_path) if db_path else DiffTracker()
    try:
        return await tracker.diff(url, before_id, after_id)
    finally:
        await tracker.close()
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
async def history(url: str, count: int = 10, *, db_path: str = "") -> list[dict]:
    """List recent snapshot metadata for *url*.

    Args:
        url: The URL whose history is requested.
        count: Upper bound on the number of snapshots returned (default: 10).
        db_path: Override for the snapshot database location (optional).

    Returns:
        Snapshot metadata dictionaries.
    """
    # Only pass db_path through when the caller supplied one.
    tracker = DiffTracker(db_path=db_path) if db_path else DiffTracker()
    try:
        return await tracker.history(url, count)
    finally:
        await tracker.close()
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
async def untrack(url: str, *, db_path: str = "") -> str:
    """Stop tracking *url*.

    Args:
        url: The URL to remove from tracking.
        db_path: Override for the snapshot database location (optional).

    Returns:
        A status message describing the outcome.
    """
    # Only pass db_path through when the caller supplied one.
    tracker = DiffTracker(db_path=db_path) if db_path else DiffTracker()
    try:
        return await tracker.untrack(url)
    finally:
        await tracker.close()
|
diffgrab/__main__.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""CLI interface for diffgrab — web page change tracking."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def main() -> None:
    """Entry point for the diffgrab CLI.

    click and rich are optional extras, so they are imported lazily and a
    helpful install hint is printed if they are missing. The commands are
    defined as closures so they can share ``console`` and the lazily
    imported click decorators.
    """
    try:
        # CLI-only dependencies: not required by the library itself.
        from click import argument, group, option
        from rich.console import Console
        from rich.table import Table
    except ImportError:
        print("CLI dependencies not installed. Run: pip install 'diffgrab[cli]'", file=sys.stderr)
        sys.exit(1)

    console = Console()

    @group()
    def cli() -> None:
        """diffgrab — Web page change tracking with structured diffs."""

    @cli.command()
    @argument("url")
    @option("--interval", default=24, type=int, help="Check interval in hours (default: 24).")
    @option("--db", default="", help="Custom database path.")
    def track(url: str, interval: int, db: str) -> None:
        """Register a URL for change tracking."""
        # Import inside the command so the package API is only loaded on use.
        from diffgrab import track as _track

        # Forward --db only when provided; otherwise use the library default.
        kwargs = {"db_path": db} if db else {}
        result = asyncio.run(_track(url, interval, **kwargs))
        console.print(result)

    @cli.command()
    @argument("url", required=False, default=None)
    @option("--db", default="", help="Custom database path.")
    def check(url: str | None, db: str) -> None:
        """Check tracked URLs for changes."""
        from diffgrab import check as _check

        kwargs = {"db_path": db} if db else {}
        # url=None means "check every tracked URL".
        results = asyncio.run(_check(url, **kwargs))

        if not results:
            console.print("[dim]No tracked URLs found.[/dim]")
            return

        # One status line (plus details when changed) per checked URL.
        for r in results:
            if r.changed:
                console.print(f"[bold red]CHANGED[/bold red] {r.url}")
                console.print(f" +{r.added_lines} / -{r.removed_lines} lines")
                if r.changed_sections:
                    # Cap at five section names to keep the line readable.
                    console.print(f" Sections: {', '.join(r.changed_sections[:5])}")
                console.print(f" {r.summary}")
            else:
                console.print(f"[green]OK[/green] {r.url} — {r.summary}")

    @cli.command()
    @argument("url")
    @option("--before", "before_id", default=None, type=int, help="Snapshot ID for before.")
    @option("--after", "after_id", default=None, type=int, help="Snapshot ID for after.")
    @option("--db", default="", help="Custom database path.")
    def diff(url: str, before_id: int | None, after_id: int | None, db: str) -> None:
        """Show structured diff between two snapshots."""
        from diffgrab import diff as _diff

        kwargs = {"db_path": db} if db else {}
        result = asyncio.run(_diff(url, before_id, after_id, **kwargs))

        if not result.changed:
            console.print(f"[green]{result.summary}[/green]")
            return

        # Header with counts/sections, then the raw unified diff body.
        console.print(f"[bold]Diff for {result.url}[/bold]")
        console.print(f"+{result.added_lines} / -{result.removed_lines} lines")
        if result.changed_sections:
            console.print(f"Sections: {', '.join(result.changed_sections)}")
        console.print()
        console.print(result.unified_diff)

    @cli.command()
    @argument("url")
    @option("--count", default=10, type=int, help="Number of snapshots to show (default: 10).")
    @option("--db", default="", help="Custom database path.")
    def history(url: str, count: int, db: str) -> None:
        """Show snapshot history for a URL."""
        from diffgrab import history as _history

        kwargs = {"db_path": db} if db else {}
        snapshots = asyncio.run(_history(url, count, **kwargs))

        if not snapshots:
            console.print(f"[dim]No snapshots for {url}[/dim]")
            return

        # Render snapshot metadata as a rich table.
        table = Table(title=f"Snapshots for {url}")
        table.add_column("ID", style="cyan")
        table.add_column("Title")
        table.add_column("Words", justify="right")
        table.add_column("Hash", max_width=12)
        table.add_column("Captured At")

        for s in snapshots:
            table.add_row(
                str(s["id"]),
                s.get("title", ""),
                str(s.get("word_count", 0)),
                # Truncated content hash; 12 hex chars is enough to eyeball.
                s["content_hash"][:12],
                s["captured_at"],
            )

        console.print(table)

    @cli.command()
    @argument("url")
    @option("--db", default="", help="Custom database path.")
    def untrack(url: str, db: str) -> None:
        """Remove a URL from tracking."""
        from diffgrab import untrack as _untrack

        kwargs = {"db_path": db} if db else {}
        result = asyncio.run(_untrack(url, **kwargs))
        console.print(result)

    # Dispatch to whichever subcommand was requested.
    cli()


if __name__ == "__main__":
    main()
|
diffgrab/db.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""SQLite storage for tracked URLs and snapshots."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import sqlite3
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)  # module logger (unused in this module so far)

# Default on-disk location; "~" is expanded in Database.__init__.
DEFAULT_DB_PATH = "~/.local/share/diffgrab/diffgrab.db"

_SCHEMA = """\
CREATE TABLE IF NOT EXISTS tracked_urls (
    id INTEGER PRIMARY KEY,
    url TEXT UNIQUE NOT NULL,
    interval_hours INTEGER DEFAULT 24,
    created_at TEXT DEFAULT CURRENT_TIMESTAMP,
    last_checked_at TEXT
);

CREATE TABLE IF NOT EXISTS snapshots (
    id INTEGER PRIMARY KEY,
    url TEXT NOT NULL,
    content_hash TEXT NOT NULL,
    markdown TEXT NOT NULL,
    title TEXT DEFAULT '',
    word_count INTEGER DEFAULT 0,
    captured_at TEXT DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (url) REFERENCES tracked_urls(url)
);
"""


class Database:
    """SQLite database for diffgrab storage.

    The connection is opened lazily on first use (WAL journaling, foreign
    keys enabled) and the schema is created if missing, so constructing a
    Database is cheap: the only side effect is creating the parent
    directory of the database file.
    """

    def __init__(self, db_path: str = DEFAULT_DB_PATH) -> None:
        """Resolve *db_path* (expanding ``~``) and ensure its directory exists."""
        resolved = Path(db_path).expanduser()
        resolved.parent.mkdir(parents=True, exist_ok=True)
        self._path = str(resolved)
        self._conn: sqlite3.Connection | None = None

    def _get_conn(self) -> sqlite3.Connection:
        """Return the open connection, creating it (and the schema) on first call."""
        if self._conn is None:
            self._conn = sqlite3.connect(self._path)
            self._conn.row_factory = sqlite3.Row
            self._conn.execute("PRAGMA journal_mode=WAL")
            self._conn.execute("PRAGMA foreign_keys=ON")
            self._init_schema()
        return self._conn

    def _init_schema(self) -> None:
        """Create tables if they do not exist yet (no-op when not connected)."""
        conn = self._conn
        if conn is None:
            return
        conn.executescript(_SCHEMA)
        conn.commit()

    # ── tracked_urls ──────────────────────────────────────────

    def add_tracked_url(self, url: str, interval_hours: int = 24) -> int:
        """Add a URL to tracking (idempotent). Returns the row id.

        If the URL is already tracked, the existing row id is returned and
        its interval is left unchanged.
        """
        conn = self._get_conn()
        cur = conn.execute(
            "INSERT OR IGNORE INTO tracked_urls (url, interval_hours) VALUES (?, ?)",
            (url, interval_hours),
        )
        conn.commit()
        if cur.lastrowid and cur.rowcount > 0:
            return cur.lastrowid
        # INSERT was ignored because the URL already exists — fetch its id.
        row = conn.execute("SELECT id FROM tracked_urls WHERE url = ?", (url,)).fetchone()
        return row["id"] if row else 0

    def get_tracked_url(self, url: str) -> dict | None:
        """Get a single tracked URL record, or None if not tracked."""
        conn = self._get_conn()
        row = conn.execute("SELECT * FROM tracked_urls WHERE url = ?", (url,)).fetchone()
        return dict(row) if row else None

    def get_all_tracked_urls(self) -> list[dict]:
        """Get all tracked URLs, oldest first."""
        conn = self._get_conn()
        rows = conn.execute("SELECT * FROM tracked_urls ORDER BY created_at").fetchall()
        return [dict(r) for r in rows]

    def update_last_checked(self, url: str) -> None:
        """Stamp *url* as checked now (server-side CURRENT_TIMESTAMP)."""
        conn = self._get_conn()
        conn.execute(
            "UPDATE tracked_urls SET last_checked_at = CURRENT_TIMESTAMP WHERE url = ?",
            (url,),
        )
        conn.commit()

    def remove_tracked_url(self, url: str) -> bool:
        """Remove a URL from tracking. Returns True if a row was removed.

        Snapshots for the URL are deleted first so the foreign-key
        relationship is never left dangling.
        """
        conn = self._get_conn()
        conn.execute("DELETE FROM snapshots WHERE url = ?", (url,))
        cur = conn.execute("DELETE FROM tracked_urls WHERE url = ?", (url,))
        conn.commit()
        return cur.rowcount > 0

    # ── snapshots ─────────────────────────────────────────────

    def add_snapshot(
        self,
        url: str,
        content_hash: str,
        markdown: str,
        title: str = "",
        word_count: int = 0,
    ) -> int:
        """Store a new snapshot. Returns the new row id (0 if undeterminable)."""
        conn = self._get_conn()
        cur = conn.execute(
            "INSERT INTO snapshots (url, content_hash, markdown, title, word_count) VALUES (?, ?, ?, ?, ?)",
            (url, content_hash, markdown, title, word_count),
        )
        conn.commit()
        return cur.lastrowid or 0

    # NOTE: captured_at defaults to CURRENT_TIMESTAMP, which has 1-second
    # resolution, so snapshots taken within the same second tie on it.
    # All "newest first" queries therefore use the monotonically increasing
    # id as a tie-breaker to preserve true insertion order.

    def get_latest_snapshot(self, url: str) -> dict | None:
        """Get the most recent snapshot for a URL, or None if there is none."""
        conn = self._get_conn()
        row = conn.execute(
            "SELECT * FROM snapshots WHERE url = ? ORDER BY captured_at DESC, id DESC LIMIT 1",
            (url,),
        ).fetchone()
        return dict(row) if row else None

    def get_snapshot_by_id(self, snapshot_id: int) -> dict | None:
        """Get a snapshot by its ID, or None if it does not exist."""
        conn = self._get_conn()
        row = conn.execute("SELECT * FROM snapshots WHERE id = ?", (snapshot_id,)).fetchone()
        return dict(row) if row else None

    def get_snapshots(self, url: str, count: int = 10) -> list[dict]:
        """Get up to *count* recent snapshots for a URL, newest first."""
        conn = self._get_conn()
        rows = conn.execute(
            "SELECT * FROM snapshots WHERE url = ? ORDER BY captured_at DESC, id DESC LIMIT ?",
            (url, count),
        ).fetchall()
        return [dict(r) for r in rows]

    def get_snapshot_pair(self, url: str) -> tuple[dict | None, dict | None]:
        """Get the two most recent snapshots as (older, newer).

        Returns (None, None) with no snapshots and (None, newest) with
        exactly one.
        """
        conn = self._get_conn()
        rows = conn.execute(
            "SELECT * FROM snapshots WHERE url = ? ORDER BY captured_at DESC, id DESC LIMIT 2",
            (url,),
        ).fetchall()
        if len(rows) == 0:
            return None, None
        if len(rows) == 1:
            return None, dict(rows[0])
        return dict(rows[1]), dict(rows[0])

    # ── lifecycle ─────────────────────────────────────────────

    def close(self) -> None:
        """Close the database connection (safe to call repeatedly)."""
        if self._conn is not None:
            self._conn.close()
            self._conn = None
|
diffgrab/differ.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""Text diff engine — unified diff + section-level analysis."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import difflib
|
|
6
|
+
import re
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
|
|
9
|
+
# Markdown heading pattern (ATX style: "# Heading" through "###### Heading").
# Group 1 is the hash run, group 2 the heading text. NOTE(review): this is
# applied via .match() to individual lines in _split_by_headings, so the
# MULTILINE flag has no effect there.
_HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class DiffResult:
    """Structured result of comparing two snapshots of the same URL."""

    url: str  # URL the snapshots were captured from
    changed: bool  # True when the before/after texts differ at all
    added_lines: int = 0  # count of "+" lines in the unified diff
    removed_lines: int = 0  # count of "-" lines in the unified diff
    changed_sections: list[str] = field(default_factory=list)  # markdown headings whose section text changed
    unified_diff: str = ""  # full unified-diff text ("" when unchanged)
    summary: str = ""  # one-line human-readable description of the change
    before_snapshot_id: int | None = None  # DB id of the older snapshot, if known
    after_snapshot_id: int | None = None  # DB id of the newer snapshot, if known
    before_timestamp: str = ""  # capture time of the older snapshot ("" if unknown)
    after_timestamp: str = ""  # capture time of the newer snapshot ("" if unknown)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _count_diff_lines(diff_text: str) -> tuple[int, int]:
|
|
31
|
+
"""Count added and removed lines from unified diff output."""
|
|
32
|
+
added = 0
|
|
33
|
+
removed = 0
|
|
34
|
+
for line in diff_text.splitlines():
|
|
35
|
+
if line.startswith("+") and not line.startswith("+++"):
|
|
36
|
+
added += 1
|
|
37
|
+
elif line.startswith("-") and not line.startswith("---"):
|
|
38
|
+
removed += 1
|
|
39
|
+
return added, removed
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _find_changed_sections(before_text: str, after_text: str) -> list[str]:
    """Return the (sorted) headings whose section bodies differ.

    Both documents are split into heading -> body maps; a heading is
    reported when its body text is not identical on both sides (including
    headings present on only one side).
    """
    old_sections = _split_by_headings(before_text)
    new_sections = _split_by_headings(after_text)

    every_heading = old_sections.keys() | new_sections.keys()
    return sorted(
        h
        for h in every_heading
        if old_sections.get(h, "") != new_sections.get(h, "")
    )
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _split_by_headings(text: str) -> dict[str, str]:
    """Partition markdown into a heading -> section-body mapping.

    Content before the first heading is keyed "(top)". A repeated heading
    overwrites the earlier section of the same name.
    """
    sections: dict[str, str] = {}
    heading = "(top)"
    body: list[str] = []

    for raw in text.splitlines():
        hit = _HEADING_RE.match(raw)
        if hit is None:
            body.append(raw)
            continue
        # Close out the section we were accumulating, then start a new one
        # named after the heading text (hashes stripped).
        sections[heading] = "\n".join(body)
        heading = hit.group(2).strip()
        body = []

    sections[heading] = "\n".join(body)
    return sections
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _generate_summary(
|
|
86
|
+
added: int,
|
|
87
|
+
removed: int,
|
|
88
|
+
changed_sections: list[str],
|
|
89
|
+
url: str,
|
|
90
|
+
) -> str:
|
|
91
|
+
"""Generate a human-readable summary of changes."""
|
|
92
|
+
if added == 0 and removed == 0:
|
|
93
|
+
return f"No changes detected for {url}."
|
|
94
|
+
|
|
95
|
+
parts: list[str] = []
|
|
96
|
+
|
|
97
|
+
if added > 0 and removed > 0:
|
|
98
|
+
parts.append(f"{added} lines added, {removed} lines removed")
|
|
99
|
+
elif added > 0:
|
|
100
|
+
parts.append(f"{added} lines added")
|
|
101
|
+
else:
|
|
102
|
+
parts.append(f"{removed} lines removed")
|
|
103
|
+
|
|
104
|
+
if changed_sections:
|
|
105
|
+
section_names = ", ".join(changed_sections[:5])
|
|
106
|
+
if len(changed_sections) > 5:
|
|
107
|
+
section_names += f" (+{len(changed_sections) - 5} more)"
|
|
108
|
+
parts.append(f"in sections: {section_names}")
|
|
109
|
+
|
|
110
|
+
return ". ".join(parts) + "."
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def compute_diff(
    before_text: str,
    after_text: str,
    url: str = "",
    before_snapshot_id: int | None = None,
    after_snapshot_id: int | None = None,
    before_timestamp: str = "",
    after_timestamp: str = "",
) -> DiffResult:
    """Compare two markdown documents and package the outcome.

    Args:
        before_text: The older markdown content.
        after_text: The newer markdown content.
        url: The URL being compared.
        before_snapshot_id: Database ID of the older snapshot.
        after_snapshot_id: Database ID of the newer snapshot.
        before_timestamp: Timestamp of the older snapshot.
        after_timestamp: Timestamp of the newer snapshot.

    Returns:
        A fully populated DiffResult.
    """
    # Provenance fields are identical in both the changed and unchanged
    # outcomes, so build them once.
    provenance = dict(
        url=url,
        before_snapshot_id=before_snapshot_id,
        after_snapshot_id=after_snapshot_id,
        before_timestamp=before_timestamp,
        after_timestamp=after_timestamp,
    )

    if before_text == after_text:
        return DiffResult(
            changed=False,
            summary=f"No changes detected for {url}.",
            **provenance,
        )

    # Label the diff sides with timestamps when we have them.
    from_label = f"before ({before_timestamp})" if before_timestamp else "before"
    to_label = f"after ({after_timestamp})" if after_timestamp else "after"
    diff_body = "\n".join(
        difflib.unified_diff(
            before_text.splitlines(keepends=True),
            after_text.splitlines(keepends=True),
            fromfile=from_label,
            tofile=to_label,
            lineterm="",
        )
    )

    added, removed = _count_diff_lines(diff_body)
    sections = _find_changed_sections(before_text, after_text)

    return DiffResult(
        changed=True,
        added_lines=added,
        removed_lines=removed,
        changed_sections=sections,
        unified_diff=diff_body,
        summary=_generate_summary(added, removed, sections, url),
        **provenance,
    )
|