diffgrab 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
diffgrab/__init__.py ADDED
@@ -0,0 +1,109 @@
1
+ """diffgrab — Web page change tracking with structured diffs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from diffgrab.differ import DiffResult
6
+ from diffgrab.tracker import DiffTracker
7
+
8
+ __all__ = ["DiffTracker", "DiffResult", "track", "check", "diff", "history", "untrack"]
9
+ __version__ = "0.1.0"
10
+
11
+
12
async def track(url: str, interval_hours: int = 24, *, db_path: str = "") -> str:
    """Register a URL for change tracking.

    Args:
        url: The URL to track.
        interval_hours: Check interval in hours (default: 24).
        db_path: Custom database path (optional).

    Returns:
        Status message.
    """
    # Only forward db_path when the caller supplied one, so DiffTracker's
    # own default database location applies otherwise.
    if db_path:
        tracker = DiffTracker(db_path=db_path)
    else:
        tracker = DiffTracker()
    try:
        return await tracker.track(url, interval_hours)
    finally:
        # Always release the tracker's resources, even on failure.
        await tracker.close()
29
+
30
+
31
async def check(url: str | None = None, *, db_path: str = "") -> list[DiffResult]:
    """Check tracked URLs for changes.

    Args:
        url: Specific URL to check, or None for all.
        db_path: Custom database path (optional).

    Returns:
        List of DiffResult objects.
    """
    # A supplied db_path overrides the tracker's default database location.
    tracker = DiffTracker(db_path=db_path) if db_path else DiffTracker()
    try:
        return await tracker.check(url)
    finally:
        await tracker.close()
47
+
48
+
49
async def diff(
    url: str,
    before_id: int | None = None,
    after_id: int | None = None,
    *,
    db_path: str = "",
) -> DiffResult:
    """Get structured diff between two snapshots of a URL.

    Args:
        url: The URL to diff.
        before_id: Database ID of the older snapshot.
        after_id: Database ID of the newer snapshot.
        db_path: Custom database path (optional).

    Returns:
        DiffResult with structured diff.
    """
    # Construct with an explicit db_path only when one was provided.
    if db_path:
        t = DiffTracker(db_path=db_path)
    else:
        t = DiffTracker()
    try:
        return await t.diff(url, before_id, after_id)
    finally:
        await t.close()
73
+
74
+
75
async def history(url: str, count: int = 10, *, db_path: str = "") -> list[dict]:
    """Get snapshot history for a URL.

    Args:
        url: The URL to get history for.
        count: Maximum number of snapshots (default: 10).
        db_path: Custom database path (optional).

    Returns:
        List of snapshot metadata dicts.
    """
    # Pass db_path through only when set, preserving the tracker default.
    tracker = DiffTracker(**({"db_path": db_path} if db_path else {}))
    try:
        return await tracker.history(url, count)
    finally:
        await tracker.close()
92
+
93
+
94
async def untrack(url: str, *, db_path: str = "") -> str:
    """Remove a URL from tracking.

    Args:
        url: The URL to untrack.
        db_path: Custom database path (optional).

    Returns:
        Status message.
    """
    tracker = DiffTracker(db_path=db_path) if db_path else DiffTracker()
    try:
        return await tracker.untrack(url)
    finally:
        # Close even if untrack raised, so no connection leaks.
        await tracker.close()
diffgrab/__main__.py ADDED
@@ -0,0 +1,132 @@
1
+ """CLI interface for diffgrab — web page change tracking."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import sys
7
+
8
+
9
def main() -> None:
    """Entry point for the diffgrab CLI.

    Builds a click command group (track/check/diff/history/untrack) and runs
    it. CLI dependencies (click, rich) are optional extras and are imported
    lazily, so `import diffgrab` works without them.
    """
    try:
        # CLI extras are optional — fail with a helpful hint if missing.
        from click import argument, group, option
        from rich.console import Console
        from rich.table import Table
    except ImportError:
        print("CLI dependencies not installed. Run: pip install 'diffgrab[cli]'", file=sys.stderr)
        sys.exit(1)

    console = Console()

    @group()
    def cli() -> None:
        """diffgrab — Web page change tracking with structured diffs."""

    @cli.command()
    @argument("url")
    @option("--interval", default=24, type=int, help="Check interval in hours (default: 24).")
    @option("--db", default="", help="Custom database path.")
    def track(url: str, interval: int, db: str) -> None:
        """Register a URL for change tracking."""
        # Imported here so the package (and its async deps) load only on use.
        from diffgrab import track as _track

        # Forward db_path only when the user supplied --db, so the library
        # default database location applies otherwise.
        kwargs = {"db_path": db} if db else {}
        result = asyncio.run(_track(url, interval, **kwargs))
        console.print(result)

    @cli.command()
    @argument("url", required=False, default=None)
    @option("--db", default="", help="Custom database path.")
    def check(url: str | None, db: str) -> None:
        """Check tracked URLs for changes."""
        from diffgrab import check as _check

        kwargs = {"db_path": db} if db else {}
        # url may be None, meaning "check every tracked URL".
        results = asyncio.run(_check(url, **kwargs))

        if not results:
            console.print("[dim]No tracked URLs found.[/dim]")
            return

        for r in results:
            if r.changed:
                console.print(f"[bold red]CHANGED[/bold red] {r.url}")
                console.print(f" +{r.added_lines} / -{r.removed_lines} lines")
                if r.changed_sections:
                    # Show at most 5 section names to keep the output short.
                    console.print(f" Sections: {', '.join(r.changed_sections[:5])}")
                console.print(f" {r.summary}")
            else:
                console.print(f"[green]OK[/green] {r.url} — {r.summary}")

    @cli.command()
    @argument("url")
    @option("--before", "before_id", default=None, type=int, help="Snapshot ID for before.")
    @option("--after", "after_id", default=None, type=int, help="Snapshot ID for after.")
    @option("--db", default="", help="Custom database path.")
    def diff(url: str, before_id: int | None, after_id: int | None, db: str) -> None:
        """Show structured diff between two snapshots."""
        from diffgrab import diff as _diff

        kwargs = {"db_path": db} if db else {}
        # With no --before/--after the library compares the two most recent.
        result = asyncio.run(_diff(url, before_id, after_id, **kwargs))

        if not result.changed:
            console.print(f"[green]{result.summary}[/green]")
            return

        console.print(f"[bold]Diff for {result.url}[/bold]")
        console.print(f"+{result.added_lines} / -{result.removed_lines} lines")
        if result.changed_sections:
            console.print(f"Sections: {', '.join(result.changed_sections)}")
        console.print()
        console.print(result.unified_diff)

    @cli.command()
    @argument("url")
    @option("--count", default=10, type=int, help="Number of snapshots to show (default: 10).")
    @option("--db", default="", help="Custom database path.")
    def history(url: str, count: int, db: str) -> None:
        """Show snapshot history for a URL."""
        from diffgrab import history as _history

        kwargs = {"db_path": db} if db else {}
        snapshots = asyncio.run(_history(url, count, **kwargs))

        if not snapshots:
            console.print(f"[dim]No snapshots for {url}[/dim]")
            return

        table = Table(title=f"Snapshots for {url}")
        table.add_column("ID", style="cyan")
        table.add_column("Title")
        table.add_column("Words", justify="right")
        table.add_column("Hash", max_width=12)
        table.add_column("Captured At")

        for s in snapshots:
            table.add_row(
                str(s["id"]),
                s.get("title", ""),
                str(s.get("word_count", 0)),
                # Truncated content hash — enough to eyeball uniqueness.
                s["content_hash"][:12],
                s["captured_at"],
            )

        console.print(table)

    @cli.command()
    @argument("url")
    @option("--db", default="", help="Custom database path.")
    def untrack(url: str, db: str) -> None:
        """Remove a URL from tracking."""
        from diffgrab import untrack as _untrack

        kwargs = {"db_path": db} if db else {}
        result = asyncio.run(_untrack(url, **kwargs))
        console.print(result)

    cli()
129
+
130
+
131
+ if __name__ == "__main__":
132
+ main()
diffgrab/db.py ADDED
@@ -0,0 +1,167 @@
1
+ """SQLite storage for tracked URLs and snapshots."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import sqlite3
7
+ from pathlib import Path
8
+
9
logger = logging.getLogger(__name__)

# Default on-disk location; "~" is expanded at Database construction time.
DEFAULT_DB_PATH = "~/.local/share/diffgrab/diffgrab.db"

_SCHEMA = """\
CREATE TABLE IF NOT EXISTS tracked_urls (
    id INTEGER PRIMARY KEY,
    url TEXT UNIQUE NOT NULL,
    interval_hours INTEGER DEFAULT 24,
    created_at TEXT DEFAULT CURRENT_TIMESTAMP,
    last_checked_at TEXT
);

CREATE TABLE IF NOT EXISTS snapshots (
    id INTEGER PRIMARY KEY,
    url TEXT NOT NULL,
    content_hash TEXT NOT NULL,
    markdown TEXT NOT NULL,
    title TEXT DEFAULT '',
    word_count INTEGER DEFAULT 0,
    captured_at TEXT DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (url) REFERENCES tracked_urls(url)
);
"""


class Database:
    """SQLite database for diffgrab storage.

    The connection is opened lazily on first use; constructing a Database
    only resolves the path and ensures its parent directory exists.
    """

    def __init__(self, db_path: str = DEFAULT_DB_PATH) -> None:
        """Resolve *db_path* (expanding ``~``) and prepare its directory."""
        resolved = Path(db_path).expanduser()
        resolved.parent.mkdir(parents=True, exist_ok=True)
        self._path = str(resolved)
        self._conn: sqlite3.Connection | None = None

    def _get_conn(self) -> sqlite3.Connection:
        """Return the open connection, connecting and initializing lazily."""
        if self._conn is None:
            self._conn = sqlite3.connect(self._path)
            self._conn.row_factory = sqlite3.Row
            # WAL allows concurrent readers; FK enforcement is off by default
            # in SQLite and must be enabled per connection.
            self._conn.execute("PRAGMA journal_mode=WAL")
            self._conn.execute("PRAGMA foreign_keys=ON")
            self._init_schema()
        return self._conn

    def _init_schema(self) -> None:
        """Create the tables if they do not exist yet."""
        conn = self._conn
        if conn is None:
            return
        conn.executescript(_SCHEMA)
        conn.commit()

    # ── tracked_urls ──────────────────────────────────────────

    def add_tracked_url(self, url: str, interval_hours: int = 24) -> int:
        """Add a URL to tracking. Returns row id (existing or newly created)."""
        conn = self._get_conn()
        cur = conn.execute(
            "INSERT OR IGNORE INTO tracked_urls (url, interval_hours) VALUES (?, ?)",
            (url, interval_hours),
        )
        conn.commit()
        # rowcount > 0 means the INSERT actually happened (was not ignored).
        if cur.lastrowid and cur.rowcount > 0:
            return cur.lastrowid
        # Already existed — fetch id
        row = conn.execute("SELECT id FROM tracked_urls WHERE url = ?", (url,)).fetchone()
        return row["id"] if row else 0

    def get_tracked_url(self, url: str) -> dict | None:
        """Get a single tracked URL record, or None if not tracked."""
        conn = self._get_conn()
        row = conn.execute("SELECT * FROM tracked_urls WHERE url = ?", (url,)).fetchone()
        return dict(row) if row else None

    def get_all_tracked_urls(self) -> list[dict]:
        """Get all tracked URLs, oldest first."""
        conn = self._get_conn()
        rows = conn.execute("SELECT * FROM tracked_urls ORDER BY created_at").fetchall()
        return [dict(r) for r in rows]

    def update_last_checked(self, url: str) -> None:
        """Update the last_checked_at timestamp for a URL."""
        conn = self._get_conn()
        conn.execute(
            "UPDATE tracked_urls SET last_checked_at = CURRENT_TIMESTAMP WHERE url = ?",
            (url,),
        )
        conn.commit()

    def remove_tracked_url(self, url: str) -> bool:
        """Remove a URL and its snapshots. Returns True if the URL existed."""
        conn = self._get_conn()
        # Delete snapshots first to satisfy the foreign-key constraint.
        conn.execute("DELETE FROM snapshots WHERE url = ?", (url,))
        cur = conn.execute("DELETE FROM tracked_urls WHERE url = ?", (url,))
        conn.commit()
        return cur.rowcount > 0

    # ── snapshots ─────────────────────────────────────────────

    def add_snapshot(
        self,
        url: str,
        content_hash: str,
        markdown: str,
        title: str = "",
        word_count: int = 0,
    ) -> int:
        """Store a new snapshot. Returns row id."""
        conn = self._get_conn()
        cur = conn.execute(
            "INSERT INTO snapshots (url, content_hash, markdown, title, word_count) VALUES (?, ?, ?, ?, ?)",
            (url, content_hash, markdown, title, word_count),
        )
        conn.commit()
        return cur.lastrowid or 0

    # NOTE: captured_at is CURRENT_TIMESTAMP with 1-second resolution, so two
    # snapshots taken in the same second tie on it and SQLite's row order is
    # then unspecified. All snapshot queries therefore tie-break on id (the
    # monotonically increasing rowid) to preserve insertion order.

    def get_latest_snapshot(self, url: str) -> dict | None:
        """Get the most recent snapshot for a URL."""
        conn = self._get_conn()
        row = conn.execute(
            "SELECT * FROM snapshots WHERE url = ? ORDER BY captured_at DESC, id DESC LIMIT 1",
            (url,),
        ).fetchone()
        return dict(row) if row else None

    def get_snapshot_by_id(self, snapshot_id: int) -> dict | None:
        """Get a snapshot by its ID."""
        conn = self._get_conn()
        row = conn.execute("SELECT * FROM snapshots WHERE id = ?", (snapshot_id,)).fetchone()
        return dict(row) if row else None

    def get_snapshots(self, url: str, count: int = 10) -> list[dict]:
        """Get recent snapshots for a URL, newest first."""
        conn = self._get_conn()
        rows = conn.execute(
            "SELECT * FROM snapshots WHERE url = ? ORDER BY captured_at DESC, id DESC LIMIT ?",
            (url, count),
        ).fetchall()
        return [dict(r) for r in rows]

    def get_snapshot_pair(self, url: str) -> tuple[dict | None, dict | None]:
        """Get the two most recent snapshots (before, after). Returns (older, newer)."""
        conn = self._get_conn()
        rows = conn.execute(
            "SELECT * FROM snapshots WHERE url = ? ORDER BY captured_at DESC, id DESC LIMIT 2",
            (url,),
        ).fetchall()
        if len(rows) == 0:
            return None, None
        if len(rows) == 1:
            return None, dict(rows[0])
        # rows[0] is the newest, rows[1] the one before it.
        return dict(rows[1]), dict(rows[0])

    # ── lifecycle ─────────────────────────────────────────────

    def close(self) -> None:
        """Close the database connection (safe to call when never opened)."""
        if self._conn is not None:
            self._conn.close()
            self._conn = None
diffgrab/differ.py ADDED
@@ -0,0 +1,179 @@
1
+ """Text diff engine — unified diff + section-level analysis."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import difflib
6
+ import re
7
+ from dataclasses import dataclass, field
8
+
9
+ # Markdown heading pattern (ATX style: # Heading)
10
+ _HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
11
+
12
+
13
@dataclass
class DiffResult:
    """Structured result of comparing two snapshots of a URL."""

    url: str  # URL the comparison applies to
    changed: bool  # True when the two snapshots differ at all
    added_lines: int = 0  # count of "+" lines in the unified diff
    removed_lines: int = 0  # count of "-" lines in the unified diff
    changed_sections: list[str] = field(default_factory=list)  # markdown headings whose content changed
    unified_diff: str = ""  # full unified-diff text ("" when unchanged)
    summary: str = ""  # human-readable one-line description of the change
    before_snapshot_id: int | None = None  # DB id of the older snapshot, if known
    after_snapshot_id: int | None = None  # DB id of the newer snapshot, if known
    before_timestamp: str = ""  # capture time of the older snapshot, if known
    after_timestamp: str = ""  # capture time of the newer snapshot, if known
28
+
29
+
30
+ def _count_diff_lines(diff_text: str) -> tuple[int, int]:
31
+ """Count added and removed lines from unified diff output."""
32
+ added = 0
33
+ removed = 0
34
+ for line in diff_text.splitlines():
35
+ if line.startswith("+") and not line.startswith("+++"):
36
+ added += 1
37
+ elif line.startswith("-") and not line.startswith("---"):
38
+ removed += 1
39
+ return added, removed
40
+
41
+
42
def _find_changed_sections(before_text: str, after_text: str) -> list[str]:
    """Return markdown headings whose section content differs.

    Each document is partitioned by its headings and the body under every
    heading is compared; a heading present in only one document counts as
    changed (its counterpart body is treated as empty).

    Returns:
        Sorted list of changed heading names.
    """
    old_sections = _split_by_headings(before_text)
    new_sections = _split_by_headings(after_text)
    every_heading = set(old_sections) | set(new_sections)
    return [
        heading
        for heading in sorted(every_heading)
        if old_sections.get(heading, "") != new_sections.get(heading, "")
    ]
60
+
61
+
62
def _split_by_headings(text: str) -> dict[str, str]:
    """Split markdown text into sections keyed by ATX heading text.

    Text before the first heading goes under "(top)". When the same heading
    text occurs more than once, the bodies of all occurrences are joined
    (newline-separated) under that key — the previous last-wins behavior
    silently dropped every earlier duplicate section, so changes inside them
    went unreported by section-level analysis.

    Args:
        text: Markdown document to split.

    Returns:
        Mapping of heading text (without ``#`` markers) to section body.
    """
    sections: dict[str, str] = {}

    def _flush(heading: str, lines: list[str]) -> None:
        # Accumulate rather than overwrite so repeated headings keep all
        # their content visible to the comparison.
        body = "\n".join(lines)
        if heading in sections:
            sections[heading] = sections[heading] + "\n" + body
        else:
            sections[heading] = body

    current_heading = "(top)"
    current_lines: list[str] = []

    for line in text.splitlines():
        match = _HEADING_RE.match(line)
        if match:
            # Save previous section before starting the new one.
            _flush(current_heading, current_lines)
            current_heading = match.group(2).strip()
            current_lines = []
        else:
            current_lines.append(line)

    _flush(current_heading, current_lines)
    return sections
83
+
84
+
85
+ def _generate_summary(
86
+ added: int,
87
+ removed: int,
88
+ changed_sections: list[str],
89
+ url: str,
90
+ ) -> str:
91
+ """Generate a human-readable summary of changes."""
92
+ if added == 0 and removed == 0:
93
+ return f"No changes detected for {url}."
94
+
95
+ parts: list[str] = []
96
+
97
+ if added > 0 and removed > 0:
98
+ parts.append(f"{added} lines added, {removed} lines removed")
99
+ elif added > 0:
100
+ parts.append(f"{added} lines added")
101
+ else:
102
+ parts.append(f"{removed} lines removed")
103
+
104
+ if changed_sections:
105
+ section_names = ", ".join(changed_sections[:5])
106
+ if len(changed_sections) > 5:
107
+ section_names += f" (+{len(changed_sections) - 5} more)"
108
+ parts.append(f"in sections: {section_names}")
109
+
110
+ return ". ".join(parts) + "."
111
+
112
+
113
def compute_diff(
    before_text: str,
    after_text: str,
    url: str = "",
    before_snapshot_id: int | None = None,
    after_snapshot_id: int | None = None,
    before_timestamp: str = "",
    after_timestamp: str = "",
) -> DiffResult:
    """Compute a structured diff between two markdown texts.

    Args:
        before_text: The older markdown content.
        after_text: The newer markdown content.
        url: The URL being compared.
        before_snapshot_id: Database ID of the older snapshot.
        after_snapshot_id: Database ID of the newer snapshot.
        before_timestamp: Timestamp of the older snapshot.
        after_timestamp: Timestamp of the newer snapshot.

    Returns:
        DiffResult with line counts, changed sections, unified diff text,
        and a human-readable summary.
    """
    # Metadata shared by both the changed and unchanged result.
    provenance = {
        "url": url,
        "before_snapshot_id": before_snapshot_id,
        "after_snapshot_id": after_snapshot_id,
        "before_timestamp": before_timestamp,
        "after_timestamp": after_timestamp,
    }

    if before_text == after_text:
        return DiffResult(
            changed=False,
            summary=f"No changes detected for {url}.",
            **provenance,
        )

    # Label the diff sides with their capture timestamps when available.
    from_label = f"before ({before_timestamp})" if before_timestamp else "before"
    to_label = f"after ({after_timestamp})" if after_timestamp else "after"

    unified = "\n".join(
        difflib.unified_diff(
            before_text.splitlines(keepends=True),
            after_text.splitlines(keepends=True),
            fromfile=from_label,
            tofile=to_label,
            lineterm="",
        )
    )

    added, removed = _count_diff_lines(unified)
    sections = _find_changed_sections(before_text, after_text)

    return DiffResult(
        changed=True,
        added_lines=added,
        removed_lines=removed,
        changed_sections=sections,
        unified_diff=unified,
        summary=_generate_summary(added, removed, sections, url),
        **provenance,
    )