diffgrab 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
diffgrab/mcp_server.py ADDED
@@ -0,0 +1,138 @@
1
+ """MCP server for diffgrab — 5 tools for web page change tracking."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import sys
7
+
8
+ try:
9
+ from fastmcp import FastMCP
10
+ except ImportError:
11
+ print("MCP dependencies not installed. Run: pip install 'diffgrab[mcp]'", file=sys.stderr)
12
+ sys.exit(1)
13
+
14
+ from diffgrab.differ import DiffResult
15
+ from diffgrab.tracker import DiffTracker
16
+
17
# FastMCP application object; the @mcp.tool() functions in this module attach to it.
mcp = FastMCP(
    "diffgrab",
    instructions="Web page change tracking with structured diffs.",
)

# Module-wide tracker shared by every tool call; stays None until
# _get_tracker() builds it.
_tracker: DiffTracker | None = None


def _get_tracker() -> DiffTracker:
    """Return the shared DiffTracker, constructing it on the first call."""
    global _tracker
    if _tracker is not None:
        return _tracker
    _tracker = DiffTracker()
    return _tracker
28
+
29
+
30
def _diff_result_to_json(result: DiffResult) -> str:
    """Render a DiffResult as a pretty-printed JSON object string.

    Args:
        result: The diff result whose attributes are copied into the payload.

    Returns:
        JSON text (2-space indent, non-ASCII characters kept as-is). The
        ``unified_diff`` value is cut to its first 2000 characters, and is an
        empty string when the attribute is falsy.
    """
    payload = {
        "url": result.url,
        "changed": result.changed,
        "added_lines": result.added_lines,
        "removed_lines": result.removed_lines,
        "changed_sections": result.changed_sections,
        # Cap the diff text so a single response stays bounded in size.
        "unified_diff": (result.unified_diff or "")[:2000],
        "summary": result.summary,
        "before_snapshot_id": result.before_snapshot_id,
        "after_snapshot_id": result.after_snapshot_id,
        "before_timestamp": result.before_timestamp,
        "after_timestamp": result.after_timestamp,
    }
    return json.dumps(payload, ensure_ascii=False, indent=2)
49
+
50
+
51
@mcp.tool()
async def track_url(url: str, interval_hours: int = 24) -> str:
    """Register a URL for change tracking. Takes an initial snapshot.

    Args:
        url: The URL to track for changes.
        interval_hours: How often to check for changes (default: 24 hours).

    Returns:
        Status message confirming tracking registration.
    """
    # NOTE(review): the docstring wording is kept as-is because FastMCP is
    # understood to publish it as the tool description — confirm.
    status_message = await _get_tracker().track(url, interval_hours)
    return status_message
64
+
65
+
66
@mcp.tool()
async def check_changes(url: str | None = None) -> str:
    """Check tracked URLs for changes. Compares current content with last snapshot.

    Args:
        url: Specific URL to check, or None to check all tracked URLs.

    Returns:
        JSON array of change results for each checked URL.
    """
    # NOTE(review): the docstring wording is kept as-is because FastMCP is
    # understood to publish it as the tool description — confirm.
    payload = []
    for diff_result in await _get_tracker().check(url):
        # Reuse the single-result serializer, then parse it back so every
        # entry has exactly the same fields as a get_diff response.
        payload.append(json.loads(_diff_result_to_json(diff_result)))
    return json.dumps(payload, ensure_ascii=False, indent=2)
83
+
84
+
85
@mcp.tool()
async def get_diff(url: str, before_id: int | None = None, after_id: int | None = None) -> str:
    """Get structured diff between two snapshots of a URL.

    Args:
        url: The URL to get diff for.
        before_id: Snapshot ID for the older version (optional, uses latest pair if omitted).
        after_id: Snapshot ID for the newer version (optional, uses latest pair if omitted).

    Returns:
        JSON object with diff details including unified diff and changed sections.
    """
    # NOTE(review): the docstring wording is kept as-is because FastMCP is
    # understood to publish it as the tool description — confirm.
    diff_result = await _get_tracker().diff(url, before_id, after_id)
    return _diff_result_to_json(diff_result)
100
+
101
+
102
@mcp.tool()
async def get_history(url: str, count: int = 10) -> str:
    """Get snapshot history for a tracked URL.

    Args:
        url: The URL to get history for.
        count: Maximum number of snapshots to return (default: 10).

    Returns:
        JSON array of snapshot metadata (id, title, word_count, hash, timestamp).
    """
    # NOTE(review): the docstring wording is kept as-is because FastMCP is
    # understood to publish it as the tool description — confirm.
    snapshot_rows = await _get_tracker().history(url, count)
    return json.dumps(snapshot_rows, ensure_ascii=False, indent=2)
116
+
117
+
118
@mcp.tool()
async def untrack_url(url: str) -> str:
    """Remove a URL from change tracking. Deletes all stored snapshots.

    Args:
        url: The URL to stop tracking.

    Returns:
        Status message confirming removal.
    """
    # NOTE(review): the docstring wording is kept as-is because FastMCP is
    # understood to publish it as the tool description — confirm.
    status_message = await _get_tracker().untrack(url)
    return status_message
130
+
131
+
132
def main() -> None:
    """Start the diffgrab MCP server by calling ``mcp.run()``."""
    mcp.run()


# Launch the server only when this module is executed as a script.
if __name__ == "__main__":
    main()
diffgrab/tracker.py ADDED
@@ -0,0 +1,241 @@
1
+ """DiffTracker — main orchestrator for web page change tracking."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import logging
7
+
8
+ from markgrab import extract as mg_extract
9
+
10
+ from diffgrab.db import DEFAULT_DB_PATH, Database
11
+ from diffgrab.differ import DiffResult, compute_diff
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def _content_hash(text: str) -> str:
    """Return the hex-encoded SHA-256 digest of ``text`` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()
19
+
20
+
21
class DiffTracker:
    """Main orchestrator for tracking web page changes.

    Usage::

        tracker = DiffTracker()
        await tracker.track("https://example.com")
        changes = await tracker.check()
        await tracker.close()

    The tracker is also an async context manager, which closes the database
    on exit even when the body raises::

        async with DiffTracker() as tracker:
            await tracker.track("https://example.com")
    """

    def __init__(self, db_path: str = DEFAULT_DB_PATH) -> None:
        """Open the snapshot database at ``db_path``."""
        self._db = Database(db_path)

    async def __aenter__(self) -> DiffTracker:
        """Enter ``async with``; returns the tracker itself."""
        return self

    async def __aexit__(self, *exc_info: object) -> None:
        """Leave ``async with``; closes the database and never suppresses errors."""
        await self.close()

    async def track(self, url: str, interval_hours: int = 24) -> str:
        """Register URL for tracking and take initial snapshot.

        The URL is registered before the page is fetched, so a failed fetch
        still leaves the URL tracked (the returned message says so).

        Args:
            url: The URL to track.
            interval_hours: How often to check for changes (default: 24h).

        Returns:
            Status message.
        """
        if self._db.get_tracked_url(url):
            return f"Already tracking: {url}"

        self._db.add_tracked_url(url, interval_hours)

        # Take initial snapshot
        try:
            result = await mg_extract(url)
            markdown = result.markdown
            title = result.title
            word_count = result.word_count
        except Exception as exc:
            logger.error("Failed to fetch initial snapshot for %s: %s", url, exc)
            return f"Tracking registered but initial snapshot failed: {exc}"

        content_hash = _content_hash(markdown)
        self._db.add_snapshot(url, content_hash, markdown, title, word_count)
        self._db.update_last_checked(url)

        return f"Now tracking: {url} (interval: {interval_hours}h, initial snapshot: {word_count} words)"

    async def check(self, url: str | None = None) -> list[DiffResult]:
        """Check tracked URLs for changes.

        If url is None, checks all tracked URLs.
        Fetches new content, compares hash with latest snapshot.
        If different, stores new snapshot and returns the computed diff.

        Args:
            url: Specific URL to check, or None for all tracked URLs.

        Returns:
            List of DiffResult objects (one per checked URL). An untracked
            ``url`` yields a single result with ``changed=False`` and a
            "URL not tracked" summary.
        """
        if url is not None:
            tracked = self._db.get_tracked_url(url)
            if tracked is None:
                return [DiffResult(url=url, changed=False, summary=f"URL not tracked: {url}")]
            urls_to_check = [tracked]
        else:
            urls_to_check = self._db.get_all_tracked_urls()

        # URLs are checked one after another, in the order the database returns them.
        results: list[DiffResult] = []
        for tracked_info in urls_to_check:
            results.append(await self._check_single(tracked_info["url"]))
        return results

    async def _check_single(self, url: str) -> DiffResult:
        """Fetch ``url`` once and compare it with its latest stored snapshot.

        Returns a ``changed=False`` result when the fetch fails, when this is
        the first snapshot, or when the content hash is unchanged; otherwise
        stores the new snapshot and returns the computed diff.
        """
        try:
            result = await mg_extract(url)
            new_markdown = result.markdown
            new_title = result.title
            new_word_count = result.word_count
        except Exception as exc:
            # A failed fetch is reported in the result instead of aborting the
            # whole check() run; last_checked is not updated in this case.
            logger.error("Failed to fetch %s: %s", url, exc)
            return DiffResult(url=url, changed=False, summary=f"Fetch failed: {exc}")

        new_hash = _content_hash(new_markdown)
        latest = self._db.get_latest_snapshot(url)

        if latest is None:
            # No previous snapshot — store this one
            self._db.add_snapshot(url, new_hash, new_markdown, new_title, new_word_count)
            self._db.update_last_checked(url)
            return DiffResult(url=url, changed=False, summary=f"First snapshot captured for {url}.")

        if new_hash == latest["content_hash"]:
            self._db.update_last_checked(url)
            return DiffResult(
                url=url,
                changed=False,
                summary=f"No changes detected for {url}.",
                after_snapshot_id=latest["id"],
                after_timestamp=latest["captured_at"],
            )

        # Content changed — store new snapshot and compute diff
        new_id = self._db.add_snapshot(url, new_hash, new_markdown, new_title, new_word_count)
        self._db.update_last_checked(url)

        # Re-read the stored row to pick up the timestamp recorded for it.
        new_snapshot = self._db.get_snapshot_by_id(new_id)
        after_timestamp = new_snapshot["captured_at"] if new_snapshot else ""

        return compute_diff(
            before_text=latest["markdown"],
            after_text=new_markdown,
            url=url,
            before_snapshot_id=latest["id"],
            after_snapshot_id=new_id,
            before_timestamp=latest["captured_at"],
            after_timestamp=after_timestamp,
        )

    async def diff(
        self,
        url: str,
        before_id: int | None = None,
        after_id: int | None = None,
    ) -> DiffResult:
        """Get structured diff between two snapshots.

        Both IDs must be given to select specific snapshots. If either is
        missing, the pair returned by the database for ``url`` is used
        instead; a lone ID is ignored and a warning is logged.

        Args:
            url: The URL to diff.
            before_id: Database ID of the older snapshot (optional).
            after_id: Database ID of the newer snapshot (optional).

        Returns:
            DiffResult with structured diff details, or a ``changed=False``
            result whose summary explains which snapshot is missing.
        """
        explicit_ids = before_id is not None and after_id is not None
        if explicit_ids:
            # NOTE(review): snapshots are looked up by ID only; nothing here
            # verifies that they belong to ``url`` — confirm whether the
            # database layer enforces that.
            before = self._db.get_snapshot_by_id(before_id)
            after = self._db.get_snapshot_by_id(after_id)
        else:
            if before_id is not None or after_id is not None:
                logger.warning(
                    "diff() for %s received only one snapshot ID; ignoring it and using the latest pair",
                    url,
                )
            before, after = self._db.get_snapshot_pair(url)

        if before is None and after is None:
            return DiffResult(url=url, changed=False, summary=f"No snapshots found for {url}.")

        if before is None:
            # With explicit IDs a missing "before" means the lookup failed,
            # not that the URL has a single snapshot — say so accurately.
            if explicit_ids:
                summary = f"Before snapshot not found for {url}."
            else:
                summary = f"Only one snapshot exists for {url}. Need at least two for diff."
            return DiffResult(
                url=url,
                changed=False,
                summary=summary,
                after_snapshot_id=after["id"],
                after_timestamp=after["captured_at"],
            )

        if after is None:
            return DiffResult(
                url=url,
                changed=False,
                summary=f"After snapshot not found for {url}.",
                before_snapshot_id=before["id"],
                before_timestamp=before["captured_at"],
            )

        return compute_diff(
            before_text=before["markdown"],
            after_text=after["markdown"],
            url=url,
            before_snapshot_id=before["id"],
            after_snapshot_id=after["id"],
            before_timestamp=before["captured_at"],
            after_timestamp=after["captured_at"],
        )

    async def history(self, url: str, count: int = 10) -> list[dict]:
        """Get snapshot history for a URL.

        Args:
            url: The URL to get history for.
            count: Maximum number of snapshots to return (default: 10).

        Returns:
            List of snapshot dicts (without full markdown content), in the
            order the database returns them (presumably newest first —
            confirm against the database layer).
        """
        snapshots = self._db.get_snapshots(url, count)
        # Return metadata only, exclude full markdown for brevity
        return [
            {
                "id": s["id"],
                "url": s["url"],
                "content_hash": s["content_hash"],
                "title": s["title"],
                "word_count": s["word_count"],
                "captured_at": s["captured_at"],
            }
            for s in snapshots
        ]

    async def untrack(self, url: str) -> str:
        """Remove URL from tracking.

        NOTE(review): snapshot deletion is presumably handled inside
        ``Database.remove_tracked_url`` — confirm.

        Args:
            url: The URL to untrack.

        Returns:
            Status message.
        """
        if self._db.remove_tracked_url(url):
            return f"Untracked: {url}"
        return f"URL was not being tracked: {url}"

    async def close(self) -> None:
        """Close the database connection."""
        self._db.close()
diffgrab/visual.py ADDED
@@ -0,0 +1,181 @@
1
+ """Visual diff — optional snapgrab integration for screenshot comparison."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ try:
12
+ from snapgrab import capture as sg_capture
13
+
14
+ _SNAPGRAB_AVAILABLE = True
15
+ except ImportError:
16
+ _SNAPGRAB_AVAILABLE = False
17
+
18
+ try:
19
+ from PIL import Image, ImageChops
20
+
21
+ _PILLOW_AVAILABLE = True
22
+ except ImportError:
23
+ _PILLOW_AVAILABLE = False
24
+
25
+
26
@dataclass
class VisualDiffResult:
    """Outcome of comparing two screenshots of one URL."""

    # URL the screenshots belong to.
    url: str
    # True when the comparison found a difference.
    changed: bool
    # Path of the older screenshot; empty when none was supplied.
    before_path: str = ""
    # Path of the newer screenshot; empty when none was taken or supplied.
    after_path: str = ""
    # Path of the saved diff image; empty when no diff image was written.
    diff_path: str = ""
    # Changed pixels divided by total pixels.
    pixel_change_ratio: float = 0.0
    # Reason the comparison could not be completed; empty on success.
    error: str = ""
37
+
38
+
39
def is_available() -> bool:
    """Report whether the optional ``snapgrab`` import succeeded.

    Only the snapgrab flag is consulted; Pillow availability is not part of
    the result.
    """
    snapgrab_ready = _SNAPGRAB_AVAILABLE
    return snapgrab_ready
42
+
43
+
44
async def capture_screenshot(url: str, output_dir: str = "/tmp/diffgrab") -> str:
    """Take a full-page screenshot of ``url`` with snapgrab.

    Args:
        url: The page to capture.
        output_dir: Directory the screenshot is written to; created
            (including parents) when it does not exist yet.

    Returns:
        The ``path`` attribute of the snapgrab capture result.

    Raises:
        ImportError: If snapgrab is not installed.
    """
    if _SNAPGRAB_AVAILABLE:
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        shot = await sg_capture(url, output_dir=output_dir, full_page=True)
        return shot.path
    raise ImportError("snapgrab is not installed. Run: pip install 'diffgrab[visual]'")
63
+
64
+
65
def compute_pixel_diff(before_path: str, after_path: str, output_path: str = "") -> tuple[bool, float, str]:
    """Compare two screenshots pixel-by-pixel.

    Both images are converted to RGB. When their sizes differ, each is
    resized (stretched) to the larger width and larger height of the two
    before comparison.

    Args:
        before_path: Path to the before screenshot.
        after_path: Path to the after screenshot.
        output_path: Optional path to save the diff image. The image is only
            written when the screenshots are considered changed.

    Returns:
        Tuple of (changed, pixel_change_ratio, diff_path). ``changed`` is True
        when more than 0.1% of the pixels differ; ``diff_path`` is empty when
        no diff image was written.

    Raises:
        ImportError: If Pillow is not installed.
    """
    if not _PILLOW_AVAILABLE:
        raise ImportError("Pillow is not installed. Run: pip install Pillow")

    # Open inside a context manager so the file handles are released;
    # convert() returns independent copies that remain usable afterwards.
    with Image.open(before_path) as raw_before, Image.open(after_path) as raw_after:
        img_before = raw_before.convert("RGB")
        img_after = raw_after.convert("RGB")

    # Resize to same dimensions if needed
    if img_before.size != img_after.size:
        target_size = (
            max(img_before.width, img_after.width),
            max(img_before.height, img_after.height),
        )
        img_before = img_before.resize(target_size)
        img_after = img_after.resize(target_size)

    diff_img = ImageChops.difference(img_before, img_after)

    total_pixels = diff_img.width * diff_img.height
    if total_pixels == 0:
        return False, 0.0, ""

    # A pixel is unchanged only when all three channel differences are zero,
    # i.e. when the per-pixel maximum over R, G and B is zero. Bucket 0 of
    # that maximum's histogram is therefore the unchanged-pixel count, which
    # avoids iterating over every pixel in Python.
    red, green, blue = diff_img.split()
    channel_max = ImageChops.lighter(ImageChops.lighter(red, green), blue)
    changed_pixels = total_pixels - channel_max.histogram()[0]

    ratio = changed_pixels / total_pixels
    changed = ratio > 0.001  # threshold: 0.1% pixels changed

    diff_path = ""
    if output_path and changed:
        diff_img.save(output_path)
        diff_path = output_path

    return changed, ratio, diff_path
114
+
115
+
116
async def visual_diff(
    url: str,
    before_path: str | None = None,
    after_path: str | None = None,
    output_dir: str = "/tmp/diffgrab",
) -> VisualDiffResult:
    """Perform visual diff between two screenshots of a URL.

    A new "after" screenshot is captured only when ``after_path`` is None;
    snapgrab is required for that case alone. A "before" screenshot is never
    captured here: without ``before_path`` the result carries an error.

    Args:
        url: The URL being compared.
        before_path: Path to the before screenshot (or None to skip).
        after_path: Path to the after screenshot (or None to capture new).
        output_dir: Directory for screenshots and diff images. The diff image
            is always named ``diff.png`` inside it, so a later call with the
            same directory overwrites an earlier diff.

    Returns:
        VisualDiffResult with comparison details. Failures are reported via
        the ``error`` field rather than raised.
    """
    try:
        if after_path is None:
            # snapgrab is only needed when a screenshot actually has to be
            # taken; two existing files can be compared without it.
            if not _SNAPGRAB_AVAILABLE:
                return VisualDiffResult(
                    url=url,
                    changed=False,
                    error="snapgrab is not installed. Run: pip install 'diffgrab[visual]'",
                )
            after_path = await capture_screenshot(url, output_dir=output_dir)

        if before_path is None:
            return VisualDiffResult(
                url=url,
                changed=False,
                after_path=after_path,
                error="No before screenshot available for comparison.",
            )

        if not _PILLOW_AVAILABLE:
            return VisualDiffResult(
                url=url,
                changed=False,
                before_path=before_path,
                after_path=after_path,
                error="Pillow is not installed for pixel comparison.",
            )

        # When both paths were supplied no capture ran, so nothing has
        # created the output directory yet; saving diff.png would fail.
        diff_dir = Path(output_dir)
        diff_dir.mkdir(parents=True, exist_ok=True)
        diff_output = str(diff_dir / "diff.png")
        changed, ratio, diff_path = compute_pixel_diff(before_path, after_path, diff_output)

        return VisualDiffResult(
            url=url,
            changed=changed,
            before_path=before_path,
            after_path=after_path,
            diff_path=diff_path,
            pixel_change_ratio=ratio,
        )

    except Exception as exc:
        # Best-effort API: any failure is logged and returned as an error
        # result instead of propagating to the caller.
        logger.error("Visual diff failed for %s: %s", url, exc)
        return VisualDiffResult(
            url=url,
            changed=False,
            error=str(exc),
        )