khiip 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. khiip-0.2.0/.gitignore +39 -0
  2. khiip-0.2.0/CHANGELOG.md +489 -0
  3. khiip-0.2.0/CODE_OF_CONDUCT.md +31 -0
  4. khiip-0.2.0/CONTRIBUTING.md +140 -0
  5. khiip-0.2.0/DISCLAIMER.md +98 -0
  6. khiip-0.2.0/LICENSE +662 -0
  7. khiip-0.2.0/PKG-INFO +360 -0
  8. khiip-0.2.0/README.md +308 -0
  9. khiip-0.2.0/SECURITY.md +83 -0
  10. khiip-0.2.0/pyproject.toml +146 -0
  11. khiip-0.2.0/src/khiip/__init__.py +0 -0
  12. khiip-0.2.0/src/khiip/auth.py +76 -0
  13. khiip-0.2.0/src/khiip/cli.py +436 -0
  14. khiip-0.2.0/src/khiip/config.py +478 -0
  15. khiip-0.2.0/src/khiip/daemon.py +1443 -0
  16. khiip-0.2.0/src/khiip/embeddings/__init__.py +12 -0
  17. khiip-0.2.0/src/khiip/embeddings/base.py +51 -0
  18. khiip-0.2.0/src/khiip/embeddings/compose.py +391 -0
  19. khiip-0.2.0/src/khiip/embeddings/minilm.py +61 -0
  20. khiip-0.2.0/src/khiip/extractors/__init__.py +31 -0
  21. khiip-0.2.0/src/khiip/extractors/base.py +196 -0
  22. khiip-0.2.0/src/khiip/extractors/media_fetchers.py +781 -0
  23. khiip-0.2.0/src/khiip/extractors/pdf.py +563 -0
  24. khiip-0.2.0/src/khiip/extractors/reddit.py +714 -0
  25. khiip-0.2.0/src/khiip/extractors/reddit_common.py +783 -0
  26. khiip-0.2.0/src/khiip/extractors/resilience.py +129 -0
  27. khiip-0.2.0/src/khiip/extractors/resilience_media.py +400 -0
  28. khiip-0.2.0/src/khiip/extractors/web.py +332 -0
  29. khiip-0.2.0/src/khiip/extractors/web_common.py +872 -0
  30. khiip-0.2.0/src/khiip/extractors/wiki.py +740 -0
  31. khiip-0.2.0/src/khiip/extractors/x.py +571 -0
  32. khiip-0.2.0/src/khiip/extractors/x_common.py +355 -0
  33. khiip-0.2.0/src/khiip/extractors/youtube.py +485 -0
  34. khiip-0.2.0/src/khiip/extractors/youtube_common.py +590 -0
  35. khiip-0.2.0/src/khiip/mcp/__init__.py +27 -0
  36. khiip-0.2.0/src/khiip/mcp/__main__.py +39 -0
  37. khiip-0.2.0/src/khiip/mcp/client.py +170 -0
  38. khiip-0.2.0/src/khiip/mcp/server.py +235 -0
  39. khiip-0.2.0/src/khiip/models.py +806 -0
  40. khiip-0.2.0/src/khiip/py.typed +0 -0
  41. khiip-0.2.0/src/khiip/renderers/__init__.py +143 -0
  42. khiip-0.2.0/src/khiip/renderers/base.py +246 -0
  43. khiip-0.2.0/src/khiip/renderers/json_renderer.py +68 -0
  44. khiip-0.2.0/src/khiip/renderers/legacy_passthrough.py +85 -0
  45. khiip-0.2.0/src/khiip/renderers/plain.py +560 -0
  46. khiip-0.2.0/src/khiip/renderers/registry.py +106 -0
  47. khiip-0.2.0/src/khiip/renderers/source_banner.py +145 -0
  48. khiip-0.2.0/src/khiip/renderers/status_callouts.py +106 -0
  49. khiip-0.2.0/src/khiip/renderers/transcript.py +79 -0
  50. khiip-0.2.0/src/khiip/renderers/vault_frontmatter.py +118 -0
  51. khiip-0.2.0/src/khiip/storage/__init__.py +0 -0
  52. khiip-0.2.0/src/khiip/storage/captures.py +356 -0
  53. khiip-0.2.0/src/khiip/storage/db.py +165 -0
  54. khiip-0.2.0/src/khiip/storage/embeddings_store.py +151 -0
  55. khiip-0.2.0/src/khiip/storage/filesystem.py +242 -0
  56. khiip-0.2.0/src/khiip/storage/schema.sql +214 -0
  57. khiip-0.2.0/src/khiip/storage/source_tier.py +183 -0
  58. khiip-0.2.0/src/khiip/validate.py +436 -0
  59. khiip-0.2.0/src/khiip/version.py +3 -0
  60. khiip-0.2.0/src/khiip/wayback.py +225 -0
  61. khiip-0.2.0/tests/__init__.py +0 -0
  62. khiip-0.2.0/tests/conftest.py +149 -0
  63. khiip-0.2.0/tests/test_carve_boundary.py +167 -0
  64. khiip-0.2.0/tests/test_cli.py +146 -0
  65. khiip-0.2.0/tests/test_config.py +321 -0
  66. khiip-0.2.0/tests/test_daemon_smoke.py +2011 -0
  67. khiip-0.2.0/tests/test_embeddings_compose.py +341 -0
  68. khiip-0.2.0/tests/test_embeddings_store.py +166 -0
  69. khiip-0.2.0/tests/test_env_isolation.py +129 -0
  70. khiip-0.2.0/tests/test_extractors.py +4676 -0
  71. khiip-0.2.0/tests/test_mcp.py +442 -0
  72. khiip-0.2.0/tests/test_media_fetchers.py +1405 -0
  73. khiip-0.2.0/tests/test_renderers.py +930 -0
  74. khiip-0.2.0/tests/test_resilience_media.py +480 -0
  75. khiip-0.2.0/tests/test_storage.py +662 -0
  76. khiip-0.2.0/tests/test_validate.py +276 -0
  77. khiip-0.2.0/tests/test_wayback.py +260 -0
khiip-0.2.0/.gitignore ADDED
@@ -0,0 +1,39 @@
1
+ # Secrets / credentials
2
+ .env
3
+ .env.*
4
+ *.key
5
+ *.pem
6
+ .aws/
7
+ .config/khiip/
8
+
9
+ # OS / editor
10
+ .DS_Store
11
+ .vscode/
12
+ .idea/
13
+ *.swp
14
+
15
+ # Python
16
+ __pycache__/
17
+ *.pyc
18
+ *.egg-info/
19
+ .venv/
20
+ venv/
21
+ uv.lock
22
+ dist/
23
+ build/
24
+ .pytest_cache/
25
+ .ruff_cache/
26
+ .python-version
27
+
28
+ # JS (plugin scaffolding when added)
29
+ node_modules/
30
+
31
+ # Runtime data (daemon writes locally)
32
+ *.sqlite
33
+ *.db
34
+ *.log
35
+
36
+ # Local-only working files — never committed to the public repo.
37
+ /CLAUDE.md
38
+ /_internal/
39
+ /khiip-internal/
khiip-0.2.0/CHANGELOG.md ADDED
@@ -0,0 +1,489 @@
1
+ # Changelog
2
+
3
+ All notable changes to **Khiip** are documented in this file. Format adapted
4
+ from [Keep a Changelog](https://keepachangelog.com/en/1.1.0/); versioning per
5
+ [Semantic Versioning](https://semver.org/spec/v2.0.0.html). v0.x is pre-1.0;
6
+ substrate decisions can still change until v1.0 stabilizes the contract.
7
+
8
+ ## [0.2.0] — 2026-06-26
9
+
10
+ ### Packaging
11
+
12
+ - **On PyPI.** `uv tool install khiip` / `pipx install khiip` install the daemon,
13
+ CLI, and MCP server directly — no clone required.
14
+ - **Open-core packaging.** The free AGPL core ships as `khiip`; the paid Khiip Plus
15
+ tier ships as a separate distribution. The published `khiip` sdist + wheel are
16
+ built from an allowlist and carry the core package only.
17
+ - Release automation: tag-driven build → carve guard → PyPI Trusted Publishing
18
+ (OIDC, no API token in the repo). CI now runs the hermetic test suite on every PR.
19
+
20
+ ## [0.1.8] — 2026-06-14
21
+
22
+ Captures can now be **re-derived offline from preserved bytes** — improve an
23
+ extractor or a renderer and every old capture upgrades, without re-fetching, and
24
+ even after the source post is deleted. This is the operational core of
25
+ capture-and-keep: the bytes are the durable asset; the typed payload and every
26
+ rendering are recomputable views over them. This release also deepens extraction
27
+ across every in-band source and gives each source a platform-faithful card in
28
+ Obsidian.
29
+
30
+ ### Added — substrate
31
+
32
+ - **`refetch --re-extract` (offline L1→L2 replay).** Re-derives the typed
33
+ payload from the **preserved source bytes** with the *current* extractor — no
34
+ network — recovering fields an older extractor missed and surviving source
35
+ deletion. **In place**: same capture id / vault path / `valid_from` (re-parsing
36
+ unchanged bytes is a derived-cache refresh, not a new observation — only the
37
+ network `extraction` dimension supersedes). Re-renders the body and re-embeds
38
+ for recall. New optional `ByteReExtractor` protocol method `extract_from_bytes`
39
+ implemented by all six extractors (web / reddit / x / wiki / youtube / pdf);
40
+ daemon dimension `re-extract`, CLI `--re-extract`, MCP `dimension="re-extract"`.
41
+ Returns a clear 400 when no bytes were preserved (use `extraction`). This
42
+ is the replayability property of the preserved Source-tier bytes.
43
+ - **`refetch --re-render` (offline L2→md replay).** Re-renders the markdown body
44
+ from the existing typed payload — apply an improved renderer or skin to a
45
+ capture (or the whole corpus) with no network, no re-extract, no re-embed. In
46
+ place. Daemon dimension `re-render`, CLI `--re-render`, MCP. Also a tight
47
+ developer loop for iterating renderers without re-fetching.
48
+ - **Richer extraction (Tier-1 completeness lifts).** Fields already fetched but
49
+ never lifted into the typed payload now land in L2 (so MCP/REST agents, recall,
50
+ and renders can use them): YouTube `tags` / `categories` / `video_id` /
51
+ `channel_id` / `channel_url`; Web article `author` (byline); Wikipedia
52
+ `last_modified` / `last_rev_id`. Wikipedia also now fetches and stores
53
+ `categories` (`prop=categories`) at capture. Renderers surface these where
54
+ platform-faithful (YouTube channel link, Web byline, Wikipedia categories +
55
+ "last edited" footer). Existing captures gain them via `--re-extract`.
56
+ - **Tier-2 deeper extraction.** More of each page is now typed into L2:
57
+ - **X** — author verification (`is_verified` / `verified_type` =
58
+ individual/business/government), typed polls (`Poll` / `PollChoice`),
59
+ and note-tweet inline bold; closed the vxtwitter media `alt_text` gap.
60
+ - **Reddit** — superscript + spoiler richtext preserved (`^(…)` / `>!…!<`)
61
+ instead of being flattened by the markdown converter.
62
+ - **Web** — `date_modified` / `date_published` / `publisher` / `breadcrumbs`
63
+ (JSON-LD with OG fallback), plus per-figure **caption + photo credit** (the
64
+ `<figure>`/`<figcaption>` standard + common credit patterns).
65
+ - **Web inline media recovery.** Inline article **images** and embedded **video**
66
+ that the markdown converter previously dropped are now preserved: trafilatura's
67
+ inline images render in document position with their caption/credit, and video
68
+ embeds (YouTube / Vimeo / Dailymotion / HTML5 `<video>`) are kept as a typed
69
+ `Media` entry plus a body link. Every figure is lifted into `WebPayload.media`
70
+ so agents/recall see the full image set. New `WebPayload.site_icon_url` (the
71
+ site favicon). All retroactive via `--re-extract`.
72
+ - **YouTube transcript export.** `GET /api/v1/captures/{id}?format=srt|vtt`
73
+ returns the stored transcript as SubRip or WebVTT — pure local conversion over
74
+ existing cues, no network; 404 for non-YouTube captures / missing transcripts.
75
+
76
+ ### Added — surfaces / renderers
77
+
78
+ - **Per-source platform-faithful cards.** X, Reddit, YouTube, Wikipedia, and Web
79
+ captures each render in a per-source coloured card with platform-appropriate
80
+ metadata (X status footer + engagement glyphs + polls; Reddit flair + vote
81
+ pills + comment tree; YouTube channel/stats). The typed payload stays clean —
82
+ the styling is a *derived* Obsidian view (the plugin ships the CSS); in any
83
+ other markdown viewer the card degrades to standard markdown + content.
84
+ - **Web news-masthead.** Web captures now read like a news-article header: a
85
+ `Web · <Type>` banner (News Article / Blog / Review / How-To / Recipe, from
86
+ schema.org), a brand band carrying the site **favicon + name + section**, the
87
+   headline, and a `By <author> · published · updated` byline; the description
88
+ becomes the standfirst dek. Tags render as individual hashtags.
89
+
90
+ ### Fixed
91
+
92
+ - **Superseded marker preserved on in-place rewrites.** Re-extract / re-render /
93
+ media / wayback refreshes no longer strip the `superseded_by` banner +
94
+ frontmatter from an older note (it's display-only state; the DB was always
95
+ correct).
96
+ - **Web inline links no longer scrambled.** The article body is now serialized
97
+ via trafilatura HTML → markdownify, fixing displaced/duplicated link text that
98
+ trafilatura's own markdown serializer produced on link-dense articles.
99
+ - **Reddit admin/mod badges detected.** Distinguished status is read from the
100
+ author anchor (`<a class="author admin">`) rather than the comment container,
101
+ so admin/moderator comments are now flagged (previously none were). Backfills
102
+ via `--re-extract`.
103
+ - **Comma-joined tags split correctly.** A `keywords` / `article:tag` value
104
+ emitted as a single `"a,b,c"` string (e.g. The Guardian) now splits into
105
+ separate hashtags instead of collapsing into one long underscore chain.
106
+
107
+ ## [0.1.7] — 2026-06-10
108
+
109
+ Captures now carry a per-source identity banner so each source is recognisable
110
+ at a glance in Obsidian.
111
+
112
+ ### Added — substrate
113
+
114
+ - **Per-source identity banner.** Each capture's rendered body opens with a
115
+ `> [!khiip-<source>]` callout carrying a label + identifier (e.g.
116
+ `𝕏 · @jack`, `Reddit · r/selfhosted`, `YouTube · 3Blue1Brown`). It's plain
117
+ Markdown — in any viewer it degrades to an ordinary titled callout — so the
118
+ substrate stays destination-agnostic; the *visual* per-source colour + icon
119
+ is supplied by the consumer (the Obsidian plugin ships CSS targeting
120
+ `.callout[data-callout="khiip-<source>"]`). New `renderers/source_banner.py`
121
+ (`compose_source_banner`); prepended in the capture + supersede write paths.
122
+ This is the lightweight in-vault version of the deferred Phase 2.5+
123
+ source-styled renderer.
124
+
125
+ ## [0.1.6] — 2026-06-10
126
+
127
+ Refetch versioning is now fork-safe, and recall returns one hit per URL. A live
128
+ Obsidian-plugin smoke surfaced that repeatedly re-fetching the same URL (e.g. a
129
+ "refresh" button, or an MCP agent holding a stale capture id) forked the
130
+ supersession chain into multiple "current" captures for one URL.
131
+
132
+ ### Fixed — substrate
133
+
134
+ - **`refetch` (extraction dimension) now resolves to the chain head before
135
+ superseding.** Previously it marked whatever `capture_id` it was handed as
136
+ superseded — so re-fetching an already-superseded version overwrote that
137
+ version's `superseded_by` pointer and orphaned the in-between versions,
138
+ leaving several un-superseded "current" captures for a single URL. It now
139
+ looks up the live un-superseded capture for the URL (`find_capture_by_url_hash`)
140
+ and supersedes that, so refetch always extends a single linear chain
141
+ regardless of which version it was invoked from (append-only supersession).
142
+
143
+ ### Changed — substrate
144
+
145
+ - **Recall returns the current head of each supersession chain only** — one hit
146
+ per URL, not every refetched version. Superseded captures are pre-filtered out
147
+ of the ranking candidates (`storage_captures.superseded_capture_ids`) so they
148
+ don't consume top-k slots. Older versions remain on disk + in the DB
149
+ (retrievable by id); they're just out of the default recall surface.
150
+ - **Superseded notes are self-describing in the vault.** When a refetch
151
+ supersedes a capture, the old note is stamped with a `superseded_by`
152
+   frontmatter field + an `[!info] Superseded` banner (reusing the
153
+   visual-indicator callout layer), so an older version reads as old when opened in
154
+   Obsidian. The current head note carries neither. Previously only the DB pointer
155
+   recorded the supersession, which was invisible when reading the `.md`.
156
+
157
+ ## [0.1.5] — 2026-06-10
158
+
159
+ Wayback archiving is now opt-in and quiet. A real-world end-to-end smoke of the
160
+ non-Reddit extractors found the anonymous Wayback Machine submission — which ran
161
+ on every capture — was both unreliable and slow.
162
+
163
+ ### Changed — substrate
164
+
165
+ - **Wayback URL-archiving is now OPT-IN (default off).** archive.org's
166
+ anonymous Save-Page-Now is rate-limited by design (fast HTTP 429s) and
167
+ frequently hangs on the target-crawl for minutes, so unauthenticated
168
+ archiving cannot be relied on. It is no longer presented as a default capture
169
+ feature (don't-over-promise). Enable best-effort anonymous archiving with
170
+ `[archive] wayback_enabled = true` in `config.toml`. Reliable archiving needs
171
+ an authenticated archive.org account; a bring-your-own-credentials (SPN2 /
172
+ S3-key) reliable tier is planned (see roadmap).
173
+ - **Wayback failures are now quiet.** A failed best-effort submission records
174
+ `archive_urls: {wayback: null}` in frontmatter but no
175
+ longer writes a `[!warning] Wayback archive failed` callout into the capture's
176
+ vault body — a tombstone over-stated an optional, known-unreliable witness.
177
+ - **Wayback inline timeout lowered 30s → 15s.** When enabled, a hung anonymous
178
+ submission could block the user-facing capture for the full 30s (a live probe
179
+ measured a successful submission at ~9s and routine multi-minute hangs). 15s
180
+ keeps genuine successes while halving the worst-case wait.
181
+ - **`/health` is honest about Wayback.** The wayback row is absent by default
182
+ (opt-in), and its docstring now states it is a *reachability-only* probe —
183
+ a green row means archive.org is reachable, **not** that archiving will
184
+ succeed (anonymous SPN is rate-limited/best-effort).
185
+
186
+ ## [0.1.4] — 2026-06-09
187
+
188
+ Reddit HTML-channel fidelity: the two gaps v0.1.3 documented are now closed,
189
+ both credential-free.
190
+
191
+ ### Changed — substrate
192
+
193
+ - **Deep comment trees expand credential-free.** The old.reddit HTML channel
194
+ previously stopped at "load more comments" links and marked the thread
195
+ `comments_truncated`. It now follows them via the same `/api/morechildren`
196
+ endpoint the page's own JS uses (no credentials needed), re-nesting the flat
197
+ response under each comment's true parent — old.reddit flattens deep
198
+ continuations to the bottom of the page, so structural position can't be
199
+ trusted; the real parent comes back in the API response. Expansion is bounded
200
+ by a paced request budget (`max_more_requests`, default 24) to stay under
201
+ Reddit's rate-sensitive WAF; when the budget is spent (or a fetch fails, or a
202
+ link isn't parseable) the residual `more` markers keep `comments_truncated`
203
+ honestly `True`. Best-effort augmentation: any expansion failure leaves the
204
+ base capture intact.
205
+ - **Galleries resolve to distinct full-resolution images.** Gallery media is
206
+ now read from old.reddit's authoritative `.gallery-tile[data-media-id]` list
207
+ (ordered, deduplicated) and mapped to full-res `i.redd.it/<id>.<ext>` URLs.
208
+ Previously the post tile's `preview.redd.it` thumbnails were scraped and
209
+ keyed by full URL, so the same image at several widths produced duplicate
210
+ gallery items pointing at thumbnails. Falls back to a preview-URL scrape
211
+ (deduped by media id) when the structured tiles are absent.
212
+
213
+ In both cases, OAuth (when a Reddit app is configured) remains the
214
+ fidelity/rate upgrade: 60 req/min headroom for very large threads beyond the
215
+ HTML budget, plus gallery dimensions/captions the HTML tiles don't expose.
216
+
217
+ ## [0.1.3] — 2026-06-09
218
+
219
+ Credential-free Reddit capture. Reddit previously required every user to
220
+ register their own OAuth app (`client_id`/`client_secret`) before any Reddit
221
+ URL could be captured — a real adoption friction and the only source that
222
+ reported `degraded` on a fresh install. Reddit's anonymous `.json` API is
223
+ WAF-blocked, but the **old.reddit.com HTML** site still serves live threads +
224
+ full comment trees to a browser-shaped request. v0.1.3 adds that as the
225
+ **default** channel, so Reddit works out of the box; OAuth becomes an optional
226
+ higher-fidelity / higher-rate-limit upgrade.
227
+
228
+ ### Changed — substrate
229
+
230
+ - **`RedditExtractor` is now credential-free by default (shape A).** `extract()`
231
+ runs a fallback chain: when a Reddit app is configured, the OAuth-JSON channel
232
+ is tried first (cleaner comment pagination + 60 req/min headroom) and the
233
+ old.reddit-HTML channel backstops it; with no app, capture still works via
234
+ HTML. Both channels emit the same two-Listing shape, so all typed assembly
235
+ (`post_type` / media / `removed_status` / `is_op` / crosspost / comment
236
+ recursion) is shared verbatim. A bad/expired OAuth app now degrades gracefully
237
+ to HTML instead of failing the capture.
238
+ - **`/health` for Reddit is green credential-free.** A missing Reddit app is no
239
+ longer `degraded` — only an unreachable old.reddit is. `fallback_count`
240
+ reflects the wired channels (2 with creds, 1 without).
241
+
242
+ ### Added — substrate
243
+
244
+ - **Old.reddit HTML channel** — forces the `old.reddit.com` host + `?limit=500`
245
+ for the deep comment tree, sends the `over18` cookie to clear the NSFW
246
+ interstitial, and paces + retries on the rate-sensitive WAF's transient 403s.
247
+ Raw HTML is preserved to the Source-tier (`.html.gz`). Parses
248
+ post fields, self-text (rendered → markdown), media URLs, the recursive
249
+ comment tree, `removed`/`deleted` status, and truncation. *Known fidelity
250
+ gaps vs the OAuth channel: galleries are best-effort; very deep trees still
251
+ truncate (`comments_truncated`). old.reddit is self-host-only by design — a
252
+ datacenter IP would hit the WAF and `robots.txt`.*
253
+ - **`RedditPayload.extractor_source`** gains `"reddit-html"` so the typed
254
+ payload self-describes which channel produced it.
255
+
256
+ ### CLI
257
+
258
+ - **`khiipd capture` / `recall` / `refetch` no longer leak `httpx` tracebacks**
259
+ on a slow/timed-out request — they print a clean message (capture/refetch
260
+ note the daemon may still be finishing server-side) and exit non-zero. Default
261
+ `capture` / `refetch` timeouts raised to 120s so slow sources (YouTube,
262
+ Wikipedia) don't trip the old 30s/60s limits on the happy path.
263
+
264
+ ## [0.1.2] — 2026-06-08
265
+
266
+ Dedicated Wikipedia extractor. Wikipedia URLs were previously captured by the
267
+ generic `WebExtractor` (emitting a `WebPayload`); the `WikiPayload` type +
268
+ renderer + embed-composition shipped at v0.1 but no extractor produced one.
269
+ `WikiExtractor` completes that scaffolding by talking to the MediaWiki API
270
+ directly — the one generic-web source with a first-class, versioned API.
271
+
272
+ ### Added — substrate
273
+
274
+ - **`khiip.extractors.wiki.WikiExtractor`** — `*.wikipedia.org` `/wiki/<Title>`
275
+ article URLs (language subdomains + `.m.` mobile variants). Registered before
276
+ `WebExtractor` in the default registry so Wikipedia URLs are claimed here
277
+ rather than by the http(s) catch-all. Emits a typed `WikiPayload`
278
+ (`source="wiki"`).
279
+ - **Two-source fallback chain** (the resilience pattern shared across all
280
+ extractors):
281
+   - **MediaWiki action API** (primary) —
282
+     `action=query&prop=extracts|pageimages|info&explaintext=1&exsectionformat=raw` for a clean plain-text
283
+ body parsed into structured `sections` (no fragile HTML scraping for the
284
+ load-bearing fields) + page image + canonical URL; `action=parse&prop=text`
285
+ rendered HTML parsed **best-effort** for `references` + `infobox` (the two
286
+ fields with no clean JSON surface — a parse miss degrades only those).
287
+ - **REST v1 summary** (fallback) — `/api/rest_v1/page/summary/{title}` for a
288
+ thinner lead-only payload via an orthogonal endpoint family.
289
+ - **Source-tier raw-bytes preservation** — the structured query
290
+ JSON is gzipped + retained for future re-derivation.
291
+ - **`health_check()`** probing the REST summary endpoint for the en "Wikipedia"
292
+ article (5s short-timeout), surfaced on `/health` like the other extractors.
293
+ - **CC BY-SA 4.0 attribution** populated on every Wikipedia capture
294
+ (`contributors_attribution`), per Wikipedia's reuse license.
295
+
296
+ ### Notes
297
+
298
+ - Foundation-vs-on-top preserved: new module + registration only; the v0.1.1
299
+ substrate (`WikiPayload`, `WikiMarkdownRenderer`, `_compose_wiki`,
300
+ `SourceName`) was already in place and is unchanged. +49 tests.
301
+
302
+ ## [0.1.1] — 2026-05-29
303
+
304
+ MCP server. Six tools (`capture_url`, `recall`, `list_captures`,
305
+ `get_capture`, `refetch_capture`, `daemon_status`) over stdio for Claude
306
+ Desktop / Cursor / any MCP-aware client. Thin HTTP proxy architecture:
307
+ the MCP server connects to a running Khiip
308
+ daemon at `127.0.0.1:8478`, reads the Bearer auth token from
309
+ `~/.config/khiip/auth.toml`, and translates MCP tool calls to REST.
310
+ Daemon must be running before tool calls land.
311
+
312
+ ### Added — substrate
313
+
314
+ - **`khiip.mcp` subpackage** — `DaemonClient` (httpx-based REST proxy),
315
+ `build_server` (FastMCP-based server factory with six tools registered),
316
+ `__main__` (entry point for the `khiip-mcp-server` console script).
317
+ - **`khiip-mcp-server` console script** wired in `pyproject.toml`. Used by
318
+ Claude Desktop / Cursor MCP configs to launch the stdio server.
319
+ - **`mcp>=1.27,<2` dependency** added to `dependencies` (bundled with the
320
+ daemon — no optional `[mcp]` extra). Aligns with the launch-essay +
321
+ Show HN claim that v0.1 ships REST + MCP together.
322
+ - **Six MCP tools** mapping 1:1 to existing REST endpoints:
323
+ - `capture_url(url, force_new=False)` → `POST /api/v1/captures`
324
+ - `recall(query, limit=10)` → `GET /api/v1/recall`
325
+ - `list_captures(source=None, limit=50, offset=0)` → `GET /api/v1/captures`
326
+ - `get_capture(capture_id, format="json")` → `GET /api/v1/captures/{id}`
327
+ - `refetch_capture(capture_id, dimension="extraction")` → `POST /api/v1/captures/{id}/refetch`
328
+ - `daemon_status()` → `GET /health` + `GET /api/v1/meta` combined
329
+ - **Structured error responses** — daemon-unreachable + HTTP-error
330
+ translation into LLM-readable `{"error": ..., "detail": ...}` dicts so
331
+ agents get actionable failure messages instead of opaque MCP errors.
332
+ - **`KHIIP_DAEMON_URL` env var override** for pointing the MCP server at a
333
+ non-default daemon (useful for sandboxes / dev iterations).
334
+
335
+ ### Added — tests
336
+
337
+ - `tests/test_mcp.py` (21 tests; +21 to v0.1.0's 747 baseline = 768 green
338
+ at v0.1.1). Coverage:
339
+ - `DaemonClient` unit tests against `httpx.MockTransport` (auth headers,
340
+ query-param threading, error-translation, env-var override)
341
+ - FastMCP tool tests with an injected fake client (argument threading,
342
+ return-shape envelopes, structured-error translation)
343
+ - Tool-surface lock: asserts exactly 6 tools registered at v0.1.1
344
+ - End-to-end smoke verified by launching `khiip-mcp-server` via the
345
+ official `mcp` Python SDK client; `list_tools` returns all six.
346
+
347
+ ### Architectural notes
348
+
349
+ - **REST stays canonical at v0.1.1.** MCP is a transport translator over
350
+ the REST surface. Service-tier refactor (extracting orchestration into
351
+ `khiip.services.*` functions both REST and MCP would call directly) is
352
+ deferred to a forcing function — a hosted/productized surface or
353
+ a third transport. The migration is low-cost: tool signatures, Claude
354
+ Desktop config, and behavior stay stable across the eventual refactor.
355
+ - **Foundation-vs-on-top discipline:** all v0.1.1 changes live in the new
356
+ `khiip.mcp` subpackage. No existing module (extractors, renderers,
357
+ daemon routes, storage, embeddings, wayback) was modified. The v0.1.0
358
+ test suite continues to pass unchanged (747/747).
359
+
360
+ ### Stale-context cleanup
361
+
362
+ - **README** — TRUST.md / "8 immutable promises" / "future MCP server"
363
+ references stripped; License section reframed from "Promise 1" to
364
+ LICENSE-grounded framing; status checklist updated for v0.1.1.
365
+
366
+ ## [0.1.0] — 2026-05-25
367
+
368
+ First substantive substrate release. Six v0.1 sources (X, Reddit, Web,
369
+ Wikipedia, YouTube, PDF) emit typed Pydantic payloads end-to-end; full
370
+ Renderer Protocol+Registry tier; visual indicator system; Source-tier
371
+ raw-bytes preservation; typed-payload recall composition; vault↔SQLite
372
+ validator; granular refetch CLI; opt-in video preservation.
373
+
374
+ ### Added — substrate
375
+
376
+ - **Typed-payload pipeline** — all 6 v0.1 sources emit
377
+ Pydantic-typed payloads at the extractor boundary: `TweetPayload`,
378
+ `RedditPayload`, `WikiPayload`, `WebPayload`, `PDFPayload`,
379
+ `YouTubePayload`, plus cross-platform primitives (`EngagementCounts`,
380
+ `UrlEntity`, `Media`, `CommentNode`) and X-specific `XArticle` /
381
+ `XArticleBlock`. Discriminated union over `kind`.
382
+ - **Renderer Protocol + Registry** — 6 per-source
383
+ `MarkdownRenderer` impls + `LegacyMarkdownPassthroughRenderer` +
384
+   `JSONRenderer` + `VaultFrontmatterRenderer` dispatch
385
+   first-supporting-wins through `MarkdownRendererRegistry`.
386
+ - **Source-tier raw-bytes preservation** (URI-shaped) — original
387
+ upstream bytes gzipped + written under
388
+   user-configurable `data_root` (default
389
+   `~/.local/share/khiip/sources/<source>/<capture_id>.<ext>.gz`); URI-shaped `source_artifact_path`
390
+ generalizes to cloud (`s3://`) / Notion (`notion://`) at v0.5+.
391
+ - **MediaFetcherRegistry** — capability-based dispatch
392
+ across three fetchers: `HttpxFetcher` (photo CDN), `YtDlpFetcher`
393
+ (video via opt-in toggle; HLS/DASH/`v.redd.it`/YouTube),
394
+ `GalleryDlFetcher` (wide-coverage fallback).
395
+ - **Wayback Machine archive submission** (`archive_urls`) —
396
+ best-effort URL archival on every capture; opt-out via
397
+ `[archive] wayback_enabled = false`.
398
+ - **Visual indicator system** (three-layer model): layer 1
399
+ frontmatter-canonical, layer 2 renderer-composed Obsidian
400
+ callouts in vault body. Layer 3 plugin badges
401
+ deferred to v0.5+.
402
+ - **Embed-text composition** — per-source typed-payload
403
+ composition rules for the embedder; URL-fallback for failed-permanent
404
+ tombstones (capture-what-we-can).
405
+ - **Reddit OAuth2 channel** —
406
+ user registers their own Reddit app at https://www.reddit.com/prefs/apps;
407
+ client_id/secret via config.toml or env vars. Khiip-corporate never holds
408
+ upstream credentials.
409
+ - **YouTube fallback chain** —
410
+ yt-dlp → youtube-transcript-api + oEmbed → YouTube Data API v3 (operator
411
+ opt-in via API key); per-branch payload-shape fidelity.
412
+ - **Path-sandbox + defense-in-depth** at security boundaries: vault
413
+ `destination_path` validator, Source-tier
414
+ triple-guarded path validation (source/capture_id/ext rejection of
415
+ `..`/separators/NUL), media-fetcher subprocess argv-list invocation.
416
+
417
+ ### Added — surfaces
418
+
419
+ - **`khiipd serve`** — FastAPI daemon (default `127.0.0.1:8478`).
420
+ - **`khiipd capture <url>`** — capture a URL via the running daemon.
421
+ - **`khiipd recall <query>`** — semantic recall against the embedded
422
+ corpus.
423
+ - **`khiipd validate`** *(new in 0.1.0)* — vault ↔ SQLite consistency
424
+ checker. Four invariants: vault ↔ SQLite
425
+ bidirectional reconciliation, structured_payload Pydantic round-trip,
426
+ P-δ status family consistency, media local_path sandbox + existence.
427
+ Exit code 0 clean / 1 violations / 2 missing inputs. `--json` for
428
+ machine-readable output.
429
+ - **`khiipd refetch <id> [--media|--wayback]`** *(new in 0.1.0)* —
430
+ granular refetch. Three dimensions
431
+ independently re-attemptable: extraction (default; re-run extractor;
432
+ new capture; old gets `superseded_by`), media (re-walk
433
+ MediaFetcherRegistry in place), wayback (re-submit + update
434
+ `archive_urls`).
435
+ - **`khiipd auth show|rotate`** — manage the daemon API key.
436
+ - **REST `?format=` selector** on `GET /api/v1/captures/{id}` —
437
+ `capture` (default; full JSON envelope) / `json` (payload only via
438
+ JSONRenderer) / `markdown` (vault file) / `legacy-markdown` (body
439
+ only).
440
+ - **REST `POST /api/v1/captures/{id}/refetch?dimension=...`** *(new in
441
+ 0.1.0)* — the daemon-side endpoint the CLI wraps.
442
+ - **REST `?force_new=true`** on `POST /api/v1/captures` — bypass dedup;
443
+ used internally by refetch extraction dimension.
444
+
445
+ ### Added — configuration
446
+
447
+ - `[storage] data_root` — user-configurable Source-tier root (default
448
+ XDG `~/.local/share/khiip/`). Points anywhere on a filesystem:
449
+ iCloud Drive / Dropbox / external SSD / network mount for
450
+ cross-machine sync.
451
+ - `[media] download_videos` *(new in 0.1.0)* — opt-in default-off toggle
452
+ for video preservation. When `true`,
453
+ `YtDlpFetcher` registers in the default chain + `GalleryDlFetcher`
454
+ claims `media.type == "video"` (non-streaming-manifest URLs).
455
+ - `[media] fetch_enabled` / `[media] fetcher_disabled` — per-source +
456
+ per-fetcher toggles.
457
+ - `[archive] wayback_enabled` — Wayback opt-out (default true).
458
+ - `[extractors.reddit] client_id` / `client_secret` (or
459
+ `KHIIP_REDDIT_CLIENT_ID` / `KHIIP_REDDIT_CLIENT_SECRET` env vars).
460
+ - `[extractors.youtube] api_key` (or `KHIIP_YOUTUBE_API_KEY` env var).
461
+
462
+ ### Changed
463
+
464
+ - **Reddit channel** revised from anonymous `.json` to OAuth2 against
465
+ `oauth.reddit.com`.
466
+ - **Vault frontmatter is canonical** for the Payload tier; SQLite
467
+ `captures` + `embeddings` tables become derived caches rebuildable
468
+ from vault.
469
+ - **`content_sha256`** rebases to hash the embed-text composition
470
+ so cosmetic rendering changes don't trigger re-embedding.
471
+
472
+ ### Documented
473
+
474
+ - Foundational architecture decisions recorded: the graph schema, the
475
+ extractor Protocol, append-only edges, the Source/Payload/Render
476
+ three-tier model, and the P-δ failure-handling principle.
477
+ - Substrate-vs-surfaces architectural axis: Khiip = one canonical substrate +
478
+ N surfaces (Obsidian plugin / REST / MCP / Chrome ext / mobile /
479
+ Telegram bot / LLM-agent SDK consumers all speak the unified contract).
480
+
481
+ ### Notes
482
+
483
+ - This release lays the substrate baseline. Plugin UI rendering of the
484
+ new typed-payload + visual indicator + refetch surfaces lands at v0.5+
485
+ premium UX scope.
486
+ - Pre-public-launch posture statement for SD-4 video-binary capability is
487
+ separately tracked (1-2 hr authoring; not blocking this tag).
488
+
489
+ [0.2.0]: https://github.com/KhiipAI/khiip/releases/tag/v0.2.0
khiip-0.2.0/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,31 @@
1
+ # Code of Conduct
2
+
3
+ This project adopts the **[Contributor Covenant 2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct/)** as our Code of Conduct.
4
+
5
+ The full text is available at the link above. In short: we expect all contributors and community members to engage respectfully, constructively, and in good faith — and to extend the same to others regardless of background, experience level, or identity.
6
+
7
+ ## Enforcement
8
+
9
+ Instances of unacceptable behavior may be reported to the project maintainers at:
10
+
11
+ **Email:** [hello@khiip.com](mailto:hello@khiip.com)
12
+
13
+ All reports will be reviewed and investigated promptly and fairly. The project team is obligated to respect the privacy and security of the reporter of any incident.
14
+
15
+ ## Scope
16
+
17
+ This Code of Conduct applies within all community spaces — including but not limited to:
18
+
19
+ - The Khiip GitHub repository (issues, PRs, discussions, code comments)
20
+ - Official Khiip communication channels (email, Discord when launched, etc.)
21
+ - Public spaces in which an individual is officially representing the community
22
+
23
+ ## Attribution
24
+
25
+ This Code of Conduct adopts the Contributor Covenant, version 2.1, available at https://www.contributor-covenant.org/version/2/1/code_of_conduct/
26
+
27
+ For answers to common questions about this code of conduct, see the FAQ at https://www.contributor-covenant.org/faq
28
+
29
+ ---
30
+
31
+ *Khiip Code of Conduct v1.0; last updated 2026-05-17. Adopts Contributor Covenant 2.1 by reference.*