khiip 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khiip-0.2.0/.gitignore +39 -0
- khiip-0.2.0/CHANGELOG.md +489 -0
- khiip-0.2.0/CODE_OF_CONDUCT.md +31 -0
- khiip-0.2.0/CONTRIBUTING.md +140 -0
- khiip-0.2.0/DISCLAIMER.md +98 -0
- khiip-0.2.0/LICENSE +662 -0
- khiip-0.2.0/PKG-INFO +360 -0
- khiip-0.2.0/README.md +308 -0
- khiip-0.2.0/SECURITY.md +83 -0
- khiip-0.2.0/pyproject.toml +146 -0
- khiip-0.2.0/src/khiip/__init__.py +0 -0
- khiip-0.2.0/src/khiip/auth.py +76 -0
- khiip-0.2.0/src/khiip/cli.py +436 -0
- khiip-0.2.0/src/khiip/config.py +478 -0
- khiip-0.2.0/src/khiip/daemon.py +1443 -0
- khiip-0.2.0/src/khiip/embeddings/__init__.py +12 -0
- khiip-0.2.0/src/khiip/embeddings/base.py +51 -0
- khiip-0.2.0/src/khiip/embeddings/compose.py +391 -0
- khiip-0.2.0/src/khiip/embeddings/minilm.py +61 -0
- khiip-0.2.0/src/khiip/extractors/__init__.py +31 -0
- khiip-0.2.0/src/khiip/extractors/base.py +196 -0
- khiip-0.2.0/src/khiip/extractors/media_fetchers.py +781 -0
- khiip-0.2.0/src/khiip/extractors/pdf.py +563 -0
- khiip-0.2.0/src/khiip/extractors/reddit.py +714 -0
- khiip-0.2.0/src/khiip/extractors/reddit_common.py +783 -0
- khiip-0.2.0/src/khiip/extractors/resilience.py +129 -0
- khiip-0.2.0/src/khiip/extractors/resilience_media.py +400 -0
- khiip-0.2.0/src/khiip/extractors/web.py +332 -0
- khiip-0.2.0/src/khiip/extractors/web_common.py +872 -0
- khiip-0.2.0/src/khiip/extractors/wiki.py +740 -0
- khiip-0.2.0/src/khiip/extractors/x.py +571 -0
- khiip-0.2.0/src/khiip/extractors/x_common.py +355 -0
- khiip-0.2.0/src/khiip/extractors/youtube.py +485 -0
- khiip-0.2.0/src/khiip/extractors/youtube_common.py +590 -0
- khiip-0.2.0/src/khiip/mcp/__init__.py +27 -0
- khiip-0.2.0/src/khiip/mcp/__main__.py +39 -0
- khiip-0.2.0/src/khiip/mcp/client.py +170 -0
- khiip-0.2.0/src/khiip/mcp/server.py +235 -0
- khiip-0.2.0/src/khiip/models.py +806 -0
- khiip-0.2.0/src/khiip/py.typed +0 -0
- khiip-0.2.0/src/khiip/renderers/__init__.py +143 -0
- khiip-0.2.0/src/khiip/renderers/base.py +246 -0
- khiip-0.2.0/src/khiip/renderers/json_renderer.py +68 -0
- khiip-0.2.0/src/khiip/renderers/legacy_passthrough.py +85 -0
- khiip-0.2.0/src/khiip/renderers/plain.py +560 -0
- khiip-0.2.0/src/khiip/renderers/registry.py +106 -0
- khiip-0.2.0/src/khiip/renderers/source_banner.py +145 -0
- khiip-0.2.0/src/khiip/renderers/status_callouts.py +106 -0
- khiip-0.2.0/src/khiip/renderers/transcript.py +79 -0
- khiip-0.2.0/src/khiip/renderers/vault_frontmatter.py +118 -0
- khiip-0.2.0/src/khiip/storage/__init__.py +0 -0
- khiip-0.2.0/src/khiip/storage/captures.py +356 -0
- khiip-0.2.0/src/khiip/storage/db.py +165 -0
- khiip-0.2.0/src/khiip/storage/embeddings_store.py +151 -0
- khiip-0.2.0/src/khiip/storage/filesystem.py +242 -0
- khiip-0.2.0/src/khiip/storage/schema.sql +214 -0
- khiip-0.2.0/src/khiip/storage/source_tier.py +183 -0
- khiip-0.2.0/src/khiip/validate.py +436 -0
- khiip-0.2.0/src/khiip/version.py +3 -0
- khiip-0.2.0/src/khiip/wayback.py +225 -0
- khiip-0.2.0/tests/__init__.py +0 -0
- khiip-0.2.0/tests/conftest.py +149 -0
- khiip-0.2.0/tests/test_carve_boundary.py +167 -0
- khiip-0.2.0/tests/test_cli.py +146 -0
- khiip-0.2.0/tests/test_config.py +321 -0
- khiip-0.2.0/tests/test_daemon_smoke.py +2011 -0
- khiip-0.2.0/tests/test_embeddings_compose.py +341 -0
- khiip-0.2.0/tests/test_embeddings_store.py +166 -0
- khiip-0.2.0/tests/test_env_isolation.py +129 -0
- khiip-0.2.0/tests/test_extractors.py +4676 -0
- khiip-0.2.0/tests/test_mcp.py +442 -0
- khiip-0.2.0/tests/test_media_fetchers.py +1405 -0
- khiip-0.2.0/tests/test_renderers.py +930 -0
- khiip-0.2.0/tests/test_resilience_media.py +480 -0
- khiip-0.2.0/tests/test_storage.py +662 -0
- khiip-0.2.0/tests/test_validate.py +276 -0
- khiip-0.2.0/tests/test_wayback.py +260 -0
khiip-0.2.0/.gitignore
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Secrets / credentials
|
|
2
|
+
.env
|
|
3
|
+
.env.*
|
|
4
|
+
*.key
|
|
5
|
+
*.pem
|
|
6
|
+
.aws/
|
|
7
|
+
.config/khiip/
|
|
8
|
+
|
|
9
|
+
# OS / editor
|
|
10
|
+
.DS_Store
|
|
11
|
+
.vscode/
|
|
12
|
+
.idea/
|
|
13
|
+
*.swp
|
|
14
|
+
|
|
15
|
+
# Python
|
|
16
|
+
__pycache__/
|
|
17
|
+
*.pyc
|
|
18
|
+
*.egg-info/
|
|
19
|
+
.venv/
|
|
20
|
+
venv/
|
|
21
|
+
uv.lock
|
|
22
|
+
dist/
|
|
23
|
+
build/
|
|
24
|
+
.pytest_cache/
|
|
25
|
+
.ruff_cache/
|
|
26
|
+
.python-version
|
|
27
|
+
|
|
28
|
+
# JS (plugin scaffolding when added)
|
|
29
|
+
node_modules/
|
|
30
|
+
|
|
31
|
+
# Runtime data (daemon writes locally)
|
|
32
|
+
*.sqlite
|
|
33
|
+
*.db
|
|
34
|
+
*.log
|
|
35
|
+
|
|
36
|
+
# Local-only working files — never committed to the public repo.
|
|
37
|
+
/CLAUDE.md
|
|
38
|
+
/_internal/
|
|
39
|
+
/khiip-internal/
|
khiip-0.2.0/CHANGELOG.md
ADDED
|
@@ -0,0 +1,489 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to **Khiip** are documented in this file. Format adapted
|
|
4
|
+
from [Keep a Changelog](https://keepachangelog.com/en/1.1.0/); versioning per
|
|
5
|
+
[Semantic Versioning](https://semver.org/spec/v2.0.0.html). v0.x is pre-1.0;
|
|
6
|
+
substrate decisions can still change until v1.0 stabilizes the contract.
|
|
7
|
+
|
|
8
|
+
## [0.2.0] — 2026-06-26
|
|
9
|
+
|
|
10
|
+
### Packaging
|
|
11
|
+
|
|
12
|
+
- **On PyPI.** `uv tool install khiip` / `pipx install khiip` install the daemon,
|
|
13
|
+
CLI, and MCP server directly — no clone required.
|
|
14
|
+
- **Open-core packaging.** The free AGPL core ships as `khiip`; the paid Khiip Plus
|
|
15
|
+
tier ships as a separate distribution. The published `khiip` sdist + wheel are
|
|
16
|
+
built from an allowlist and carry the core package only.
|
|
17
|
+
- Release automation: tag-driven build → carve guard → PyPI Trusted Publishing
|
|
18
|
+
(OIDC, no API token in the repo). CI now runs the hermetic test suite on every PR.
|
|
19
|
+
|
|
20
|
+
## [0.1.8] — 2026-06-14
|
|
21
|
+
|
|
22
|
+
Captures can now be **re-derived offline from preserved bytes** — improve an
|
|
23
|
+
extractor or a renderer and every old capture upgrades, without re-fetching, and
|
|
24
|
+
even after the source post is deleted. This is the operational core of
|
|
25
|
+
capture-and-keep: the bytes are the durable asset; the typed payload and every
|
|
26
|
+
rendering are recomputable views over them. This release also deepens extraction
|
|
27
|
+
across every in-band source and gives each source a platform-faithful card in
|
|
28
|
+
Obsidian.
|
|
29
|
+
|
|
30
|
+
### Added — substrate
|
|
31
|
+
|
|
32
|
+
- **`refetch --re-extract` (offline L1→L2 replay).** Re-derives the typed
|
|
33
|
+
payload from the **preserved source bytes** with the *current* extractor — no
|
|
34
|
+
network — recovering fields an older extractor missed and surviving source
|
|
35
|
+
deletion. **In place**: same capture id / vault path / `valid_from` (re-parsing
|
|
36
|
+
unchanged bytes is a derived-cache refresh, not a new observation — only the
|
|
37
|
+
network `extraction` dimension supersedes). Re-renders the body and re-embeds
|
|
38
|
+
for recall. New optional `ByteReExtractor` protocol method `extract_from_bytes`
|
|
39
|
+
implemented by all six extractors (web / reddit / x / wiki / youtube / pdf);
|
|
40
|
+
daemon dimension `re-extract`, CLI `--re-extract`, MCP `dimension="re-extract"`.
|
|
41
|
+
Returns a clear 400 when no bytes were preserved (use `extraction`). This
|
|
42
|
+
is the replayability property of the preserved Source-tier bytes.
|
|
43
|
+
- **`refetch --re-render` (offline L2→md replay).** Re-renders the markdown body
|
|
44
|
+
from the existing typed payload — apply an improved renderer or skin to a
|
|
45
|
+
capture (or the whole corpus) with no network, no re-extract, no re-embed. In
|
|
46
|
+
place. Daemon dimension `re-render`, CLI `--re-render`, MCP. Also a tight
|
|
47
|
+
developer loop for iterating renderers without re-fetching.
|
|
48
|
+
- **Richer extraction (Tier-1 completeness lifts).** Fields already fetched but
|
|
49
|
+
never lifted into the typed payload now land in L2 (so MCP/REST agents, recall,
|
|
50
|
+
and renders can use them): YouTube `tags` / `categories` / `video_id` /
|
|
51
|
+
`channel_id` / `channel_url`; Web article `author` (byline); Wikipedia
|
|
52
|
+
`last_modified` / `last_rev_id`. Wikipedia also now fetches and stores
|
|
53
|
+
`categories` (`prop=categories`) at capture. Renderers surface these where
|
|
54
|
+
platform-faithful (YouTube channel link, Web byline, Wikipedia categories +
|
|
55
|
+
"last edited" footer). Existing captures gain them via `--re-extract`.
|
|
56
|
+
- **Tier-2 deeper extraction.** More of each page is now typed into L2:
|
|
57
|
+
- **X** — author verification (`is_verified` / `verified_type` =
|
|
58
|
+
individual/business/government), typed polls (`Poll` / `PollChoice`),
|
|
59
|
+
and note-tweet inline bold; closed the vxtwitter media `alt_text` gap.
|
|
60
|
+
- **Reddit** — superscript + spoiler richtext preserved (`^(…)` / `>!…!<`)
|
|
61
|
+
instead of being flattened by the markdown converter.
|
|
62
|
+
- **Web** — `date_modified` / `date_published` / `publisher` / `breadcrumbs`
|
|
63
|
+
(JSON-LD with OG fallback), plus per-figure **caption + photo credit** (the
|
|
64
|
+
`<figure>`/`<figcaption>` standard + common credit patterns).
|
|
65
|
+
- **Web inline media recovery.** Inline article **images** and embedded **video**
|
|
66
|
+
that the markdown converter previously dropped are now preserved: trafilatura's
|
|
67
|
+
inline images render in document position with their caption/credit, and video
|
|
68
|
+
embeds (YouTube / Vimeo / Dailymotion / HTML5 `<video>`) are kept as a typed
|
|
69
|
+
`Media` entry plus a body link. Every figure is lifted into `WebPayload.media`
|
|
70
|
+
so agents/recall see the full image set. New `WebPayload.site_icon_url` (the
|
|
71
|
+
site favicon). All retroactive via `--re-extract`.
|
|
72
|
+
- **YouTube transcript export.** `GET /api/v1/captures/{id}?format=srt|vtt`
|
|
73
|
+
returns the stored transcript as SubRip or WebVTT — pure local conversion over
|
|
74
|
+
existing cues, no network; 404 for non-YouTube captures / missing transcripts.
|
|
75
|
+
|
|
76
|
+
### Added — surfaces / renderers
|
|
77
|
+
|
|
78
|
+
- **Per-source platform-faithful cards.** X, Reddit, YouTube, Wikipedia, and Web
|
|
79
|
+
captures each render in a per-source coloured card with platform-appropriate
|
|
80
|
+
metadata (X status footer + engagement glyphs + polls; Reddit flair + vote
|
|
81
|
+
pills + comment tree; YouTube channel/stats). The typed payload stays clean —
|
|
82
|
+
the styling is a *derived* Obsidian view (the plugin ships the CSS); in any
|
|
83
|
+
other markdown viewer the card degrades to standard markdown + content.
|
|
84
|
+
- **Web news-masthead.** Web captures now read like a news-article header: a
|
|
85
|
+
`Web · <Type>` banner (News Article / Blog / Review / How-To / Recipe, from
|
|
86
|
+
schema.org), a brand band carrying the site **favicon + name + section**, the
|
|
87
|
+
headline, and a `By <author> · published · updated` byline; the description
|
|
88
|
+
becomes the standfirst dek. Tags render as individual hashtags.
|
|
89
|
+
|
|
90
|
+
### Fixed
|
|
91
|
+
|
|
92
|
+
- **Superseded marker preserved on in-place rewrites.** Re-extract / re-render /
|
|
93
|
+
media / wayback refreshes no longer strip the `superseded_by` banner +
|
|
94
|
+
frontmatter from an older note (it's display-only state; the DB was always
|
|
95
|
+
correct).
|
|
96
|
+
- **Web inline links no longer scrambled.** The article body is now serialized
|
|
97
|
+
via trafilatura HTML → markdownify, fixing displaced/duplicated link text that
|
|
98
|
+
trafilatura's own markdown serializer produced on link-dense articles.
|
|
99
|
+
- **Reddit admin/mod badges detected.** Distinguished status is read from the
|
|
100
|
+
author anchor (`<a class="author admin">`) rather than the comment container,
|
|
101
|
+
so admin/moderator comments are now flagged (previously none were). Backfills
|
|
102
|
+
via `--re-extract`.
|
|
103
|
+
- **Comma-joined tags split correctly.** A `keywords` / `article:tag` value
|
|
104
|
+
emitted as a single `"a,b,c"` string (e.g. The Guardian) now splits into
|
|
105
|
+
separate hashtags instead of collapsing into one long underscore chain.
|
|
106
|
+
|
|
107
|
+
## [0.1.7] — 2026-06-10
|
|
108
|
+
|
|
109
|
+
Captures now carry a per-source identity banner so each source is recognisable
|
|
110
|
+
at a glance in Obsidian.
|
|
111
|
+
|
|
112
|
+
### Added — substrate
|
|
113
|
+
|
|
114
|
+
- **Per-source identity banner.** Each capture's rendered body opens with a
|
|
115
|
+
`> [!khiip-<source>]` callout carrying a label + identifier (e.g.
|
|
116
|
+
`𝕏 · @jack`, `Reddit · r/selfhosted`, `YouTube · 3Blue1Brown`). It's plain
|
|
117
|
+
Markdown — in any viewer it degrades to an ordinary titled callout — so the
|
|
118
|
+
substrate stays destination-agnostic; the *visual* per-source colour + icon
|
|
119
|
+
is supplied by the consumer (the Obsidian plugin ships CSS targeting
|
|
120
|
+
`.callout[data-callout="khiip-<source>"]`). New `renderers/source_banner.py`
|
|
121
|
+
(`compose_source_banner`); prepended in the capture + supersede write paths.
|
|
122
|
+
This is the lightweight in-vault version of the deferred Phase 2.5+
|
|
123
|
+
source-styled renderer.
|
|
124
|
+
|
|
125
|
+
## [0.1.6] — 2026-06-10
|
|
126
|
+
|
|
127
|
+
Refetch versioning is now fork-safe, and recall returns one hit per URL. A live
|
|
128
|
+
Obsidian-plugin smoke surfaced that repeatedly re-fetching the same URL (e.g. a
|
|
129
|
+
"refresh" button, or an MCP agent holding a stale capture id) forked the
|
|
130
|
+
supersession chain into multiple "current" captures for one URL.
|
|
131
|
+
|
|
132
|
+
### Fixed — substrate
|
|
133
|
+
|
|
134
|
+
- **`refetch` (extraction dimension) now resolves to the chain head before
|
|
135
|
+
superseding.** Previously it marked whatever `capture_id` it was handed as
|
|
136
|
+
superseded — so re-fetching an already-superseded version overwrote that
|
|
137
|
+
version's `superseded_by` pointer and orphaned the in-between versions,
|
|
138
|
+
leaving several un-superseded "current" captures for a single URL. It now
|
|
139
|
+
looks up the live un-superseded capture for the URL (`find_capture_by_url_hash`)
|
|
140
|
+
and supersedes that, so refetch always extends a single linear chain
|
|
141
|
+
regardless of which version it was invoked from (append-only supersession).
|
|
142
|
+
|
|
143
|
+
### Changed — substrate
|
|
144
|
+
|
|
145
|
+
- **Recall returns the current head of each supersession chain only** — one hit
|
|
146
|
+
per URL, not every refetched version. Superseded captures are pre-filtered out
|
|
147
|
+
of the ranking candidates (`storage_captures.superseded_capture_ids`) so they
|
|
148
|
+
don't consume top-k slots. Older versions remain on disk + in the DB
|
|
149
|
+
(retrievable by id); they're just out of the default recall surface.
|
|
150
|
+
- **Superseded notes are self-describing in the vault.** When a refetch
|
|
151
|
+
supersedes a capture, the old note is stamped with a `superseded_by`
|
|
152
|
+
frontmatter field + an `[!info] Superseded` banner (reusing the visual-
|
|
153
|
+
indicator callout layer), so an older version reads as old when opened in
|
|
154
|
+
Obsidian. The current head note carries neither — only the DB pointer existed
|
|
155
|
+
before, which was invisible when reading the `.md`.
|
|
156
|
+
|
|
157
|
+
## [0.1.5] — 2026-06-10
|
|
158
|
+
|
|
159
|
+
Wayback archiving is now opt-in and quiet. A real-world end-to-end smoke of the
|
|
160
|
+
non-Reddit extractors found the anonymous Wayback Machine submission — which ran
|
|
161
|
+
on every capture — was both unreliable and slow.
|
|
162
|
+
|
|
163
|
+
### Changed — substrate
|
|
164
|
+
|
|
165
|
+
- **Wayback URL-archiving is now OPT-IN (default off).** archive.org's
|
|
166
|
+
anonymous Save-Page-Now is rate-limited by design (fast HTTP 429s) and
|
|
167
|
+
frequently hangs on the target-crawl for minutes, so unauthenticated
|
|
168
|
+
archiving cannot be relied on. It is no longer presented as a default capture
|
|
169
|
+
feature (don't-over-promise). Enable best-effort anonymous archiving with
|
|
170
|
+
`[archive] wayback_enabled = true` in `config.toml`. Reliable archiving needs
|
|
171
|
+
an authenticated archive.org account; a bring-your-own-credentials (SPN2 /
|
|
172
|
+
S3-key) reliable tier is planned (see roadmap).
|
|
173
|
+
- **Wayback failures are now quiet.** A failed best-effort submission records
|
|
174
|
+
`archive_urls: {wayback: null}` in frontmatter but no
|
|
175
|
+
longer writes a `[!warning] Wayback archive failed` callout into the capture's
|
|
176
|
+
vault body — a tombstone over-stated an optional, known-unreliable witness.
|
|
177
|
+
- **Wayback inline timeout lowered 30s → 15s.** When enabled, a hung anonymous
|
|
178
|
+
submission could block the user-facing capture for the full 30s (a live probe
|
|
179
|
+
measured a successful submission at ~9s and routine multi-minute hangs). 15s
|
|
180
|
+
keeps genuine successes while halving the worst-case wait.
|
|
181
|
+
- **`/health` is honest about Wayback.** The wayback row is absent by default
|
|
182
|
+
(opt-in), and its docstring now states it is a *reachability-only* probe —
|
|
183
|
+
a green row means archive.org is reachable, **not** that archiving will
|
|
184
|
+
succeed (anonymous SPN is rate-limited/best-effort).
|
|
185
|
+
|
|
186
|
+
## [0.1.4] — 2026-06-09
|
|
187
|
+
|
|
188
|
+
Reddit HTML-channel fidelity: the two gaps v0.1.3 documented are now closed,
|
|
189
|
+
both credential-free.
|
|
190
|
+
|
|
191
|
+
### Changed — substrate
|
|
192
|
+
|
|
193
|
+
- **Deep comment trees expand credential-free.** The old.reddit HTML channel
|
|
194
|
+
previously stopped at "load more comments" links and marked the thread
|
|
195
|
+
`comments_truncated`. It now follows them via the same `/api/morechildren`
|
|
196
|
+
endpoint the page's own JS uses (no credentials needed), re-nesting the flat
|
|
197
|
+
response under each comment's true parent — old.reddit flattens deep
|
|
198
|
+
continuations to the bottom of the page, so structural position can't be
|
|
199
|
+
trusted; the real parent comes back in the API response. Expansion is bounded
|
|
200
|
+
by a paced request budget (`max_more_requests`, default 24) to stay under
|
|
201
|
+
Reddit's rate-sensitive WAF; when the budget is spent (or a fetch fails, or a
|
|
202
|
+
link isn't parseable) the residual `more` markers keep `comments_truncated`
|
|
203
|
+
honestly `True`. Best-effort augmentation: any expansion failure leaves the
|
|
204
|
+
base capture intact.
|
|
205
|
+
- **Galleries resolve to distinct full-resolution images.** Gallery media is
|
|
206
|
+
now read from old.reddit's authoritative `.gallery-tile[data-media-id]` list
|
|
207
|
+
(ordered, deduplicated) and mapped to full-res `i.redd.it/<id>.<ext>` URLs.
|
|
208
|
+
Previously the post tile's `preview.redd.it` thumbnails were scraped and
|
|
209
|
+
keyed by full URL, so the same image at several widths produced duplicate
|
|
210
|
+
gallery items pointing at thumbnails. Falls back to a preview-URL scrape
|
|
211
|
+
(deduped by media id) when the structured tiles are absent.
|
|
212
|
+
|
|
213
|
+
In both cases, OAuth (when a Reddit app is configured) remains the
|
|
214
|
+
fidelity/rate upgrade: 60 req/min headroom for very large threads beyond the
|
|
215
|
+
HTML budget, plus gallery dimensions/captions the HTML tiles don't expose.
|
|
216
|
+
|
|
217
|
+
## [0.1.3] — 2026-06-09
|
|
218
|
+
|
|
219
|
+
Credential-free Reddit capture. Reddit previously required every user to
|
|
220
|
+
register their own OAuth app (`client_id`/`client_secret`) before any Reddit
|
|
221
|
+
URL could be captured — a real adoption friction and the only source that
|
|
222
|
+
reported `degraded` on a fresh install. Reddit's anonymous `.json` API is
|
|
223
|
+
WAF-blocked, but the **old.reddit.com HTML** site still serves live threads +
|
|
224
|
+
full comment trees to a browser-shaped request. v0.1.3 adds that as the
|
|
225
|
+
**default** channel, so Reddit works out of the box; OAuth becomes an optional
|
|
226
|
+
higher-fidelity / higher-rate-limit upgrade.
|
|
227
|
+
|
|
228
|
+
### Changed — substrate
|
|
229
|
+
|
|
230
|
+
- **`RedditExtractor` is now credential-free by default (shape A).** `extract()`
|
|
231
|
+
runs a fallback chain: when a Reddit app is configured, the OAuth-JSON channel
|
|
232
|
+
is tried first (cleaner comment pagination + 60 req/min headroom) and the
|
|
233
|
+
old.reddit-HTML channel backstops it; with no app, capture still works via
|
|
234
|
+
HTML. Both channels emit the same two-Listing shape, so all typed assembly
|
|
235
|
+
(`post_type` / media / `removed_status` / `is_op` / crosspost / comment
|
|
236
|
+
recursion) is shared verbatim. A bad/expired OAuth app now degrades gracefully
|
|
237
|
+
to HTML instead of failing the capture.
|
|
238
|
+
- **`/health` for Reddit is green credential-free.** A missing Reddit app is no
|
|
239
|
+
longer `degraded` — only an unreachable old.reddit is. `fallback_count`
|
|
240
|
+
reflects the wired channels (2 with creds, 1 without).
|
|
241
|
+
|
|
242
|
+
### Added — substrate
|
|
243
|
+
|
|
244
|
+
- **Old.reddit HTML channel** — forces the `old.reddit.com` host + `?limit=500`
|
|
245
|
+
for the deep comment tree, sends the `over18` cookie to clear the NSFW
|
|
246
|
+
interstitial, and paces + retries on the rate-sensitive WAF's transient 403s.
|
|
247
|
+
Raw HTML is preserved to the Source-tier (`.html.gz`). Parses
|
|
248
|
+
post fields, self-text (rendered → markdown), media URLs, the recursive
|
|
249
|
+
comment tree, `removed`/`deleted` status, and truncation. *Known fidelity
|
|
250
|
+
gaps vs the OAuth channel: galleries are best-effort; very deep trees still
|
|
251
|
+
truncate (`comments_truncated`). old.reddit is self-host-only by design — a
|
|
252
|
+
datacenter IP would hit the WAF and `robots.txt`.*
|
|
253
|
+
- **`RedditPayload.extractor_source`** gains `"reddit-html"` so the typed
|
|
254
|
+
payload self-describes which channel produced it.
|
|
255
|
+
|
|
256
|
+
### CLI
|
|
257
|
+
|
|
258
|
+
- **`khiipd capture` / `recall` / `refetch` no longer leak `httpx` tracebacks**
|
|
259
|
+
on a slow/timed-out request — they print a clean message (capture/refetch
|
|
260
|
+
note the daemon may still be finishing server-side) and exit non-zero. Default
|
|
261
|
+
`capture` / `refetch` timeouts raised to 120s so slow sources (YouTube,
|
|
262
|
+
Wikipedia) don't trip the old 30s/60s limits on the happy path.
|
|
263
|
+
|
|
264
|
+
## [0.1.2] — 2026-06-08
|
|
265
|
+
|
|
266
|
+
Dedicated Wikipedia extractor. Wikipedia URLs were previously captured by the
|
|
267
|
+
generic `WebExtractor` (emitting a `WebPayload`); the `WikiPayload` type +
|
|
268
|
+
renderer + embed-composition shipped at v0.1 but no extractor produced one.
|
|
269
|
+
`WikiExtractor` completes that scaffolding by talking to the MediaWiki API
|
|
270
|
+
directly — the one generic-web source with a first-class, versioned API.
|
|
271
|
+
|
|
272
|
+
### Added — substrate
|
|
273
|
+
|
|
274
|
+
- **`khiip.extractors.wiki.WikiExtractor`** — `*.wikipedia.org` `/wiki/<Title>`
|
|
275
|
+
article URLs (language subdomains + `.m.` mobile variants). Registered before
|
|
276
|
+
`WebExtractor` in the default registry so Wikipedia URLs are claimed here
|
|
277
|
+
rather than by the http(s) catch-all. Emits a typed `WikiPayload`
|
|
278
|
+
(`source="wiki"`).
|
|
279
|
+
- **Two-source fallback chain** (the resilience pattern shared across all
|
|
280
|
+
extractors):
|
|
281
|
+
- **MediaWiki action API** (primary) —
|
|
282
|
+
`action=query&prop=extracts|pageimages|info&explaintext=1&exsectionformat=raw` for a clean plain-text
|
|
283
|
+
body parsed into structured `sections` (no fragile HTML scraping for the
|
|
284
|
+
load-bearing fields) + page image + canonical URL; `action=parse&prop=text`
|
|
285
|
+
rendered HTML parsed **best-effort** for `references` + `infobox` (the two
|
|
286
|
+
fields with no clean JSON surface — a parse miss degrades only those).
|
|
287
|
+
- **REST v1 summary** (fallback) — `/api/rest_v1/page/summary/{title}` for a
|
|
288
|
+
thinner lead-only payload via an orthogonal endpoint family.
|
|
289
|
+
- **Source-tier raw-bytes preservation** — the structured query
|
|
290
|
+
JSON is gzipped + retained for future re-derivation.
|
|
291
|
+
- **`health_check()`** probing the REST summary endpoint for the en "Wikipedia"
|
|
292
|
+
article (5s short-timeout), surfaced on `/health` like the other extractors.
|
|
293
|
+
- **CC BY-SA 4.0 attribution** populated on every Wikipedia capture
|
|
294
|
+
(`contributors_attribution`), per Wikipedia's reuse license.
|
|
295
|
+
|
|
296
|
+
### Notes
|
|
297
|
+
|
|
298
|
+
- Foundation-vs-on-top preserved: new module + registration only; the v0.1.1
|
|
299
|
+
substrate (`WikiPayload`, `WikiMarkdownRenderer`, `_compose_wiki`,
|
|
300
|
+
`SourceName`) was already in place and is unchanged. +49 tests.
|
|
301
|
+
|
|
302
|
+
## [0.1.1] — 2026-05-29
|
|
303
|
+
|
|
304
|
+
MCP server. Six tools (`capture_url`, `recall`, `list_captures`,
|
|
305
|
+
`get_capture`, `refetch_capture`, `daemon_status`) over stdio for Claude
|
|
306
|
+
Desktop / Cursor / any MCP-aware client. Thin HTTP proxy architecture:
|
|
307
|
+
the MCP server connects to a running Khiip
|
|
308
|
+
daemon at `127.0.0.1:8478`, reads the Bearer auth token from
|
|
309
|
+
`~/.config/khiip/auth.toml`, and translates MCP tool calls to REST.
|
|
310
|
+
Daemon must be running before tool calls land.
|
|
311
|
+
|
|
312
|
+
### Added — substrate
|
|
313
|
+
|
|
314
|
+
- **`khiip.mcp` subpackage** — `DaemonClient` (httpx-based REST proxy),
|
|
315
|
+
`build_server` (FastMCP-based server factory with six tools registered),
|
|
316
|
+
`__main__` (entry point for the `khiip-mcp-server` console script).
|
|
317
|
+
- **`khiip-mcp-server` console script** wired in `pyproject.toml`. Used by
|
|
318
|
+
Claude Desktop / Cursor MCP configs to launch the stdio server.
|
|
319
|
+
- **`mcp>=1.27,<2` dependency** added to `dependencies` (bundled with the
|
|
320
|
+
daemon — no optional `[mcp]` extra). Aligns with the launch-essay +
|
|
321
|
+
Show HN claim that v0.1 ships REST + MCP together.
|
|
322
|
+
- **Six MCP tools** mapping 1:1 to existing REST endpoints:
|
|
323
|
+
- `capture_url(url, force_new=False)` → `POST /api/v1/captures`
|
|
324
|
+
- `recall(query, limit=10)` → `GET /api/v1/recall`
|
|
325
|
+
- `list_captures(source=None, limit=50, offset=0)` → `GET /api/v1/captures`
|
|
326
|
+
- `get_capture(capture_id, format="json")` → `GET /api/v1/captures/{id}`
|
|
327
|
+
- `refetch_capture(capture_id, dimension="extraction")` → `POST /api/v1/captures/{id}/refetch`
|
|
328
|
+
- `daemon_status()` → `GET /health` + `GET /api/v1/meta` combined
|
|
329
|
+
- **Structured error responses** — daemon-unreachable + HTTP-error
|
|
330
|
+
translation into LLM-readable `{"error": ..., "detail": ...}` dicts so
|
|
331
|
+
agents get actionable failure messages instead of opaque MCP errors.
|
|
332
|
+
- **`KHIIP_DAEMON_URL` env var override** for pointing the MCP server at a
|
|
333
|
+
non-default daemon (useful for sandboxes / dev iterations).
|
|
334
|
+
|
|
335
|
+
### Added — tests
|
|
336
|
+
|
|
337
|
+
- `tests/test_mcp.py` (21 tests; +21 to v0.1.0's 747 baseline = 768 green
|
|
338
|
+
at v0.1.1). Coverage:
|
|
339
|
+
- `DaemonClient` unit tests against `httpx.MockTransport` (auth headers,
|
|
340
|
+
query-param threading, error-translation, env-var override)
|
|
341
|
+
- FastMCP tool tests with an injected fake client (argument threading,
|
|
342
|
+
return-shape envelopes, structured-error translation)
|
|
343
|
+
- Tool-surface lock: asserts exactly 6 tools registered at v0.1.1
|
|
344
|
+
- End-to-end smoke verified by launching `khiip-mcp-server` via the
|
|
345
|
+
official `mcp` Python SDK client; `list_tools` returns all six.
|
|
346
|
+
|
|
347
|
+
### Architectural notes
|
|
348
|
+
|
|
349
|
+
- **REST stays canonical at v0.1.1.** MCP is a transport translator over
|
|
350
|
+
the REST surface. Service-tier refactor (extracting orchestration into
|
|
351
|
+
`khiip.services.*` functions both REST and MCP would call directly) is
|
|
352
|
+
deferred to a forcing function — a hosted/productized surface or
|
|
353
|
+
a third transport. The migration is low-cost: tool signatures, Claude
|
|
354
|
+
Desktop config, and behavior stay stable across the eventual refactor.
|
|
355
|
+
- **Foundation-vs-on-top discipline:** all v0.1.1 changes live in the new
|
|
356
|
+
`khiip.mcp` subpackage. No existing module (extractors, renderers,
|
|
357
|
+
daemon routes, storage, embeddings, wayback) was modified. The v0.1.0
|
|
358
|
+
test suite continues to pass unchanged (747/747).
|
|
359
|
+
|
|
360
|
+
### Stale-context cleanup
|
|
361
|
+
|
|
362
|
+
- **README** — TRUST.md / "8 immutable promises" / "future MCP server"
|
|
363
|
+
references stripped; License section reframed from "Promise 1" to
|
|
364
|
+
LICENSE-grounded framing; status checklist updated for v0.1.1.
|
|
365
|
+
|
|
366
|
+
## [0.1.0] — 2026-05-25
|
|
367
|
+
|
|
368
|
+
First substantive substrate release. Six v0.1 sources (X, Reddit, Web,
|
|
369
|
+
Wikipedia, YouTube, PDF) emit typed Pydantic payloads end-to-end; full
|
|
370
|
+
Renderer Protocol+Registry tier; visual indicator system; Source-tier
|
|
371
|
+
raw-bytes preservation; typed-payload recall composition; vault↔SQLite
|
|
372
|
+
validator; granular refetch CLI; opt-in video preservation.
|
|
373
|
+
|
|
374
|
+
### Added — substrate
|
|
375
|
+
|
|
376
|
+
- **Typed-payload pipeline** — all 6 v0.1 sources emit
|
|
377
|
+
Pydantic-typed payloads at the extractor boundary: `TweetPayload`,
|
|
378
|
+
`RedditPayload`, `WikiPayload`, `WebPayload`, `PDFPayload`,
|
|
379
|
+
`YouTubePayload`, plus cross-platform primitives (`EngagementCounts`,
|
|
380
|
+
`UrlEntity`, `Media`, `CommentNode`) and X-specific `XArticle` /
|
|
381
|
+
`XArticleBlock`. Discriminated union over `kind`.
|
|
382
|
+
- **Renderer Protocol + Registry** — 6 per-source
|
|
383
|
+
`MarkdownRenderer` impls + `LegacyMarkdownPassthroughRenderer` +
|
|
384
|
+
`JSONRenderer` + `VaultFrontmatterRenderer` dispatch first-supporting-
|
|
385
|
+
wins through `MarkdownRendererRegistry`.
|
|
386
|
+
- **Source-tier raw-bytes preservation** (URI-shaped) — original
|
|
387
|
+
upstream bytes gzipped + written under
|
|
388
|
+
user-configurable `data_root` (default
|
|
389
|
+
`~/.local/share/khiip/sources/<source>/<capture_id>.<ext>.gz`); URI-shaped `source_artifact_path`
|
|
390
|
+
generalizes to cloud (`s3://`) / Notion (`notion://`) at v0.5+.
|
|
391
|
+
- **MediaFetcherRegistry** — capability-based dispatch
|
|
392
|
+
across three fetchers: `HttpxFetcher` (photo CDN), `YtDlpFetcher`
|
|
393
|
+
(video via opt-in toggle; HLS/DASH/`v.redd.it`/YouTube),
|
|
394
|
+
`GalleryDlFetcher` (wide-coverage fallback).
|
|
395
|
+
- **Wayback Machine archive submission** (`archive_urls`) —
|
|
396
|
+
best-effort URL archival on every capture; opt-out via
|
|
397
|
+
`[archive] wayback_enabled = false`.
|
|
398
|
+
- **Visual indicator system** (three-layer model): layer 1
|
|
399
|
+
frontmatter-canonical, layer 2 renderer-composed Obsidian
|
|
400
|
+
callouts in vault body. Layer 3 plugin badges
|
|
401
|
+
deferred to v0.5+.
|
|
402
|
+
- **Embed-text composition** — per-source typed-payload
|
|
403
|
+
composition rules for the embedder; URL-fallback for failed-permanent
|
|
404
|
+
tombstones (capture-what-we-can).
|
|
405
|
+
- **Reddit OAuth2 channel** —
|
|
406
|
+
user registers their own Reddit app at https://www.reddit.com/prefs/apps;
|
|
407
|
+
client_id/secret via config.toml or env vars. Khiip-corporate never holds
|
|
408
|
+
upstream credentials.
|
|
409
|
+
- **YouTube fallback chain** —
|
|
410
|
+
yt-dlp → youtube-transcript-api + oEmbed → YouTube Data API v3 (operator
|
|
411
|
+
opt-in via API key); per-branch payload-shape fidelity.
|
|
412
|
+
- **Path-sandbox + defense-in-depth** at security boundaries: vault
|
|
413
|
+
`destination_path` validator, Source-tier
|
|
414
|
+
triple-guarded path validation (source/capture_id/ext rejection of
|
|
415
|
+
`..`/separators/NUL), media-fetcher subprocess argv-list invocation.
|
|
416
|
+
|
|
417
|
+
### Added — surfaces
|
|
418
|
+
|
|
419
|
+
- **`khiipd serve`** — FastAPI daemon (default `127.0.0.1:8478`).
|
|
420
|
+
- **`khiipd capture <url>`** — capture a URL via the running daemon.
|
|
421
|
+
- **`khiipd recall <query>`** — semantic recall against the embedded
|
|
422
|
+
corpus.
|
|
423
|
+
- **`khiipd validate`** *(new in 0.1.0)* — vault ↔ SQLite consistency
|
|
424
|
+
checker. Four invariants: vault ↔ SQLite
|
|
425
|
+
bidirectional reconciliation, structured_payload Pydantic round-trip,
|
|
426
|
+
P-δ status family consistency, media local_path sandbox + existence.
|
|
427
|
+
Exit code 0 clean / 1 violations / 2 missing inputs. `--json` for
|
|
428
|
+
machine-readable output.
|
|
429
|
+
- **`khiipd refetch <id> [--media|--wayback]`** *(new in 0.1.0)* —
|
|
430
|
+
granular refetch. Three dimensions
|
|
431
|
+
independently re-attemptable: extraction (default; re-run extractor;
|
|
432
|
+
new capture; old gets `superseded_by`), media (re-walk
|
|
433
|
+
MediaFetcherRegistry in place), wayback (re-submit + update
|
|
434
|
+
`archive_urls`).
|
|
435
|
+
- **`khiipd auth show|rotate`** — manage the daemon API key.
|
|
436
|
+
- **REST `?format=` selector** on `GET /api/v1/captures/{id}` —
|
|
437
|
+
`capture` (default; full JSON envelope) / `json` (payload only via
|
|
438
|
+
JSONRenderer) / `markdown` (vault file) / `legacy-markdown` (body
|
|
439
|
+
only).
|
|
440
|
+
- **REST `POST /api/v1/captures/{id}/refetch?dimension=...`** *(new in
|
|
441
|
+
0.1.0)* — the daemon-side endpoint the CLI wraps.
|
|
442
|
+
- **REST `?force_new=true`** on `POST /api/v1/captures` — bypass dedup;
|
|
443
|
+
used internally by refetch extraction dimension.
|
|
444
|
+
|
|
445
|
+
### Added — configuration
|
|
446
|
+
|
|
447
|
+
- `[storage] data_root` — user-configurable Source-tier root (default
|
|
448
|
+
XDG `~/.local/share/khiip/`). Points anywhere on a filesystem:
|
|
449
|
+
iCloud Drive / Dropbox / external SSD / network mount for
|
|
450
|
+
cross-machine sync.
|
|
451
|
+
- `[media] download_videos` *(new in 0.1.0)* — opt-in default-off toggle
|
|
452
|
+
for video preservation. When `true`,
|
|
453
|
+
`YtDlpFetcher` registers in the default chain + `GalleryDlFetcher`
|
|
454
|
+
claims `media.type == "video"` (non-streaming-manifest URLs).
|
|
455
|
+
- `[media] fetch_enabled` / `[media] fetcher_disabled` — per-source +
|
|
456
|
+
per-fetcher toggles.
|
|
457
|
+
- `[archive] wayback_enabled` — Wayback opt-out (default true).
|
|
458
|
+
- `[extractors.reddit] client_id` / `client_secret` (or
|
|
459
|
+
`KHIIP_REDDIT_CLIENT_ID` / `KHIIP_REDDIT_CLIENT_SECRET` env vars).
|
|
460
|
+
- `[extractors.youtube] api_key` (or `KHIIP_YOUTUBE_API_KEY` env var).
|
|
461
|
+
|
|
462
|
+
### Changed
|
|
463
|
+
|
|
464
|
+
- **Reddit channel** revised from anonymous `.json` to OAuth2 against
|
|
465
|
+
`oauth.reddit.com`.
|
|
466
|
+
- **Vault frontmatter is canonical** for the Payload tier; SQLite
|
|
467
|
+
`captures` + `embeddings` tables become derived caches rebuildable
|
|
468
|
+
from vault.
|
|
469
|
+
- **`content_sha256`** now hashes the embed-text composition
|
|
470
|
+
so cosmetic rendering changes don't trigger re-embedding.
|
|
471
|
+
|
|
472
|
+
### Documented
|
|
473
|
+
|
|
474
|
+
- Foundational architecture decisions recorded: the graph schema, the
|
|
475
|
+
extractor Protocol, append-only edges, the Source/Payload/Render
|
|
476
|
+
three-tier model, and the P-δ failure-handling principle.
|
|
477
|
+
- Substrate-vs-surfaces architectural axis: Khiip = one canonical substrate +
|
|
478
|
+
N surfaces (Obsidian plugin / REST / MCP / Chrome ext / mobile /
|
|
479
|
+
Telegram bot / LLM-agent SDK consumers all speak the unified contract).
|
|
480
|
+
|
|
481
|
+
### Notes
|
|
482
|
+
|
|
483
|
+
- This release lays the substrate baseline. Plugin UI rendering of the
|
|
484
|
+
new typed-payload + visual indicator + refetch surfaces lands in the v0.5+
|
|
485
|
+
premium UX scope.
|
|
486
|
+
- The pre-public-launch posture statement for the SD-4 video-binary capability is
|
|
487
|
+
separately tracked (1-2 hr authoring; not blocking this tag).
|
|
488
|
+
|
|
489
|
+
[0.2.0]: https://github.com/KhiipAI/khiip/releases/tag/v0.2.0
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Code of Conduct
|
|
2
|
+
|
|
3
|
+
This project adopts the **[Contributor Covenant 2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct/)** as its Code of Conduct.
|
|
4
|
+
|
|
5
|
+
The full text is available at the link above. In short: we expect all contributors and community members to engage respectfully, constructively, and in good faith — and to extend the same to others regardless of background, experience level, or identity.
|
|
6
|
+
|
|
7
|
+
## Enforcement
|
|
8
|
+
|
|
9
|
+
Instances of unacceptable behavior may be reported to the project maintainers at:
|
|
10
|
+
|
|
11
|
+
**Email:** [hello@khiip.com](mailto:hello@khiip.com)
|
|
12
|
+
|
|
13
|
+
All reports will be reviewed and investigated promptly and fairly. The project team is obligated to respect the privacy and security of the reporter of any incident.
|
|
14
|
+
|
|
15
|
+
## Scope
|
|
16
|
+
|
|
17
|
+
This Code of Conduct applies within all community spaces — including but not limited to:
|
|
18
|
+
|
|
19
|
+
- The Khiip GitHub repository (issues, PRs, discussions, code comments)
|
|
20
|
+
- Official Khiip communication channels (email, Discord when launched, etc.)
|
|
21
|
+
- Public spaces in which an individual is officially representing the community
|
|
22
|
+
|
|
23
|
+
## Attribution
|
|
24
|
+
|
|
25
|
+
This Code of Conduct adopts the Contributor Covenant, version 2.1, available at https://www.contributor-covenant.org/version/2/1/code_of_conduct/
|
|
26
|
+
|
|
27
|
+
For answers to common questions about this code of conduct, see the FAQ at https://www.contributor-covenant.org/faq
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
*Khiip Code of Conduct v1.0; last updated 2026-05-17. Adopts Contributor Covenant 2.1 by reference.*
|