siftd 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. siftd-0.1.1/.claude-plugin/marketplace.json +28 -0
  2. siftd-0.1.1/.githooks/pre-commit +2 -0
  3. siftd-0.1.1/.github/workflows/ci.yml +53 -0
  4. siftd-0.1.1/.github/workflows/publish.yml +30 -0
  5. siftd-0.1.1/.gitignore +12 -0
  6. siftd-0.1.1/CHANGELOG.md +96 -0
  7. siftd-0.1.1/CLAUDE.md +69 -0
  8. siftd-0.1.1/LICENSE +21 -0
  9. siftd-0.1.1/PKG-INFO +88 -0
  10. siftd-0.1.1/README.md +57 -0
  11. siftd-0.1.1/bench/build.py +122 -0
  12. siftd-0.1.1/bench/corpus_analysis.py +233 -0
  13. siftd-0.1.1/bench/queries.json +114 -0
  14. siftd-0.1.1/bench/run.py +569 -0
  15. siftd-0.1.1/bench/strategies/exchange-window.json +11 -0
  16. siftd-0.1.1/bench/view.py +251 -0
  17. siftd-0.1.1/docs/cli.md +449 -0
  18. siftd-0.1.1/plugin/.claude-plugin/plugin.json +15 -0
  19. siftd-0.1.1/plugin/README.md +89 -0
  20. siftd-0.1.1/plugin/hooks/hooks.json +36 -0
  21. siftd-0.1.1/plugin/scripts/session-start.sh +29 -0
  22. siftd-0.1.1/plugin/scripts/skill-reminder.sh +18 -0
  23. siftd-0.1.1/plugin/scripts/skill-required.sh +19 -0
  24. siftd-0.1.1/plugin/skills/siftd/SKILL.md +258 -0
  25. siftd-0.1.1/plugin/skills/siftd/reference/ask.md +223 -0
  26. siftd-0.1.1/plugin/skills/siftd/reference/query.md +177 -0
  27. siftd-0.1.1/plugin/skills/siftd/reference/tags.md +109 -0
  28. siftd-0.1.1/plugin/skills/siftd-api/SKILL.md +202 -0
  29. siftd-0.1.1/pyproject.toml +69 -0
  30. siftd-0.1.1/scripts/examples/bulk-tag.py +113 -0
  31. siftd-0.1.1/scripts/examples/find-decisions.py +113 -0
  32. siftd-0.1.1/scripts/examples/weekly-cost.sql +27 -0
  33. siftd-0.1.1/scripts/gen-cli-docs.sh +40 -0
  34. siftd-0.1.1/scripts/lint.sh +24 -0
  35. siftd-0.1.1/src/siftd/__init__.py +54 -0
  36. siftd-0.1.1/src/siftd/adapters/__init__.py +5 -0
  37. siftd-0.1.1/src/siftd/adapters/_jsonl.py +33 -0
  38. siftd-0.1.1/src/siftd/adapters/aider.py +305 -0
  39. siftd-0.1.1/src/siftd/adapters/claude_code.py +259 -0
  40. siftd-0.1.1/src/siftd/adapters/codex_cli.py +268 -0
  41. siftd-0.1.1/src/siftd/adapters/gemini_cli.py +260 -0
  42. siftd-0.1.1/src/siftd/adapters/registry.py +183 -0
  43. siftd-0.1.1/src/siftd/api/__init__.py +152 -0
  44. siftd-0.1.1/src/siftd/api/adapters.py +127 -0
  45. siftd-0.1.1/src/siftd/api/conversations.py +647 -0
  46. siftd-0.1.1/src/siftd/api/doctor.py +19 -0
  47. siftd-0.1.1/src/siftd/api/export.py +313 -0
  48. siftd-0.1.1/src/siftd/api/file_refs.py +111 -0
  49. siftd-0.1.1/src/siftd/api/peek.py +21 -0
  50. siftd-0.1.1/src/siftd/api/resources.py +132 -0
  51. siftd-0.1.1/src/siftd/api/search.py +173 -0
  52. siftd-0.1.1/src/siftd/api/stats.py +155 -0
  53. siftd-0.1.1/src/siftd/api/tools.py +130 -0
  54. siftd-0.1.1/src/siftd/backfill.py +285 -0
  55. siftd-0.1.1/src/siftd/builtin_queries/__init__.py +7 -0
  56. siftd-0.1.1/src/siftd/builtin_queries/cost.sql +27 -0
  57. siftd-0.1.1/src/siftd/builtin_queries/shell-analysis.sql +104 -0
  58. siftd-0.1.1/src/siftd/cli.py +1516 -0
  59. siftd-0.1.1/src/siftd/cli_ask.py +370 -0
  60. siftd-0.1.1/src/siftd/cli_install.py +216 -0
  61. siftd-0.1.1/src/siftd/config.py +101 -0
  62. siftd-0.1.1/src/siftd/doctor/__init__.py +21 -0
  63. siftd-0.1.1/src/siftd/doctor/checks.py +637 -0
  64. siftd-0.1.1/src/siftd/doctor/runner.py +130 -0
  65. siftd-0.1.1/src/siftd/domain/__init__.py +25 -0
  66. siftd-0.1.1/src/siftd/domain/models.py +77 -0
  67. siftd-0.1.1/src/siftd/domain/shell_categories.py +120 -0
  68. siftd-0.1.1/src/siftd/domain/source.py +22 -0
  69. siftd-0.1.1/src/siftd/embeddings/__init__.py +37 -0
  70. siftd-0.1.1/src/siftd/embeddings/availability.py +56 -0
  71. siftd-0.1.1/src/siftd/embeddings/base.py +68 -0
  72. siftd-0.1.1/src/siftd/embeddings/chunker.py +274 -0
  73. siftd-0.1.1/src/siftd/embeddings/fastembed_backend.py +41 -0
  74. siftd-0.1.1/src/siftd/embeddings/indexer.py +159 -0
  75. siftd-0.1.1/src/siftd/embeddings/ollama_backend.py +83 -0
  76. siftd-0.1.1/src/siftd/ids.py +38 -0
  77. siftd-0.1.1/src/siftd/ingestion/__init__.py +12 -0
  78. siftd-0.1.1/src/siftd/ingestion/discovery.py +23 -0
  79. siftd-0.1.1/src/siftd/ingestion/orchestration.py +384 -0
  80. siftd-0.1.1/src/siftd/math.py +11 -0
  81. siftd-0.1.1/src/siftd/models.py +133 -0
  82. siftd-0.1.1/src/siftd/output/__init__.py +45 -0
  83. siftd-0.1.1/src/siftd/output/formatters.py +680 -0
  84. siftd-0.1.1/src/siftd/output/registry.py +170 -0
  85. siftd-0.1.1/src/siftd/paths.py +75 -0
  86. siftd-0.1.1/src/siftd/peek/__init__.py +23 -0
  87. siftd-0.1.1/src/siftd/peek/reader.py +214 -0
  88. siftd-0.1.1/src/siftd/peek/scanner.py +142 -0
  89. siftd-0.1.1/src/siftd/search.py +398 -0
  90. siftd-0.1.1/src/siftd/storage/__init__.py +43 -0
  91. siftd-0.1.1/src/siftd/storage/embeddings.py +245 -0
  92. siftd-0.1.1/src/siftd/storage/filters.py +94 -0
  93. siftd-0.1.1/src/siftd/storage/fts.py +157 -0
  94. siftd-0.1.1/src/siftd/storage/queries.py +139 -0
  95. siftd-0.1.1/src/siftd/storage/schema.sql +287 -0
  96. siftd-0.1.1/src/siftd/storage/sqlite.py +991 -0
  97. siftd-0.1.1/src/siftd/storage/tags.py +255 -0
  98. siftd-0.1.1/tests/conftest.py +303 -0
  99. siftd-0.1.1/tests/fixtures/.aider.chat.history.md +49 -0
  100. siftd-0.1.1/tests/fixtures/claude_code_minimal.jsonl +4 -0
  101. siftd-0.1.1/tests/fixtures/codex_cli_minimal.jsonl +6 -0
  102. siftd-0.1.1/tests/fixtures/gemini_cli_minimal.json +35 -0
  103. siftd-0.1.1/tests/test_adapters.py +378 -0
  104. siftd-0.1.1/tests/test_api.py +431 -0
  105. siftd-0.1.1/tests/test_chunker.py +70 -0
  106. siftd-0.1.1/tests/test_cli.py +90 -0
  107. siftd-0.1.1/tests/test_config.py +186 -0
  108. siftd-0.1.1/tests/test_derivative.py +249 -0
  109. siftd-0.1.1/tests/test_doctor.py +384 -0
  110. siftd-0.1.1/tests/test_embeddings_availability.py +197 -0
  111. siftd-0.1.1/tests/test_embeddings_storage.py +151 -0
  112. siftd-0.1.1/tests/test_exclude_active.py +81 -0
  113. siftd-0.1.1/tests/test_export.py +323 -0
  114. siftd-0.1.1/tests/test_formatters.py +328 -0
  115. siftd-0.1.1/tests/test_ingestion.py +357 -0
  116. siftd-0.1.1/tests/test_integration.py +364 -0
  117. siftd-0.1.1/tests/test_mmr.py +189 -0
  118. siftd-0.1.1/tests/test_models.py +48 -0
  119. siftd-0.1.1/tests/test_peek.py +376 -0
  120. siftd-0.1.1/tests/test_query_files.py +238 -0
  121. siftd-0.1.1/tests/test_shell_categorization.py +100 -0
  122. siftd-0.1.1/uv.lock +898 -0
@@ -0,0 +1,28 @@
1
+ {
2
+ "name": "siftd",
3
+ "owner": {
4
+ "name": "kaygee",
5
+ "url": "https://github.com/kgruel"
6
+ },
7
+ "metadata": {
8
+ "description": "Siftd plugin marketplace — conversation research for Claude Code agents",
9
+ "version": "1.0.0"
10
+ },
11
+ "plugins": [
12
+ {
13
+ "name": "siftd",
14
+ "source": "./plugin",
15
+ "description": "Search and research past conversations from CLI coding sessions",
16
+ "version": "1.0.0",
17
+ "author": {
18
+ "name": "kaygee",
19
+ "url": "https://github.com/kgruel"
20
+ },
21
+ "repository": "https://github.com/kgruel/siftd",
22
+ "license": "MIT",
23
+ "keywords": ["conversations", "search", "research", "history", "analytics"],
24
+ "category": "productivity",
25
+ "tags": ["research", "history", "memory"]
26
+ }
27
+ ]
28
+ }
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env bash
2
+ exec "$(dirname "$0")/../scripts/lint.sh"
@@ -0,0 +1,53 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.12"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Install uv
20
+ uses: astral-sh/setup-uv@v4
21
+ with:
22
+ version: "latest"
23
+
24
+ - name: Set up Python ${{ matrix.python-version }}
25
+ run: uv python install ${{ matrix.python-version }}
26
+
27
+ - name: Install dependencies
28
+ run: uv sync --extra dev
29
+
30
+ - name: Run lint
31
+ run: uv run ruff check src/
32
+
33
+ - name: Run tests
34
+ run: uv run pytest tests/ -v --tb=short
35
+
36
+ test-with-embeddings:
37
+ runs-on: ubuntu-latest
38
+ steps:
39
+ - uses: actions/checkout@v4
40
+
41
+ - name: Install uv
42
+ uses: astral-sh/setup-uv@v4
43
+ with:
44
+ version: "latest"
45
+
46
+ - name: Set up Python
47
+ run: uv python install 3.12
48
+
49
+ - name: Install dependencies with embeddings
50
+ run: uv sync --extra dev --extra embed
51
+
52
+ - name: Run tests with embeddings
53
+ run: uv run pytest tests/ -v --tb=short
@@ -0,0 +1,30 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - 'v*'
7
+
8
+ jobs:
9
+ publish:
10
+ runs-on: ubuntu-latest
11
+ environment: pypi
12
+ permissions:
13
+ id-token: write # Required for trusted publishing
14
+
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - name: Install uv
19
+ uses: astral-sh/setup-uv@v4
20
+ with:
21
+ version: "latest"
22
+
23
+ - name: Set up Python
24
+ run: uv python install 3.12
25
+
26
+ - name: Build package
27
+ run: uv build
28
+
29
+ - name: Publish to PyPI
30
+ uses: pypa/gh-action-pypi-publish@release/v1
siftd-0.1.1/.gitignore ADDED
@@ -0,0 +1,12 @@
1
+ __pycache__/
2
+ *.pyc
3
+ .subtask/
4
+ bench/runs/
5
+ .pytest_cache/
6
+ .ruff_cache/
7
+ .obsidian/
8
+ .venv/
9
+ dist/
10
+ .claude/
11
+ HANDOFF.md
12
+ ROADMAP.md
@@ -0,0 +1,96 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [0.1.1] - 2026-01-29
11
+
12
+ ### Added
13
+
14
+ - `siftd install embed` — Convenience command to auto-detect installation method and install embedding dependencies
15
+ - `:var` parameterized syntax for query files — safe quoting via sqlite3, alongside existing `$var` text substitution
16
+ - `ADAPTER_INTERFACE_VERSION = 1` — Required attribute for all adapters, enables future interface migrations
17
+ - `ON DELETE CASCADE` on schema foreign keys — Child records now cascade on parent delete
18
+
19
+ ### Changed
20
+
21
+ - Adapter `discover()` function now requires `locations` keyword argument (fallback removed)
22
+ - Error messages for missing `[embed]` extra now reference `siftd install embed` and suggest FTS5 alternative
23
+
24
+ ### Removed
25
+
26
+ - `Conversation.default_model` field — Was defined but never populated or used
27
+
28
+ ### Fixed
29
+
30
+ - Type checker (`ty`) configuration for optional dependencies — No longer blocks commits
31
+ - `bench/corpus_analysis.py` type annotation bug
32
+
33
+ ## [0.1.0] - 2026-01-28
34
+
35
+ Initial public release.
36
+
37
+ ### Added
38
+
39
+ #### Core Features
40
+ - **Ingestion** — Aggregate conversation logs from multiple CLI coding tools
41
+ - **FTS5 Search** — Full-text search across all conversations via `siftd query -s`
42
+ - **Semantic Search** — Vector similarity search via `siftd ask` (requires `[embed]` extra)
43
+ - **Tagging** — Apply tags to conversations, workspaces, and tool calls for organization
44
+
45
+ #### Adapters
46
+ - Claude Code (Anthropic) — `~/.claude/projects`
47
+ - Aider — `~/.aider`
48
+ - Gemini CLI (Google) — `~/.gemini/tmp`
49
+ - Codex CLI (OpenAI) — `~/.codex/sessions`
50
+ - Drop-in adapter support via `~/.config/siftd/adapters/`
51
+ - Entry-point adapter registration for pip-installable adapters
52
+
53
+ #### CLI Commands
54
+ - `siftd ingest` — Ingest logs from all discovered sources
55
+ - `siftd status` — Show database statistics
56
+ - `siftd query` — List/filter conversations with flexible filters
57
+ - `siftd ask` — Semantic search over conversations (optional `[embed]` extra)
58
+ - `siftd tag` — Apply or remove tags on entities
59
+ - `siftd tags` — List, rename, or delete tags
60
+ - `siftd tools` — Summarize tool usage by category
61
+ - `siftd export` — Export conversations for PR review workflows
62
+ - `siftd doctor` — Run health checks and maintenance
63
+ - `siftd peek` — Inspect live sessions from disk (bypasses SQLite)
64
+ - `siftd path` — Show XDG paths
65
+ - `siftd config` — View or modify configuration
66
+ - `siftd adapters` — List discovered adapters
67
+ - `siftd copy` — Copy built-in resources for customization
68
+ - `siftd backfill` — Backfill derived data from existing records
69
+
70
+ #### Query System
71
+ - User-defined SQL queries via `~/.config/siftd/queries/*.sql`
72
+ - `$var` syntax for text substitution
73
+ - Built-in queries: `cost.sql`, `shell-analysis.sql`
74
+
75
+ #### Python API
76
+ - `siftd.api.list_conversations()` — Query conversations with filters
77
+ - `siftd.api.get_conversation()` — Get full conversation detail
78
+ - `siftd.api.export_conversations()` — Export for external tools
79
+ - `siftd.api.hybrid_search()` — Combined FTS5 + semantic search
80
+
81
+ #### Storage
82
+ - SQLite with FTS5 for full-text search
83
+ - ULID primary keys throughout
84
+ - Normalized schema with proper foreign key constraints
85
+ - Extensible `*_attributes` tables for variable metadata
86
+
87
+ #### Developer Experience
88
+ - XDG Base Directory compliance for paths
89
+ - `--db PATH` override for all commands
90
+ - JSON output mode for scripting (`--json`)
91
+
92
+ ---
93
+
94
+ [Unreleased]: https://github.com/kgruel/siftd/compare/v0.1.1...HEAD
95
+ [0.1.1]: https://github.com/kgruel/siftd/compare/v0.1.0...v0.1.1
96
+ [0.1.0]: https://github.com/kgruel/siftd/releases/tag/v0.1.0
siftd-0.1.1/CLAUDE.md ADDED
@@ -0,0 +1,69 @@
1
+ Personal LLM usage analytics. Ingests conversation logs from CLI coding tools, stores in SQLite, queries via FTS5 and user-defined SQL files.
2
+
3
+ ## Install
4
+
5
+ ```bash
6
+ uv pip install . # core (FTS5 search, tags, queries)
7
+ uv pip install .[embed] # with semantic search (siftd ask)
8
+ ```
9
+
10
+ ## Architecture
11
+
12
+ Core loop: **Ingest → Store → Query**
13
+
14
+ - **Adapters** own parsing and raw format knowledge. Storage is adapter-agnostic.
15
+ - **Storage** is normalized SQLite. Schema is fixed for core entities, extensible via `*_attributes` tables.
16
+ - **Queries** are user-defined `.sql` files with `$var` substitution. The system is a data platform, not a reporting tool.
17
+
18
+ ## Design Principles
19
+
20
+ 1. **Manual first, automate when patterns emerge** — labels are user-applied, enrichment is deferred, cost is approximate. Don't build automation until real usage reveals what's worth automating.
21
+ 2. **Query-time computation over stored redundancy** — cost is derived via JOIN, not pre-computed. Avoids stale data and schema coupling.
22
+ 3. **Attributes for variable metadata** — when the field set varies by provider or adapter, use key/value `*_attributes` tables instead of adding nullable columns.
23
+ 4. **Adapters are the parsing boundary** — each adapter knows its raw format, dedup strategy, and provider source. Everything downstream is normalized.
24
+ 5. **Approximate is fine when labeled** — approximate cost is useful. Don't over-engineer precision until billing context demands it.
25
+
26
+ ## Branching
27
+
28
+ - Work in a `wip/<topic>` branch during sessions. Handoff updates, test scaffolding, and iterative commits go here.
29
+ - Subtask merges target `main` independently (they're isolated worktrees).
30
+ - Merge/rebase WIP to main when the work is ready to ship.
31
+
32
+ ## Conventions
33
+
34
+ - `commit=False` default on storage functions; caller controls transaction boundaries
35
+ - ULIDs for all primary keys
36
+ - XDG paths: data `~/.local/share/siftd`, config `~/.config/siftd`
37
+ - New CLI commands follow existing argparse patterns in `src/siftd/cli.py`
38
+ - New adapters implement `can_handle(source)`, `parse(source)`, `discover(locations=...)`, set `HARNESS_SOURCE` and `ADAPTER_INTERFACE_VERSION`
39
+ - Queries go in `~/.config/siftd/queries/*.sql`, use `$var` for text substitution or `:var` for safely quoted parameters
40
+
41
+ ## Agent Memory (siftd)
42
+
43
+ Search past conversations:
44
+ ```
45
+ siftd ask "your query" # semantic search
46
+ siftd ask -w projectname "query" # filter by workspace
47
+ siftd query <id> # drill down into conversation
48
+ ```
49
+
50
+ Tag useful findings:
51
+ ```
52
+ siftd tag <id> research:<topic> # bookmark for later
53
+ siftd query -l research:<topic> # retrieve tagged
54
+ ```
55
+
56
+ Tag conventions:
57
+ - `research:*` — Investigation findings worth preserving
58
+ - `useful:*` — General bookmarks (useful:pattern, useful:example)
59
+ - `decision:*` — Key architectural/design decisions
60
+ - `handoff:update` — Sessions that modified HANDOFF.md
61
+ - `rationale:*` — Why we chose X over Y
62
+ - `genesis:*` — First mention of a concept
63
+
64
+ When you find something useful via `siftd ask`, tag it before moving on.
65
+
66
+ Before ending a session that updates HANDOFF.md:
67
+ ```
68
+ siftd tag --last handoff:update
69
+ ```
siftd-0.1.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 siftd contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
siftd-0.1.1/PKG-INFO ADDED
@@ -0,0 +1,88 @@
1
+ Metadata-Version: 2.4
2
+ Name: siftd
3
+ Version: 0.1.1
4
+ Summary: Personal LLM usage analytics. Ingest conversation logs from CLI coding tools, query via FTS5 and semantic search.
5
+ License-Expression: MIT
6
+ License-File: LICENSE
7
+ Keywords: analytics,claude,conversation,llm,search
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Environment :: Console
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Topic :: Software Development :: Documentation
14
+ Requires-Python: >=3.12
15
+ Requires-Dist: httpx
16
+ Requires-Dist: loguru
17
+ Requires-Dist: pyyaml
18
+ Requires-Dist: tomlkit
19
+ Requires-Dist: tqdm
20
+ Provides-Extra: dev
21
+ Requires-Dist: pytest>=8.0; extra == 'dev'
22
+ Requires-Dist: ruff>=0.4; extra == 'dev'
23
+ Requires-Dist: ty; extra == 'dev'
24
+ Provides-Extra: embed
25
+ Requires-Dist: fastembed; extra == 'embed'
26
+ Requires-Dist: huggingface-hub; extra == 'embed'
27
+ Requires-Dist: numpy; extra == 'embed'
28
+ Requires-Dist: onnxruntime; extra == 'embed'
29
+ Requires-Dist: tokenizers; extra == 'embed'
30
+ Description-Content-Type: text/markdown
31
+
32
+ # siftd
33
+
34
+ Ingest and query conversation logs from LLM coding tools. Stores in SQLite, searches via FTS5 and embeddings.
35
+
36
+ Warning: This project is under active development and breaking changes may occur.
37
+
38
+ ## Install
39
+
40
+ ```bash
41
+ pip install siftd # core (query, tags, ingest)
42
+ pip install siftd[embed] # with semantic search
43
+ ```
44
+
45
+ ## Usage
46
+
47
+ ```bash
48
+ # Ingest logs from Claude Code, Gemini CLI, Codex, Aider
49
+ siftd ingest
50
+
51
+ # List recent conversations
52
+ siftd query -w . # current workspace
53
+ siftd query --since 7d # last week
54
+
55
+ # Semantic search (requires [embed])
56
+ siftd ask "how did I handle auth"
57
+ siftd ask -w myproject "error handling"
58
+
59
+ # Tag and filter
60
+ siftd tag 01JGK3 decision:auth
61
+ siftd query -l decision:
62
+ ```
63
+
64
+ ## Supported Tools
65
+
66
+ - Claude Code
67
+ - Gemini CLI
68
+ - Codex CLI
69
+ - Aider
70
+
71
+ ## Commands
72
+
73
+ | Command | Description |
74
+ |---------|-------------|
75
+ | `ingest` | Import conversation logs |
76
+ | `query` | List/filter conversations |
77
+ | `ask` | Semantic search |
78
+ | `tag` | Apply tags to conversations |
79
+ | `peek` | View conversation contents |
80
+ | `doctor` | Check configuration |
81
+
82
+ ## Documentation
83
+
84
+ - [CLI Reference](docs/cli.md)
85
+
86
+ ## License
87
+
88
+ MIT
siftd-0.1.1/README.md ADDED
@@ -0,0 +1,57 @@
1
+ # siftd
2
+
3
+ Ingest and query conversation logs from LLM coding tools. Stores in SQLite, searches via FTS5 and embeddings.
4
+
5
+ Warning: This project is under active development and breaking changes may occur.
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pip install siftd # core (query, tags, ingest)
11
+ pip install siftd[embed] # with semantic search
12
+ ```
13
+
14
+ ## Usage
15
+
16
+ ```bash
17
+ # Ingest logs from Claude Code, Gemini CLI, Codex, Aider
18
+ siftd ingest
19
+
20
+ # List recent conversations
21
+ siftd query -w . # current workspace
22
+ siftd query --since 7d # last week
23
+
24
+ # Semantic search (requires [embed])
25
+ siftd ask "how did I handle auth"
26
+ siftd ask -w myproject "error handling"
27
+
28
+ # Tag and filter
29
+ siftd tag 01JGK3 decision:auth
30
+ siftd query -l decision:
31
+ ```
32
+
33
+ ## Supported Tools
34
+
35
+ - Claude Code
36
+ - Gemini CLI
37
+ - Codex CLI
38
+ - Aider
39
+
40
+ ## Commands
41
+
42
+ | Command | Description |
43
+ |---------|-------------|
44
+ | `ingest` | Import conversation logs |
45
+ | `query` | List/filter conversations |
46
+ | `ask` | Semantic search |
47
+ | `tag` | Apply tags to conversations |
48
+ | `peek` | View conversation contents |
49
+ | `doctor` | Check configuration |
50
+
51
+ ## Documentation
52
+
53
+ - [CLI Reference](docs/cli.md)
54
+
55
+ ## License
56
+
57
+ MIT
@@ -0,0 +1,122 @@
1
+ """Build an embeddings database from a strategy file.
2
+
3
+ Usage:
4
+ python bench/build.py --strategy bench/strategies/exchange-window.json
5
+ python bench/build.py --strategy bench/strategies/exchange-window.json --output /tmp/test.db
6
+ """
7
+
8
+ import argparse
9
+ import json
10
+ import sqlite3
11
+ import sys
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+
15
+ # bench/ is not a package — add src/ to path so siftd imports work
16
+ sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
17
+
18
+ from siftd.embeddings.fastembed_backend import FastEmbedBackend
19
+ from siftd.paths import data_dir
20
+ from siftd.storage.embeddings import open_embeddings_db, store_chunk, set_meta
21
+
22
+
23
def extract_chunks(main_conn: sqlite3.Connection, params: dict) -> list[dict]:
    """Run the exchange-window chunker against the main DB.

    Args:
        main_conn: Open connection to the main siftd database.
        params: Strategy parameters. Recognized keys and their fallbacks are
            ``target_tokens`` (256), ``max_tokens`` (512) and
            ``overlap_tokens`` (25).

    Returns:
        Whatever ``extract_exchange_window_chunks`` returns for the given
        connection, tokenizer and window settings.
    """
    # Imported lazily so the heavy embedding dependency is only loaded when
    # chunks are actually extracted.
    from fastembed import TextEmbedding
    from siftd.embeddings.chunker import extract_exchange_window_chunks

    # Resolve the window settings first, falling back to the defaults.
    window_kwargs = {
        "target_tokens": params.get("target_tokens", 256),
        "max_tokens": params.get("max_tokens", 512),
        "overlap_tokens": params.get("overlap_tokens", 25),
    }

    # The chunker is handed the tokenizer of the embedding model.
    tokenizer = TextEmbedding("BAAI/bge-small-en-v1.5").model.tokenizer

    return extract_exchange_window_chunks(main_conn, tokenizer, **window_kwargs)
42
+
43
+
44
def build(strategy_path: Path, output_path: Path, db_path: Path) -> None:
    """Build an embeddings DB from a strategy file.

    Reads the strategy JSON, extracts chunks from the main DB, embeds the
    chunk texts in batches of 64, and stores every chunk with its embedding
    in the embeddings DB at ``output_path``. The backend model name and
    dimension are recorded as ``backend`` / ``dimension`` metadata.

    Args:
        strategy_path: Path to the strategy JSON file; must contain a
            ``params`` object.
        output_path: Path of the embeddings DB to write.
        db_path: Path of the main siftd database to read chunks from.

    Raises:
        KeyError: If the strategy file has no ``params`` key.
        ValueError: If the backend returns a different number of embeddings
            than there are chunks.
    """
    strategy = json.loads(strategy_path.read_text())
    params = strategy["params"]

    # Extract chunks from main DB. sqlite3's own context manager only manages
    # transactions, so close explicitly, even when extraction fails.
    main_conn = sqlite3.connect(db_path)
    try:
        chunks = extract_chunks(main_conn, params)
    finally:
        main_conn.close()

    if not chunks:
        print("No chunks extracted. Check strategy params and main DB.")
        return

    print(f"Extracted {len(chunks)} chunks")

    # Embed in batches
    backend = FastEmbedBackend()
    batch_size = 64
    total_batches = (len(chunks) + batch_size - 1) // batch_size  # ceil division
    all_embeddings: list[list[float]] = []

    for batch_no, start in enumerate(range(0, len(chunks), batch_size), start=1):
        batch_texts = [c["text"] for c in chunks[start : start + batch_size]]
        all_embeddings.extend(backend.embed(batch_texts))
        print(f" Embedded batch {batch_no}/{total_batches}")

    # Store in embeddings DB; always release the connection, and only commit
    # once every chunk has been written.
    embed_conn = open_embeddings_db(output_path)
    try:
        set_meta(embed_conn, "backend", backend.model)
        set_meta(embed_conn, "dimension", str(backend.dimension))

        # strict=True: a chunk/embedding count mismatch must fail loudly
        # instead of silently dropping the unmatched tail.
        for chunk, embedding in zip(chunks, all_embeddings, strict=True):
            store_chunk(
                embed_conn,
                chunk["conversation_id"],
                chunk["chunk_type"],
                chunk["text"],
                embedding,
                token_count=chunk.get("token_count"),
            )

        embed_conn.commit()
    finally:
        embed_conn.close()

    print(f"Built {len(chunks)} chunks → {output_path}")
90
+
91
+
92
def main():
    """Parse command-line options and build the embeddings DB.

    Exits with status 1 when the strategy file or the main DB does not
    exist. When ``--output`` is not given, the output file is placed in the
    data directory and named after the strategy's ``name`` field (falling
    back to the strategy file stem) plus a timestamp.
    """
    cli = argparse.ArgumentParser(description="Build embeddings DB from a strategy file")
    cli.add_argument("--strategy", type=Path, required=True, help="Path to strategy JSON file")
    cli.add_argument("--output", type=Path, default=None, help="Output embeddings DB path")
    cli.add_argument("--db", type=Path, default=None, help="Path to main siftd.db")
    opts = cli.parse_args()

    strategy_file = opts.strategy
    if not strategy_file.exists():
        print(f"Strategy file not found: {strategy_file}")
        sys.exit(1)

    # Main DB: explicit --db wins, otherwise the default location.
    main_db = opts.db if opts.db else data_dir() / "siftd.db"
    if not main_db.exists():
        print(f"Main DB not found: {main_db}")
        sys.exit(1)

    # Output DB: explicit --output wins, otherwise derive a timestamped name.
    target = opts.output
    if not target:
        spec = json.loads(strategy_file.read_text())
        label = spec.get("name", strategy_file.stem)
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        target = data_dir() / f"embeddings_{label}_{stamp}.db"

    build(strategy_file, target, main_db)


if __name__ == "__main__":
    main()