code-data-ark 2.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. code_data_ark-2.0.2/.flake8 +19 -0
  2. code_data_ark-2.0.2/.github/workflows/ci.yml +63 -0
  3. code_data_ark-2.0.2/.gitignore +27 -0
  4. code_data_ark-2.0.2/PKG-INFO +495 -0
  5. code_data_ark-2.0.2/bin/release.py +87 -0
  6. code_data_ark-2.0.2/cda/__init__.py +3 -0
  7. code_data_ark-2.0.2/cda/kernel/__init__.py +0 -0
  8. code_data_ark-2.0.2/cda/kernel/control_db.py +151 -0
  9. code_data_ark-2.0.2/cda/kernel/pmf_kernel.py +364 -0
  10. code_data_ark-2.0.2/cda/kernel/selfcheck.py +299 -0
  11. code_data_ark-2.0.2/cda/pipeline/__init__.py +0 -0
  12. code_data_ark-2.0.2/cda/pipeline/embed.py +694 -0
  13. code_data_ark-2.0.2/cda/pipeline/extract.py +1064 -0
  14. code_data_ark-2.0.2/cda/pipeline/ingest.py +673 -0
  15. code_data_ark-2.0.2/cda/pipeline/parse_edits.py +250 -0
  16. code_data_ark-2.0.2/cda/pipeline/reconstruct.py +536 -0
  17. code_data_ark-2.0.2/cda/pipeline/watcher.py +783 -0
  18. code_data_ark-2.0.2/cda/ui/__init__.py +0 -0
  19. code_data_ark-2.0.2/cda/ui/cli.py +2587 -0
  20. code_data_ark-2.0.2/cda/ui/web.py +2848 -0
  21. code_data_ark-2.0.2/changelog.md +142 -0
  22. code_data_ark-2.0.2/contributing.md +297 -0
  23. code_data_ark-2.0.2/docs/architecture.md +503 -0
  24. code_data_ark-2.0.2/docs/examples/usage.md +418 -0
  25. code_data_ark-2.0.2/docs/pmf_kernel.md +98 -0
  26. code_data_ark-2.0.2/docs/roadmap.md +82 -0
  27. code_data_ark-2.0.2/license +21 -0
  28. code_data_ark-2.0.2/makefile +58 -0
  29. code_data_ark-2.0.2/pyproject.toml +104 -0
  30. code_data_ark-2.0.2/readme.md +452 -0
  31. code_data_ark-2.0.2/tests/test_basic.py +115 -0
  32. code_data_ark-2.0.2/tests/test_selfcheck.py +352 -0
  33. code_data_ark-2.0.2/version +1 -0
@@ -0,0 +1,19 @@
1
+ [flake8]
2
+ max-line-length = 150
3
+ extend-ignore =
4
+ # Intentional aligned assignments (project style)
5
+ E221,
6
+ E272,
7
+ # Continuation line indent (black manages this)
8
+ E128,
9
+ # Inline comment spacing (minor)
10
+ E261,
11
+ # Multiple statements on one line (minor)
12
+ E701,
13
+ # Missing whitespace around operator in annotations
14
+ E225,
15
+ # Line break before binary operator (black preference)
16
+ W503,
17
+ per-file-ignores =
18
+ # Conditional import at bottom of watcher.py main guard
19
+ cda/pipeline/watcher.py:E402
@@ -0,0 +1,63 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [ main, master ]
6
+ pull_request:
7
+ branches: [ main, master ]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.9", "3.10", "3.11", "3.12"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+ - name: Set up Python ${{ matrix.python-version }}
19
+ uses: actions/setup-python@v4
20
+ with:
21
+ python-version: ${{ matrix.python-version }}
22
+ - name: Install dependencies
23
+ run: |
24
+ python -m pip install --upgrade pip
25
+ pip install -e ".[dev]"
26
+ - name: Lint with flake8
27
+ run: |
28
+ flake8 cda tests
29
+ - name: Type check with mypy
30
+ run: |
31
+ mypy cda
32
+ - name: Test with pytest
33
+ run: |
34
+ pytest tests/ -v --cov=cda --cov-report=xml
35
+ - name: Upload coverage to Codecov
36
+ uses: codecov/codecov-action@v3
37
+ with:
38
+ file: coverage.xml
39
+ flags: unittests
40
+ name: codecov-umbrella
41
+ fail_ci_if_error: false
42
+
43
+ build:
44
+ runs-on: ubuntu-latest
45
+ needs: test
46
+
47
+ steps:
48
+ - uses: actions/checkout@v4
49
+ - name: Set up Python
50
+ uses: actions/setup-python@v4
51
+ with:
52
+ python-version: "3.11"
53
+ - name: Install build dependencies
54
+ run: |
55
+ python -m pip install --upgrade pip
56
+ pip install build
57
+ - name: Build package
58
+ run: python -m build
59
+ - name: Store build artifacts
60
+ uses: actions/upload-artifact@v4
61
+ with:
62
+ name: dist
63
+ path: dist/
@@ -0,0 +1,27 @@
1
+ # Build outputs
2
+ dist/
3
+ build/
4
+ *.egg-info/
5
+
6
+ # Python
7
+ __pycache__/
8
+ *.py[cod]
9
+ *.pyo
10
+
11
+ # Environment
12
+ .env
13
+ .venv/
14
+ venv/
15
+ env/
16
+
17
+ # Type checking
18
+ .mypy_cache/
19
+
20
+ # Test / coverage
21
+ .pytest_cache/
22
+ .coverage
23
+ coverage.xml
24
+ htmlcov/
25
+
26
+ # Editors
27
+ # macOS Finder metadata (OS artifact, not an editor file)
+ .DS_Store
@@ -0,0 +1,495 @@
1
+ Metadata-Version: 2.4
2
+ Name: code-data-ark
3
+ Version: 2.0.2
4
+ Summary: Code Data Ark — local observability and intelligence platform for VS Code + Copilot Chat sessions
5
+ Project-URL: Homepage, https://github.com/goCosmix/cda
6
+ Project-URL: Repository, https://github.com/goCosmix/cda.git
7
+ Project-URL: Issues, https://github.com/goCosmix/cda/issues
8
+ Project-URL: Documentation, https://github.com/goCosmix/cda#readme
9
+ Project-URL: Changelog, https://github.com/goCosmix/cda/blob/main/changelog.md
10
+ Author-email: Ernie Butcher <ernie@fiosii.com>
11
+ Maintainer-email: Ernie Butcher <ernie@fiosii.com>
12
+ License-Expression: MIT
13
+ License-File: license
14
+ Keywords: ai,analysis,behavioral,chat,conversation,copilot,heat-score,signals,vscode
15
+ Classifier: Development Status :: 3 - Alpha
16
+ Classifier: Intended Audience :: Developers
17
+ Classifier: Intended Audience :: Science/Research
18
+ Classifier: Operating System :: OS Independent
19
+ Classifier: Programming Language :: Python :: 3
20
+ Classifier: Programming Language :: Python :: 3.9
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Programming Language :: Python :: 3.11
23
+ Classifier: Programming Language :: Python :: 3.12
24
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
25
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
26
+ Classifier: Topic :: System :: Logging
27
+ Requires-Python: >=3.9
28
+ Requires-Dist: click>=8.0
29
+ Requires-Dist: numpy>=1.26
30
+ Requires-Dist: sentence-transformers>=2.2.2
31
+ Requires-Dist: watchfiles>=0.20
32
+ Provides-Extra: dev
33
+ Requires-Dist: black; extra == 'dev'
34
+ Requires-Dist: flake8; extra == 'dev'
35
+ Requires-Dist: isort; extra == 'dev'
36
+ Requires-Dist: mypy; extra == 'dev'
37
+ Requires-Dist: pytest-cov; extra == 'dev'
38
+ Requires-Dist: pytest>=7.0; extra == 'dev'
39
+ Provides-Extra: test
40
+ Requires-Dist: pytest-cov; extra == 'test'
41
+ Requires-Dist: pytest>=7.0; extra == 'test'
42
+ Description-Content-Type: text/markdown
43
+
44
+ # Code Data Ark
45
+
46
+ [![Python Version](https://img.shields.io/badge/python-3.9%2B-blue.svg)](https://www.python.org/downloads/)
47
+ [![PyPI](https://img.shields.io/pypi/v/code-data-ark.svg)](https://pypi.org/project/code-data-ark)
48
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
49
+
50
+ **Code Data Ark** (`cda`) is a local observability and intelligence platform for VS Code + GitHub Copilot Chat sessions. It ingests everything VS Code writes to disk — transcripts, tool calls, VFS blobs, workspace state — and runs a multi-stage pipeline to turn that raw activity into structured data you can actually reason about.
51
+
52
+ The core insight is that your chat history is not just logs. It carries behavioral signals: moments you corrected the agent, redirected it, expressed frustration, or confirmed that something finally worked. Ark extracts those signals, scores session quality with a heat model, and surfaces the patterns — so you can understand how you work with AI, not just what was said.
53
+
54
+ On top of that signal layer, Ark builds a semantic intelligence layer: embeddings over all your sessions, full-text and code-symbol search, anomaly alerts, session summaries, and related-session discovery. All of this lives in a local SQLite database, queryable via a 40+ command CLI or a background web dashboard.
55
+
56
+ The runtime is managed by an embedded process kernel (PMF) that supervises the watcher daemon, web UI, and pipeline tasks as background services — giving the whole system a lifecycle you can control without touching a process manager.
57
+
58
+ **In short**: point it at your VS Code data directory, run `cda sync`, and you have a searchable, annotated, semantically indexed record of every Copilot session you've ever had — with behavioral scores and anomaly detection included.
59
+
60
+ ## ✨ Key Capabilities
61
+
62
+ - **Multi-stage pipeline**: ingest → reconstruct → extract → embed — each stage enriches the data further
63
+ - **Behavioral signal detection**: 200+ keyword patterns across 6 signal types; frustration, correction, recovery
64
+ - **Heat scoring**: weighted session quality score (0–100) that tracks arc from friction to resolution
65
+ - **Semantic search**: miniLM embeddings over all sessions for similarity, related-session discovery, and topic clustering
66
+ - **Full-text search**: FTS5 index over all exchanges, tool calls, and code symbols
67
+ - **Live watcher daemon**: monitors VS Code directories, queues changes, replays on crash
68
+ - **Background web UI**: session drilldown, signal summaries, alert views, tool-call detail, VFS inspection
69
+ - **PMF Embedded Kernel**: local service lifecycle management — start, stop, restart, status for all Ark daemons
70
+ - **Export workflows**: JSON, JSONL, and plain-text session export
71
+
72
+ ## 📋 Table of Contents
73
+
74
+ - [Installation](#installation)
75
+ - [Quick Start](#quick-start)
76
+ - [Web UI](#web-ui)
77
+ - [CLI Reference](#cli-reference)
78
+ - [Architecture](#architecture)
79
+ - [Roadmap](#roadmap)
80
+ - [Configuration](#configuration)
81
+ - [Development](#development)
82
+ - [Contributing](#contributing)
83
+ - [License](#license)
84
+
85
+ ## 🚀 Installation
86
+
87
+ ### Prerequisites
88
+
89
+ - Python 3.9+
90
+ - VS Code with the Copilot Chat extension installed
91
+
92
+ ### Install from PyPI
93
+
94
+ ```bash
95
+ pip install code-data-ark
96
+ ```
97
+
98
+ ### Install with pipx
99
+
100
+ ```bash
101
+ pipx install code-data-ark
102
+ ```
103
+
104
+ ### Install from source
105
+
106
+ ```bash
107
+ git clone https://github.com/goCosmix/cda.git
108
+ cd cda/source
109
+ pip install -e .
110
+ ```
111
+
112
+ ### Install development dependencies
113
+
114
+ ```bash
115
+ pip install -e ".[dev]"
116
+ # or
117
+ make install-dev
118
+ ```
119
+
120
+ > The `cda` console command is installed into your active Python environment's `bin` directory. Activate your virtual environment before running `cda`.
121
+
122
+ ## ⚡ Quick Start
123
+
124
+ 1. **Initialize the database**
125
+
126
+ ```bash
127
+ cda sync
128
+ ```
129
+
130
+ 2. **Start the watcher daemon**
131
+
132
+ ```bash
133
+ cda watch start
134
+ ```
135
+
136
+ 3. **Inspect the PMF runtime services**
137
+
138
+ ```bash
139
+ cda pmf services
140
+ ```
141
+
142
+ 4. **Build semantic intelligence**
143
+
144
+ ```bash
145
+ cda embed build
146
+ ```
147
+
148
+ 5. **Start the web UI**
149
+
150
+ ```bash
151
+ cda ui start
152
+ ```
153
+
154
+ 6. **Open your browser**
155
+
156
+ Visit `http://127.0.0.1:10001`
157
+
158
+ ## 🌐 Web UI
159
+
160
+ - **Background service**: `cda ui start`
161
+ - **Stop service**: `cda ui stop`
162
+ - **Service status**: `cda ui status`
163
+ - **Foreground mode**: `cda serve`
164
+
165
+ The web UI includes:
166
+
167
+ - Session drilldown panels and charts
168
+ - Behavioral signal summaries
169
+ - Alert and recommendation views
170
+ - Searchable transcript and tool-call detail
171
+ - File/VFS browsing and raw session inspection
172
+
173
+ ## 🧠 Core Features
174
+
175
+ - Behavioral signals with 200+ keyword patterns across six categories
176
+ - Frustration heat scoring and recovery analytics
177
+ - Full-text search and semantic search with embeddings
178
+ - Code symbol indexing for Python/JS/TS
179
+ - Incremental ingestion with crash-resilient queue replay
180
+ - Export workflows for JSON, JSONL, and text
181
+
182
+ ## 📦 Package and Release
183
+
184
+ - Published on PyPI as `code-data-ark`
185
+ - Current release version: `2.0.2`
186
+ - CLI entry point: `cda`
187
+ - License: MIT
188
+
189
+ ## 🛣 Roadmap
190
+
191
+ See `docs/roadmap.md` for product direction, milestone planning, and release priorities.
192
+
193
+ ## 🤝 Contributing
194
+
195
+ See `contributing.md` for development setup, test guidance, and PR workflow.
196
+
197
+ ## 📜 License
198
+
199
+ This project is licensed under the MIT License.
200
+
201
+ ## 🧠 SQLite limits and mitigation
202
+
203
+ - **Single writer in WAL mode**: the system uses one writer process for ingest/reconstruct/extract/embed and allows many concurrent readers via SQLite WAL.
204
+ - **Large VFS blob handling**: for very large raw artifacts, the clean approach is chunked storage or external file references instead of a single enormous BLOB.
205
+ - **Default 8KB page size / cache**: this code now sets `PRAGMA cache_size=-2000`, `PRAGMA mmap_size=268435456`, and `PRAGMA temp_store=MEMORY` to improve read/cache performance on larger databases.
206
+ - **Further tuning**: rebuild the DB with a larger page size (e.g. `PRAGMA page_size=32768`) if you need more efficient storage for very large session history.
207
+
208
+ ## 🔧 Configuration
209
+
210
+ - **VS Code Data Directory**: By default, assumes macOS paths (`~/Library/Application Support/Code/User`). Override with `export VSCODE_DATA_DIR=/path/to/vscode/data` (e.g., on Linux: `~/.config/Code/User`).
211
+ - **No other config needed**: Everything is CLI-driven with local SQLite.
212
+
213
+ ## 🏗️ Architecture
214
+
215
+ ```
216
+ VS Code Storage → ingest.py → vfs + sessions + transcripts
217
+
218
+ reconstruct.py → exchanges (structured conversations)
219
+
220
+ extract.py → signals + tokens + heat scores + analysis
221
+
222
+ embed.py → semantic embeddings + summaries + alerts
223
+
224
+ watcher.py → live sync + FTS indexing + queue resilience
225
+
226
+ cda → query interface + policy enforcement
227
+ ```
228
+
229
+ ### Core Components
230
+
231
+ | Component | Purpose | Key Features |
232
+ |-----------|---------|--------------|
233
+ | **pipeline/ingest.py** | Data ingestion | VFS storage, gzip compression, session metadata |
234
+ | **pipeline/reconstruct.py** | Conversation processing | Exchange threading, tool call linking, FTS indexing |
235
+ | **pipeline/extract.py** | Signal analysis | Behavioral pattern recognition, heat scoring, token accounting |
236
+ | **pipeline/watcher.py** | Live monitoring | File watching, incremental updates, crash recovery |
237
+ | **pipeline/embed.py** | Semantic intelligence | Embeddings, session summaries, anomaly alerts |
238
+ | **kernel/pmf_kernel.py** | Service management | Daemon lifecycle, PID/log tracking, runtime state |
239
+ | **kernel/selfcheck.py** | System diagnostics | Health checks, install validation, DB integrity |
240
+ | **ui/cli.py** | CLI entry point | 40+ commands, policy filtering, rich formatting |
241
+ | **ui/web.py** | Web dashboard | Browser UI for all CLI features, service control |
242
+
243
+ ### Database Schema
244
+
245
+ - **workspaces** - VS Code workspace metadata
246
+ - **sessions** - Chat session information and metadata
247
+ - **vfs** - Gzip-compressed file storage with SHA256 hashes
248
+ - **exchanges** - Structured conversation turns with tool calls
249
+ - **exchange_signals** - Behavioral signal annotations
250
+ - **symbols** - Code symbol index (functions, classes, etc.)
251
+ - **token_usage** - Per-request token consumption tracking
252
+ - **compactions** - Context window summarization events
253
+ - **session_analysis** - Aggregated session metrics and heat scores
254
+
255
+ ## 🖥️ CLI Reference
256
+
257
+ ### Core Commands
258
+
259
+ ```bash
260
+ # System Management
261
+ cda status # Show daemon status and queue information
262
+ cda stats # System-wide statistics and coverage
263
+ cda sync # Full data ingestion and rebuild
264
+ cda reconstruct # Rebuild conversations and search index
265
+ cda pmf services # List embedded PMF runtime services
266
+ cda pmf status [service] # Show runtime status for PMF services
267
+ cda pmf start <service> # Start a PMF-managed Ark service
268
+ cda pmf stop <service> # Stop a PMF-managed Ark service
269
+ cda pmf restart <service> # Restart a PMF-managed Ark service
270
+ cda pmf logs <service> # Tail runtime logs for a PMF service
271
+
272
+ # Session Analysis
273
+ cda sessions # List all sessions (newest first)
274
+ cda session <id> # Show detailed session information
275
+ cda workspace <id> # Show sessions for a workspace
276
+ cda workspaces # List all workspaces
277
+
278
+ # Search & Query
279
+ cda search <query> # Full-text search across conversations
280
+ cda code-search <pattern> [--symbol] [--regex] # Search code symbols or code content
281
+ cda semantic-search <query> # Semantic search using embeddings
282
+ cda similar <session> # Find sessions similar to a session
283
+ cda related <session> # Alias for semantic related sessions
284
+ cda summarize <session> # Show session summary, topics, and recommendations
285
+ cda topics # List semantic topic tags
286
+ cda alerts <session> # Show semantic anomaly alerts
287
+ cda recommend <session> # Show session recommendations
288
+ cda tools <query> # Search tool call arguments
289
+ cda memory # Show memory files and global state
290
+
291
+ # Behavioral Analysis
292
+ cda signals [session] # Show behavioral signals
293
+ cda heat [session] # Frustration and heat analysis
294
+ cda behavior # Aggregate behavioral intelligence
295
+ cda saved # Sessions that recovered from high heat
296
+
297
+ # Data Export
298
+ cda export <session> # Export session as JSON/JSONL/text
299
+ cda replay <session> # Print conversation as readable text
300
+
301
+ # Advanced
302
+ cda query <sql> # Execute raw SQL queries
303
+ cda tokens [session] # Token usage analysis
304
+ cda compactions [session] # Context compaction events
305
+ cda edits # Edit session analytics
306
+
307
+ # Policy Management
308
+ cda policy allow <pattern> # Add allow pattern
309
+ cda policy deny <pattern> # Add deny pattern
310
+ cda policy list # Show current policies
311
+
312
+ # Live Monitoring
313
+ cda watch start # Start watcher daemon
314
+ cda watch stop # Stop watcher daemon
315
+ cda watch restart # Restart watcher daemon
316
+ cda ui start # Start web UI background service
317
+ cda ui stop # Stop web UI background service
318
+ cda ui status # Show web UI background service status
319
+ ```
320
+
321
+ ### Command Examples
322
+
323
+ ```bash
324
+ # Search for error handling discussions
325
+ cda search "error handling" --limit 20
326
+
327
+ # Find sessions with high frustration
328
+ cda heat --limit 10
329
+
330
+ # Search for specific functions in code
331
+ cda code-search "def process_data" --symbol
332
+
333
+ # Search code content with regex or plain text
334
+ cda code-search "timeout" --regex
335
+
336
+ # Find semantically related sessions
337
+ cda related abc123
338
+
339
+ # Summarize a session with semantic topics and recommendations
340
+ cda summarize abc123
341
+
342
+ # Export a session for external analysis
343
+ cda export abc123 --format jsonl --output session.jsonl
344
+
345
+ # Monitor live sessions
346
+ cda watch start
347
+ cda status # Check queue status
348
+ ```
349
+
350
+ ## 📊 Data Analysis
351
+
352
+ ### Behavioral Signals
353
+
354
+ The system recognizes 6 signal types with 200+ keyword patterns:
355
+
356
+ | Signal Type | Weight | Description | Example Keywords |
357
+ |-------------|--------|-------------|------------------|
358
+ | **correction** | 3 | User correcting agent behavior | "stop", "wrong", "nope", "wait" |
359
+ | **pre_correction** | 2 | Early frustration signs | "actually", "hold on", "slow down" |
360
+ | **redirect** | 1 | User changing direction | "pivot", "change direction", "instead" |
361
+ | **affirmation** | 0 | Positive feedback | "good", "right", "perfect", "thanks" |
362
+ | **approval** | 0 | Task completion approval | "that works", "looks good", "approved" |
363
+ | **frustration** | 5 | Strong negative signals | "this is broken", "not working", "terrible" |
364
+
365
+ ### Heat Score Algorithm
366
+
367
+ ```
368
+ Heat Score = min(100, Σ(signal_weights))
369
+ ```
370
+
371
+ - **Peak Heat**: Maximum heat reached in session
372
+ - **Final Heat**: Heat at session end
373
+ - **Recovery**: Sessions that return to low heat after high peaks
374
+ - **Saved Sessions**: High-heat sessions that recover with affirmations
375
+
376
+ ### Token Usage Tracking
377
+
378
+ - Per-request token consumption (prompt + completion)
379
+ - Model identification and version tracking
380
+ - Context compaction event logging
381
+ - Cost estimation capabilities
382
+
383
+ ## ⚙️ Configuration
384
+
385
+ ### Automatic Detection
386
+
387
+ Code Data Ark automatically detects paths using standard locations:
388
+
389
+ - **macOS**: `~/Library/Application Support/Code/User/`
390
+ - **Windows**: `%APPDATA%\Code\User\`
391
+ - **Linux**: `~/.config/Code/User/`
392
+
393
+ ### Environment Variables
394
+
395
+ ```bash
396
+ export CDA_DB=/path/to/custom.db # Custom database location
397
+ export CDA_CONFIG=/path/to/config # Custom config directory
398
+ ```
399
+
400
+ ### Policy Configuration
401
+
402
+ Data access policies are stored in `policy.txt`:
403
+
404
+ ```
405
+ ALLOW important-project
406
+ DENY sensitive-data
407
+ ALLOW *.py
408
+ ```
409
+
410
+ ## 🔧 Development
411
+
412
+ ### Setup Development Environment
413
+
414
+ ```bash
415
+ pip install -e ".[dev]"
416
+ ```
417
+
418
+ ### Running Tests
419
+
420
+ ```bash
421
+ pytest tests/ -q
422
+ ```
423
+
424
+ ### Code Quality
425
+
426
+ ```bash
427
+ flake8 cda tests
428
+ mypy cda
429
+ ```
430
+
431
+ ### Building
432
+
433
+ ```bash
434
+ python -m build
435
+ ```
436
+
437
+ ### Project Structure
438
+
439
+ ```
440
+ cda/
441
+ ├── .gitignore
442
+ ├── source/ # all tracked code (pushed to git)
443
+ │ ├── cda/
444
+ │ │ ├── pipeline/ # ingest, reconstruct, extract, embed, watcher, parse_edits
445
+ │ │ ├── ui/ # cli, web
446
+ │ │ └── kernel/ # pmf_kernel, selfcheck
447
+ │ ├── bin/release.py
448
+ │ ├── tests/
449
+ │ ├── docs/
450
+ │ └── pyproject.toml
451
+ ├── local/ # runtime state (gitignored, host-only)
452
+ │ ├── data/ # cda.db
453
+ │ ├── logs/
454
+ │ ├── queue/
455
+ │ ├── run/
456
+ │ ├── config/
457
+ │ └── pmf/
458
+ └── control/ # management artifacts (gitignored, host-only)
459
+ ├── data/ # control.db
460
+ ├── scripts/
461
+ ├── audit/
462
+ └── scan/
463
+ ```
464
+
465
+ ## 🤝 Contributing
466
+
467
+ 1. Fork the repository
468
+ 2. Create a feature branch: `git checkout -b feature/amazing-feature`
469
+ 3. Make your changes and add tests
470
+ 4. Run the test suite: `make test`
471
+ 5. Format code: `make format`
472
+ 6. Commit your changes: `git commit -m 'Add amazing feature'`
473
+ 7. Push to the branch: `git push origin feature/amazing-feature`
474
+ 8. Open a Pull Request
475
+
476
+ ### Development Guidelines
477
+
478
+ - **Tests**: Unit tests for all new functionality
479
+ - **Linting**: Code must pass `flake8` and `mypy` before pushing
480
+ - **Versioning**: Keep `version`, `pyproject.toml`, and `changelog.md` in sync
481
+
482
+ ## 📝 License
483
+
484
+ This project is licensed under the MIT License - see the [license](license) file for details.
485
+
486
+ ## 🙏 Acknowledgments
487
+
488
+ - Built for analyzing VS Code/Copilot Chat interaction patterns
489
+ - Inspired by the need for better human-AI interaction insights
490
+ - Uses SQLite FTS5 for high-performance full-text search
491
+ - Implements behavioral signal processing for conversation analysis
492
+
493
+ ---
494
+
495
+ **Code Data Ark** (`cda`) - Understanding the human side of AI conversations.
@@ -0,0 +1,87 @@
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import re
4
+ import subprocess
5
+ from pathlib import Path
6
+
7
+ ROOT = Path(__file__).resolve().parent.parent  # two levels above this script file; used as cwd for git/build/twine
8
+ VERSION_FILE = ROOT / "version"  # plain-text file read by read_version() and written by write_version()
9
+ PYPROJECT_FILE = ROOT / "pyproject.toml"  # rewritten by sync_version()
10
+ INIT_FILE = ROOT / "cda" / "__init__.py"  # holds __version__; rewritten by sync_version()
11
+
12
+ VERSION_PATTERN = r"\d+\.\d+\.\d+"  # three dot-separated digit runs; no pre-release or build suffix is accepted
13
+
14
+
15
def read_version():
    """Return the version string stored in the project's version file.

    Leading and trailing whitespace is stripped before the value is checked.

    Raises:
        SystemExit: if the stripped contents do not match ``VERSION_PATTERN`` in full.
    """
    version = VERSION_FILE.read_text().strip()
    if re.fullmatch(VERSION_PATTERN, version) is None:
        raise SystemExit(f"VERSION file must contain a semantic version, found: {version}")
    return version
20
+
21
+
22
def write_version(version: str):
    """Overwrite the version file with *version* followed by a single newline."""
    contents = "{}\n".format(version)
    VERSION_FILE.write_text(contents)
24
+
25
+
26
def replace_in_file(path: Path, pattern: str, replacement: str, multiline: bool = False):
    """Regex-substitute every match of *pattern* in the file at *path*, in place.

    Args:
        path: File to rewrite.
        pattern: Regular expression to search for.
        replacement: Replacement template handed to ``re.subn``, so group
            references in it are expanded.
        multiline: When true, match with ``re.MULTILINE`` so ``^`` and ``$``
            anchor at line boundaries instead of only at the ends of the file.

    Raises:
        SystemExit: if *pattern* matches nowhere. The file is not written in that case.
    """
    # Read and write as UTF-8 explicitly instead of relying on the platform
    # locale: the files this script edits (pyproject.toml, a Python module) are
    # UTF-8 by format definition and contain non-ASCII text, which a legacy
    # locale encoding could fail to decode or silently re-encode differently.
    text = path.read_text(encoding="utf-8")
    flags = re.MULTILINE if multiline else 0
    new_text, count = re.subn(pattern, replacement, text, flags=flags)
    if count == 0:
        raise SystemExit(f"Pattern not found in {path}: {pattern}")
    path.write_text(new_text, encoding="utf-8")
33
+
34
+
35
def sync_version(version: str):
    """Write *version* into ``pyproject.toml`` and the package ``__init__.py``.

    Each target is matched on a line that starts with its version assignment
    (``version = "X.Y.Z"`` and ``__version__ = "X.Y.Z"`` respectively); only the
    quoted number is replaced. ``replace_in_file`` raises ``SystemExit`` if a
    target has no such line, in which case later targets are not touched.
    """
    quoted_version_tail = VERSION_PATTERN + r'(")'
    substitution = rf'\g<1>{version}\g<2>'
    targets = (
        (PYPROJECT_FILE, r'^(version\s*=\s*")'),
        (INIT_FILE, r'^(\s*__version__\s*=\s*")'),
    )
    for target_path, assignment_prefix in targets:
        replace_in_file(target_path, assignment_prefix + quoted_version_tail, substitution, multiline=True)
38
+
39
+
40
def git_command(args, check=True):
    """Run ``git`` with the argument list *args*, using the project root as cwd.

    Args:
        args: List of arguments placed after ``git`` on the command line.
        check: Forwarded to ``subprocess.run``; when true a non-zero exit
            raises ``subprocess.CalledProcessError``.
    """
    command = ["git"] + args
    subprocess.run(command, check=check, cwd=ROOT)
42
+
43
+
44
def build_package():
    """Build the source and wheel distributions with ``-m build``, run from the project root.

    Raises:
        subprocess.CalledProcessError: if the build command exits non-zero.
    """
    import sys  # function-scope import keeps this block self-contained

    # Run the build under the interpreter executing this script. A bare
    # "python" may be absent from PATH (python3-only systems) or resolve to an
    # environment that does not have the `build` package installed.
    interpreter = sys.executable or "python"
    subprocess.run([interpreter, "-m", "build", "--sdist", "--wheel"], cwd=ROOT, check=True)
46
+
47
+
48
def publish_package():
    """Upload the built distributions with ``-m twine upload``, run from the project root.

    Raises:
        subprocess.CalledProcessError: if the upload command exits non-zero.
    """
    import sys  # function-scope import keeps this block self-contained

    # Run twine under the interpreter executing this script. A bare "python"
    # may be absent from PATH (python3-only systems) or resolve to an
    # environment that does not have twine installed.
    interpreter = sys.executable or "python"
    # No shell is involved, so "dist/*" reaches twine as a literal argument.
    # NOTE(review): this relies on twine expanding the glob itself — confirm.
    subprocess.run([interpreter, "-m", "twine", "upload", "dist/*"], cwd=ROOT, check=True)
50
+
51
+
52
def main():
    """Parse the command line and run the requested release steps.

    Steps run in a fixed order regardless of flag order: set/sync version,
    tag, build, publish, push. With no ``--set-version`` the version is read
    from the version file.

    Raises:
        SystemExit: if ``--set-version`` (or the version file) is not a
            ``MAJOR.MINOR.PATCH`` version, or a version source lacks the
            expected assignment line.
        subprocess.CalledProcessError: if a git, build, or upload command fails.
    """
    parser = argparse.ArgumentParser(description="Release management for code-data-ark")
    parser.add_argument("--set-version", help="Set a new version and update all version sources")
    parser.add_argument("--sync", action="store_true", help="Sync version sources from VERSION file")
    parser.add_argument("--tag", action="store_true", help="Create a git tag for the current version")
    parser.add_argument("--push", action="store_true", help="Push current branch and tags to origin")
    parser.add_argument("--build", action="store_true", help="Build source and wheel distributions")
    parser.add_argument("--publish", action="store_true", help="Publish built distributions to PyPI")
    args = parser.parse_args()

    if args.set_version:
        version = args.set_version
        # Apply the same check read_version() enforces on the file. Without it
        # an arbitrary string would be written to the version file, used as a
        # regex replacement template in sync_version(), and used as a tag name.
        if not re.fullmatch(VERSION_PATTERN, version):
            raise SystemExit(f"--set-version must be a semantic version (MAJOR.MINOR.PATCH), found: {version}")
        write_version(version)
    else:
        version = read_version()

    if args.sync or args.set_version:
        sync_version(version)

    if args.tag:
        git_command(["tag", "-a", f"v{version}", "-m", f"Release v{version}"])

    if args.build:
        build_package()

    if args.publish:
        publish_package()

    if args.push:
        git_command(["push", "origin", "HEAD"])
        # Tags are only pushed when this invocation also created one.
        if args.tag:
            git_command(["push", "origin", "--tags"])

    print(f"Release process completed for version {version}.")
84
+
85
+
86
+ if __name__ == "__main__":  # run the release CLI only when executed as a script, not on import
87
+ main()
@@ -0,0 +1,3 @@
1
+ """Code Data Ark — local observability and intelligence platform for VS Code + Copilot Chat sessions."""
2
+
3
+ __version__ = "2.0.2"
File without changes