deco-assaying 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. deco_assaying-0.1.1/.github/workflows/release.yml +74 -0
  2. deco_assaying-0.1.1/.gitignore +12 -0
  3. deco_assaying-0.1.1/.python-version +1 -0
  4. deco_assaying-0.1.1/Dockerfile +32 -0
  5. deco_assaying-0.1.1/PKG-INFO +234 -0
  6. deco_assaying-0.1.1/README.md +219 -0
  7. deco_assaying-0.1.1/docker-compose.yml +22 -0
  8. deco_assaying-0.1.1/docs/deco-assaying-architecture.md +218 -0
  9. deco_assaying-0.1.1/humans_notes.md +24 -0
  10. deco_assaying-0.1.1/pyproject.toml +60 -0
  11. deco_assaying-0.1.1/src/deco_assaying/__init__.py +6 -0
  12. deco_assaying-0.1.1/src/deco_assaying/__main__.py +16 -0
  13. deco_assaying-0.1.1/src/deco_assaying/analyze.py +199 -0
  14. deco_assaying-0.1.1/src/deco_assaying/analyzers/__init__.py +56 -0
  15. deco_assaying-0.1.1/src/deco_assaying/analyzers/_base.py +55 -0
  16. deco_assaying-0.1.1/src/deco_assaying/analyzers/_fallback.py +20 -0
  17. deco_assaying-0.1.1/src/deco_assaying/analyzers/_ts_js.py +570 -0
  18. deco_assaying-0.1.1/src/deco_assaying/analyzers/bash.py +140 -0
  19. deco_assaying-0.1.1/src/deco_assaying/analyzers/c.py +5 -0
  20. deco_assaying-0.1.1/src/deco_assaying/analyzers/c_family.py +289 -0
  21. deco_assaying-0.1.1/src/deco_assaying/analyzers/cpp.py +5 -0
  22. deco_assaying-0.1.1/src/deco_assaying/analyzers/csharp.py +256 -0
  23. deco_assaying-0.1.1/src/deco_assaying/analyzers/go.py +405 -0
  24. deco_assaying-0.1.1/src/deco_assaying/analyzers/java.py +253 -0
  25. deco_assaying-0.1.1/src/deco_assaying/analyzers/javascript.py +5 -0
  26. deco_assaying-0.1.1/src/deco_assaying/analyzers/php.py +241 -0
  27. deco_assaying-0.1.1/src/deco_assaying/analyzers/python.py +577 -0
  28. deco_assaying-0.1.1/src/deco_assaying/analyzers/ruby.py +223 -0
  29. deco_assaying-0.1.1/src/deco_assaying/analyzers/rust.py +528 -0
  30. deco_assaying-0.1.1/src/deco_assaying/analyzers/typescript.py +5 -0
  31. deco_assaying-0.1.1/src/deco_assaying/app.py +41 -0
  32. deco_assaying-0.1.1/src/deco_assaying/chunks.py +126 -0
  33. deco_assaying-0.1.1/src/deco_assaying/config.py +59 -0
  34. deco_assaying-0.1.1/src/deco_assaying/detectors.py +148 -0
  35. deco_assaying-0.1.1/src/deco_assaying/github.py +172 -0
  36. deco_assaying-0.1.1/src/deco_assaying/gitlab.py +294 -0
  37. deco_assaying-0.1.1/src/deco_assaying/jobs.py +1023 -0
  38. deco_assaying-0.1.1/src/deco_assaying/languages.py +198 -0
  39. deco_assaying-0.1.1/src/deco_assaying/literals.py +106 -0
  40. deco_assaying-0.1.1/src/deco_assaying/manifest.py +190 -0
  41. deco_assaying-0.1.1/src/deco_assaying/outputs.py +225 -0
  42. deco_assaying-0.1.1/src/deco_assaying/providers.py +38 -0
  43. deco_assaying-0.1.1/src/deco_assaying/retention.py +92 -0
  44. deco_assaying-0.1.1/src/deco_assaying/routes.py +604 -0
  45. deco_assaying-0.1.1/src/deco_assaying/source.py +181 -0
  46. deco_assaying-0.1.1/src/deco_assaying/walker.py +407 -0
  47. deco_assaying-0.1.1/tests/__init__.py +0 -0
  48. deco_assaying-0.1.1/tests/test_analyze_fallback.py +29 -0
  49. deco_assaying-0.1.1/tests/test_analyze_other_langs.py +324 -0
  50. deco_assaying-0.1.1/tests/test_analyze_python.py +136 -0
  51. deco_assaying-0.1.1/tests/test_chunks.py +35 -0
  52. deco_assaying-0.1.1/tests/test_detectors.py +37 -0
  53. deco_assaying-0.1.1/tests/test_github.py +112 -0
  54. deco_assaying-0.1.1/tests/test_gitlab.py +274 -0
  55. deco_assaying-0.1.1/tests/test_http_routes.py +445 -0
  56. deco_assaying-0.1.1/tests/test_index_repo.py +134 -0
  57. deco_assaying-0.1.1/tests/test_languages.py +34 -0
  58. deco_assaying-0.1.1/tests/test_literals.py +55 -0
  59. deco_assaying-0.1.1/tests/test_outputs_http.py +288 -0
  60. deco_assaying-0.1.1/tests/test_retention.py +141 -0
  61. deco_assaying-0.1.1/tests/test_source.py +101 -0
  62. deco_assaying-0.1.1/tests/test_streaming.py +245 -0
  63. deco_assaying-0.1.1/tests/test_walker.py +66 -0
  64. deco_assaying-0.1.1/uv.lock +972 -0
@@ -0,0 +1,74 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags: ['v*']
6
+
7
+ permissions:
8
+ contents: read
9
+ packages: write # GHCR push
10
+ id-token: write # PyPI trusted publishing
11
+
12
+ jobs:
13
+ docker:
14
+ runs-on: ubuntu-latest
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - name: Set up QEMU (arm64 emulation)
19
+ uses: docker/setup-qemu-action@v3
20
+
21
+ - name: Set up Docker Buildx
22
+ uses: docker/setup-buildx-action@v3
23
+
24
+ - name: Log in to GHCR
25
+ uses: docker/login-action@v3
26
+ with:
27
+ registry: ghcr.io
28
+ username: ${{ github.actor }}
29
+ password: ${{ secrets.GITHUB_TOKEN }}
30
+
31
+ - name: Extract metadata (tags + labels)
32
+ id: meta
33
+ uses: docker/metadata-action@v5
34
+ with:
35
+ images: ghcr.io/${{ github.repository }}
36
+ tags: |
37
+ type=semver,pattern={{version}}
38
+ type=semver,pattern={{major}}.{{minor}}
39
+ type=raw,value=latest
40
+
41
+ - name: Build and push (linux/amd64 + linux/arm64)
42
+ uses: docker/build-push-action@v5
43
+ with:
44
+ context: .
45
+ platforms: linux/amd64,linux/arm64
46
+ push: true
47
+ tags: ${{ steps.meta.outputs.tags }}
48
+ labels: ${{ steps.meta.outputs.labels }}
49
+ cache-from: type=gha
50
+ cache-to: type=gha,mode=max
51
+
52
+ pypi:
53
+ runs-on: ubuntu-latest
54
+ environment: pypi # matches the trusted-publisher config on PyPI
55
+ steps:
56
+ - uses: actions/checkout@v4
57
+
58
+ - name: Install uv
59
+ uses: astral-sh/setup-uv@v3
60
+
61
+ - name: Verify tag matches pyproject version
62
+ run: |
63
+ TAG="${GITHUB_REF#refs/tags/v}"
64
+ PROJECT_VERSION=$(uv run python -c "import tomllib; print(tomllib.load(open('pyproject.toml','rb'))['project']['version'])")
65
+ if [ "$TAG" != "$PROJECT_VERSION" ]; then
66
+ echo "tag $TAG != pyproject $PROJECT_VERSION"
67
+ exit 1
68
+ fi
69
+
70
+ - name: Build wheel + sdist
71
+ run: uv build
72
+
73
+ - name: Publish to PyPI
74
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,12 @@
1
+ __pycache__/
2
+ *.pyc
3
+ .venv/
4
+ .pytest_cache/
5
+ .ruff_cache/
6
+ .ty_cache/
7
+ *.egg-info/
8
+ dist/
9
+ build/
10
+ .DS_Store
11
+ .claude/
12
+ output/
@@ -0,0 +1 @@
1
+ 3.13
@@ -0,0 +1,32 @@
1
+ FROM ghcr.io/astral-sh/uv:python3.13-bookworm-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # git is needed at runtime for `git clone` of GitHub/GitLab sources.
6
+ # (uv image is debian-slim; git isn't installed by default.)
7
+ RUN apt-get update \
8
+ && apt-get install -y --no-install-recommends git ca-certificates \
9
+ && rm -rf /var/lib/apt/lists/*
10
+
11
+ # Resolve deps first so they cache across source changes.
12
+ COPY pyproject.toml uv.lock ./
13
+ RUN --mount=type=cache,target=/root/.cache/uv \
14
+ uv sync --no-dev --no-install-project
15
+
16
+ # README.md is part of the package metadata (pyproject.toml -> readme).
17
+ # uv sync --no-install-project skipped reading it; the second sync
18
+ # (which installs the project itself) does, so it must be present.
19
+ COPY README.md ./
20
+ COPY src/ src/
21
+ RUN --mount=type=cache,target=/root/.cache/uv \
22
+ uv sync --no-dev
23
+
24
+ ENV PYTHONUNBUFFERED=1 \
25
+ OUTPUT_ROOT=/data \
26
+ PORT=35832 \
27
+ HOST=0.0.0.0
28
+
29
+ EXPOSE 35832
30
+ VOLUME ["/data"]
31
+
32
+ CMD ["uv", "run", "python", "-m", "deco_assaying"]
@@ -0,0 +1,234 @@
1
+ Metadata-Version: 2.4
2
+ Name: deco-assaying
3
+ Version: 0.1.1
4
+ Summary: MCP server that performs tree-sitter-based source code analysis.
5
+ Requires-Python: >=3.13
6
+ Requires-Dist: fastapi
7
+ Requires-Dist: mcp[cli]>=1.27.0
8
+ Requires-Dist: pathspec
9
+ Requires-Dist: pydantic
10
+ Requires-Dist: starlette
11
+ Requires-Dist: tree-sitter
12
+ Requires-Dist: tree-sitter-language-pack<1.6
13
+ Requires-Dist: uvicorn[standard]
14
+ Description-Content-Type: text/markdown
15
+
16
+ # deco-assaying
17
+
18
+ MCP server that performs tree-sitter-based source code analysis. Designed
19
+ to feed structural information about a repo (symbols, imports, references,
20
+ chunks, metrics) into a downstream consumer that maintains a knowledge
21
+ base over many codebases.
22
+
23
+ ## Run
24
+
25
+ Pick the deployment that matches your situation:
26
+
27
+ | Mode | Command | When to use |
28
+ |---|---|---|
29
+ | Daemon — pinned install | [`uv tool install`](#1-daemon--uv-tool-install-pypi) | You'll run it across many sessions; want it on `$PATH`. |
30
+ | Daemon — ephemeral | [`uvx`](#2-daemon--uvx-no-install) | One-off run; don't want anything left on disk. |
31
+ | Container | [`docker run` from GHCR](#3-docker--ghcr) | Ops deployment, compose stack, or want filesystem isolation. |
32
+ | From source | [`uv run`](#4-from-source) | Hacking on the server itself. |
33
+
34
+ ### Prereqs
35
+
36
+ - **uv-based modes** need [`uv`](https://docs.astral.sh/uv/) and `git`.
37
+ uv ships a portable Python 3.13, so no system Python install required.
38
+
39
+ ```bash
40
+ curl -LsSf https://astral.sh/uv/install.sh | sh
41
+ ```
42
+
43
+ - **Docker mode** needs `docker` (or compatible). The image bundles
44
+ Python 3.13 and git; nothing else on the host.
45
+
46
+ ### 1. Daemon — `uv tool install` (PyPI)
47
+
48
+ Installs the `deco-assaying` command on your `$PATH`, isolated in its
49
+ own venv that uv manages.
50
+
51
+ ```bash
52
+ uv tool install deco-assaying
53
+ deco-assaying # starts the server
54
+ ```
55
+
56
+ Update later with `uv tool upgrade deco-assaying`; remove with
57
+ `uv tool uninstall deco-assaying`.
58
+
59
+ ### 2. Daemon — `uvx` (no install)
60
+
61
+ `uvx` resolves the package into a temporary venv and runs the entry
62
+ point in one shot. Nothing persists between runs.
63
+
64
+ ```bash
65
+ uvx deco-assaying # latest release
66
+ uvx deco-assaying@0.1.0 # pin a specific version
67
+ ```
68
+
69
+ Good for kicking the tires or running on a CI box where you don't
70
+ want to touch `~/.local/share/uv`.
71
+
72
+ ### 3. Docker / GHCR
73
+
74
+ Pull and run the published multi-arch image (linux/amd64 +
75
+ linux/arm64):
76
+
77
+ ```bash
78
+ docker pull ghcr.io/garycoding/deco-assaying:latest
79
+ docker run --rm \
80
+ -p 35832:35832 \
81
+ -v deco-assaying-data:/data \
82
+ ghcr.io/garycoding/deco-assaying:latest
83
+ ```
84
+
85
+ Pin a specific version with a tag — `:0.1.0`, `:0.1`, or `:latest`
86
+ (see the [Releases](https://github.com/garycoding/deco-assaying/pkgs/container/deco-assaying)
87
+ page on GHCR for the available tags).
88
+
89
+ Or with compose (see [docker-compose.yml](docker-compose.yml) — pulls
90
+ the image, mounts a named volume at `/data`, restarts unless stopped):
91
+
92
+ ```bash
93
+ docker compose up -d
94
+ ```
95
+
96
+ The named volume `deco-assaying-data` persists job outputs across
97
+ container restarts. To pass auth tokens for private repos:
98
+
99
+ ```bash
100
+ docker run --rm \
101
+ -e GITHUB_TOKEN=ghp_... \
102
+ -e GITLAB_TOKEN=glpat-... \
103
+ -p 35832:35832 \
104
+ -v deco-assaying-data:/data \
105
+ ghcr.io/garycoding/deco-assaying:latest
106
+ ```
107
+
108
+ ### 4. From source
109
+
110
+ ```bash
111
+ git clone https://github.com/garycoding/deco-assaying.git
112
+ cd deco-assaying
113
+ uv sync
114
+ uv run python -m deco_assaying
115
+ ```
116
+
117
+ ### Endpoints
118
+
119
+ In every mode the server listens on `PORT` (default `35832`) with:
120
+
121
+ - `POST /sse` — MCP Streamable HTTP transport.
122
+ - `GET /health` — liveness probe.
123
+ - `GET /admin/*` — read-only JSON ops endpoints.
124
+ - `GET /outputs/{job_id}/...` — read-only download API for job artifacts.
125
+ - `GET /docs` — OpenAPI / Swagger UI for the HTTP API.
126
+
127
+ Sanity-check it's up:
128
+
129
+ ```bash
130
+ curl http://127.0.0.1:35832/health
131
+ ```
132
+
133
+ ## MCP tools
134
+
135
+ - `analyze_file(content, filename?, language?, options?)` — parse a single
136
+ file passed inline; returns structural JSON.
137
+ - `index_repo(source, options?)` — start a job that indexes a whole repo
138
+ and writes per-file artifacts plus a manifest. The server allocates a
139
+ fresh output dir under `OUTPUT_ROOT` and returns `{ job_id, output_path }`.
140
+ `source` can be a local directory, a GitHub URL
141
+ (`https://github.com/owner/repo`), or a GitLab URL
142
+ (`https://gitlab.com/owner/repo`, including nested groups
143
+ `https://gitlab.com/group/sub/repo`). Pass `git_ref` to pick a specific
144
+ branch / tag / sha.
145
+ - `get_job_status(job_id)` — poll a running or completed job.
146
+ - `cancel_job(job_id)` — cooperative cancel.
147
+ - `list_supported_languages()` — capability discovery.
148
+ - `detect_language(path)` — extension/shebang detection helper.
149
+
150
+ ## Output download API
151
+
152
+ Every job's artifacts land under `OUTPUT_ROOT/{job_id}/`. A consumer
153
+ sharing the volume can read them off disk; one without a shared volume
154
+ can pull them over HTTP:
155
+
156
+ | Endpoint | Returns |
157
+ |---|---|
158
+ | `GET /outputs/{job_id}` | `manifest.json` (convenience). |
159
+ | `GET /outputs/{job_id}/manifest.json` | Repo-level rollup. |
160
+ | `GET /outputs/{job_id}/tree.json` | Full path inventory (analyzed + skipped). |
161
+ | `GET /outputs/{job_id}/symbols.json` | Global qualified-name index. |
162
+ | `GET /outputs/{job_id}/languages.json` | Per-language counts. |
163
+ | `GET /outputs/{job_id}/errors.json` | Parse errors + skipped files. |
164
+ | `GET /outputs/{job_id}/log.jsonl?from_offset=N` | Tail the job's log. |
165
+ | `GET /outputs/{job_id}/ls?path=&recursive=` | Directory listing. |
166
+ | `GET /outputs/{job_id}/file/{path}` | Single file, **or** a streaming ZIP if any path segment contains `*?[`. E.g. `/file/files/**/*.py.json`. |
167
+ | `GET /outputs/{job_id}/zip?path=&match=` | Explicit-bulk-zip alias. Default = whole job dir. |
168
+ | `DELETE /outputs/{job_id}` | Remove the dir + drop the table entry. 409 if still running. |
169
+ | `GET /admin/outputs` | List every job_id present on disk under `OUTPUT_ROOT`. |
170
+
171
+ Path traversal (`..`, absolute paths, escape via symlink) is rejected.
172
+
173
+ ## Resource requirements
174
+
175
+ When `index_repo` runs against a GitHub or GitLab URL, the server uses a partial
176
+ clone with bin-packed batched fetching. That gives a small, predictable
177
+ disk footprint regardless of how large the source repo is:
178
+
179
+ - **Source-side scratch space: ~100 MB peak** in `output_path/.source/`
180
+ during analysis. The server fetches each batch of source files
181
+ (totaling ≤ `max_partial_clone_bytes`, default 100 MB), analyzes
182
+ them, deletes them from the working tree, then fetches the next
183
+ batch. Even on a multi-GB monorepo, peak local-disk used for source
184
+ content stays at ~100 MB. Tunable via the `max_partial_clone_bytes`
185
+ option on `index_repo`.
186
+
187
+ - **Output artifacts: roughly 1-2× the analyzed-source size.** Each
188
+ analyzed file produces a JSON artifact under `output_path/files/`
189
+ containing symbols, imports, references, chunks, etc. These persist
190
+ past the job — the consumer reads them incrementally — and are
191
+ the largest *durable* footprint. The retention sweeper auto-purges
192
+ job dirs older than `OUTPUT_EXPIRY_DAYS`.
193
+
194
+ - **Memory: modest.** A `ProcessPoolExecutor` runs roughly
195
+ `2 × CPU count` workers, each holding one file's bytes plus its
196
+ tree-sitter parse tree in memory. Source files are capped at
197
+ `max_file_bytes` (default 2 MB), so worst case is ~16-32 MB of
198
+ resident source + parse trees on a typical 8-core box.
199
+
200
+ - **Network:** one provider-API pre-flight to plan the batches (GitHub
201
+ Trees REST or GitLab REST tree + GraphQL; free for public repos, set
202
+ `GITHUB_TOKEN` / `GITLAB_TOKEN` for higher quotas and private-repo
203
+ access), plus one `git fetch-pack` round-trip per batch. For a
204
+ typical sub-100 MB repo that's two HTTP hits total.
205
+
206
+ For local-path sources nothing is fetched and nothing is cloned —
207
+ the only on-disk cost is the output artifacts.
208
+
209
+ ## Configuration
210
+
211
+ | Env var | Default (daemon) | Default (container) | Purpose |
212
+ |---|---|---|---|
213
+ | `PORT` | `35832` | `35832` | HTTP listen port. |
214
+ | `HOST` | `0.0.0.0` | `0.0.0.0` | HTTP bind address. |
215
+ | `OUTPUT_ROOT` | `./output` | `/data` | Where the server writes job dirs. |
216
+ | `OUTPUT_EXPIRY_DAYS` | `7` | `7` | Auto-purge job dirs older than this. `0` disables. |
217
+ | `JOB_HISTORY_MAX` | `100` | `100` | In-memory job-table cap. |
218
+ | `DEFAULT_MAX_FILE_BYTES` | `2097152` | `2097152` | Default per-file size cap. |
219
+ | `DEFAULT_CHUNK_MAX_TOKENS` | `800` | `800` | Default chunk size for cAST chunking. |
220
+ | `GITHUB_TOKEN` | unset | unset | Optional, raises GitHub Trees API quota from 60 to 5000 req/hr and unlocks private repos. |
221
+ | `GITLAB_TOKEN` | unset | unset | Optional, used for GitLab API auth and private-repo access. |
222
+
223
+ ## Releasing
224
+
225
+ Tag-driven. Bump `version` in `pyproject.toml`, then:
226
+
227
+ ```bash
228
+ git tag vX.Y.Z && git push --tags
229
+ ```
230
+
231
+ The `Release` workflow builds a multi-arch image (linux/amd64 +
232
+ linux/arm64) and pushes it to GHCR with `X.Y.Z`, `X.Y`, and `latest`
233
+ tags, in parallel with publishing wheel + sdist to PyPI via trusted
234
+ publishing. ~3-5 minutes end-to-end.
@@ -0,0 +1,219 @@
1
+ # deco-assaying
2
+
3
+ MCP server that performs tree-sitter-based source code analysis. Designed
4
+ to feed structural information about a repo (symbols, imports, references,
5
+ chunks, metrics) into a downstream consumer that maintains a knowledge
6
+ base over many codebases.
7
+
8
+ ## Run
9
+
10
+ Pick the deployment that matches your situation:
11
+
12
+ | Mode | Command | When to use |
13
+ |---|---|---|
14
+ | Daemon — pinned install | [`uv tool install`](#1-daemon--uv-tool-install-pypi) | You'll run it across many sessions; want it on `$PATH`. |
15
+ | Daemon — ephemeral | [`uvx`](#2-daemon--uvx-no-install) | One-off run; don't want anything left on disk. |
16
+ | Container | [`docker run` from GHCR](#3-docker--ghcr) | Ops deployment, compose stack, or want filesystem isolation. |
17
+ | From source | [`uv run`](#4-from-source) | Hacking on the server itself. |
18
+
19
+ ### Prereqs
20
+
21
+ - **uv-based modes** need [`uv`](https://docs.astral.sh/uv/) and `git`.
22
+ uv ships a portable Python 3.13, so no system Python install required.
23
+
24
+ ```bash
25
+ curl -LsSf https://astral.sh/uv/install.sh | sh
26
+ ```
27
+
28
+ - **Docker mode** needs `docker` (or compatible). The image bundles
29
+ Python 3.13 and git; nothing else on the host.
30
+
31
+ ### 1. Daemon — `uv tool install` (PyPI)
32
+
33
+ Installs the `deco-assaying` command on your `$PATH`, isolated in its
34
+ own venv that uv manages.
35
+
36
+ ```bash
37
+ uv tool install deco-assaying
38
+ deco-assaying # starts the server
39
+ ```
40
+
41
+ Update later with `uv tool upgrade deco-assaying`; remove with
42
+ `uv tool uninstall deco-assaying`.
43
+
44
+ ### 2. Daemon — `uvx` (no install)
45
+
46
+ `uvx` resolves the package into a temporary venv and runs the entry
47
+ point in one shot. Nothing persists between runs.
48
+
49
+ ```bash
50
+ uvx deco-assaying # latest release
51
+ uvx deco-assaying@0.1.0 # pin a specific version
52
+ ```
53
+
54
+ Good for kicking the tires or running on a CI box where you don't
55
+ want to touch `~/.local/share/uv`.
56
+
57
+ ### 3. Docker / GHCR
58
+
59
+ Pull and run the published multi-arch image (linux/amd64 +
60
+ linux/arm64):
61
+
62
+ ```bash
63
+ docker pull ghcr.io/garycoding/deco-assaying:latest
64
+ docker run --rm \
65
+ -p 35832:35832 \
66
+ -v deco-assaying-data:/data \
67
+ ghcr.io/garycoding/deco-assaying:latest
68
+ ```
69
+
70
+ Pin a specific version with a tag — `:0.1.0`, `:0.1`, or `:latest`
71
+ (see the [Releases](https://github.com/garycoding/deco-assaying/pkgs/container/deco-assaying)
72
+ page on GHCR for the available tags).
73
+
74
+ Or with compose (see [docker-compose.yml](docker-compose.yml) — pulls
75
+ the image, mounts a named volume at `/data`, restarts unless stopped):
76
+
77
+ ```bash
78
+ docker compose up -d
79
+ ```
80
+
81
+ The named volume `deco-assaying-data` persists job outputs across
82
+ container restarts. To pass auth tokens for private repos:
83
+
84
+ ```bash
85
+ docker run --rm \
86
+ -e GITHUB_TOKEN=ghp_... \
87
+ -e GITLAB_TOKEN=glpat-... \
88
+ -p 35832:35832 \
89
+ -v deco-assaying-data:/data \
90
+ ghcr.io/garycoding/deco-assaying:latest
91
+ ```
92
+
93
+ ### 4. From source
94
+
95
+ ```bash
96
+ git clone https://github.com/garycoding/deco-assaying.git
97
+ cd deco-assaying
98
+ uv sync
99
+ uv run python -m deco_assaying
100
+ ```
101
+
102
+ ### Endpoints
103
+
104
+ In every mode the server listens on `PORT` (default `35832`) with:
105
+
106
+ - `POST /sse` — MCP Streamable HTTP transport.
107
+ - `GET /health` — liveness probe.
108
+ - `GET /admin/*` — read-only JSON ops endpoints.
109
+ - `GET /outputs/{job_id}/...` — read-only download API for job artifacts.
110
+ - `GET /docs` — OpenAPI / Swagger UI for the HTTP API.
111
+
112
+ Sanity-check it's up:
113
+
114
+ ```bash
115
+ curl http://127.0.0.1:35832/health
116
+ ```
117
+
118
+ ## MCP tools
119
+
120
+ - `analyze_file(content, filename?, language?, options?)` — parse a single
121
+ file passed inline; returns structural JSON.
122
+ - `index_repo(source, options?)` — start a job that indexes a whole repo
123
+ and writes per-file artifacts plus a manifest. The server allocates a
124
+ fresh output dir under `OUTPUT_ROOT` and returns `{ job_id, output_path }`.
125
+ `source` can be a local directory, a GitHub URL
126
+ (`https://github.com/owner/repo`), or a GitLab URL
127
+ (`https://gitlab.com/owner/repo`, including nested groups
128
+ `https://gitlab.com/group/sub/repo`). Pass `git_ref` to pick a specific
129
+ branch / tag / sha.
130
+ - `get_job_status(job_id)` — poll a running or completed job.
131
+ - `cancel_job(job_id)` — cooperative cancel.
132
+ - `list_supported_languages()` — capability discovery.
133
+ - `detect_language(path)` — extension/shebang detection helper.
134
+
135
+ ## Output download API
136
+
137
+ Every job's artifacts land under `OUTPUT_ROOT/{job_id}/`. A consumer
138
+ sharing the volume can read them off disk; one without a shared volume
139
+ can pull them over HTTP:
140
+
141
+ | Endpoint | Returns |
142
+ |---|---|
143
+ | `GET /outputs/{job_id}` | `manifest.json` (convenience). |
144
+ | `GET /outputs/{job_id}/manifest.json` | Repo-level rollup. |
145
+ | `GET /outputs/{job_id}/tree.json` | Full path inventory (analyzed + skipped). |
146
+ | `GET /outputs/{job_id}/symbols.json` | Global qualified-name index. |
147
+ | `GET /outputs/{job_id}/languages.json` | Per-language counts. |
148
+ | `GET /outputs/{job_id}/errors.json` | Parse errors + skipped files. |
149
+ | `GET /outputs/{job_id}/log.jsonl?from_offset=N` | Tail the job's log. |
150
+ | `GET /outputs/{job_id}/ls?path=&recursive=` | Directory listing. |
151
+ | `GET /outputs/{job_id}/file/{path}` | Single file, **or** a streaming ZIP if any path segment contains `*?[`. E.g. `/file/files/**/*.py.json`. |
152
+ | `GET /outputs/{job_id}/zip?path=&match=` | Explicit-bulk-zip alias. Default = whole job dir. |
153
+ | `DELETE /outputs/{job_id}` | Remove the dir + drop the table entry. 409 if still running. |
154
+ | `GET /admin/outputs` | List every job_id present on disk under `OUTPUT_ROOT`. |
155
+
156
+ Path traversal (`..`, absolute paths, escape via symlink) is rejected.
157
+
158
+ ## Resource requirements
159
+
160
+ When `index_repo` runs against a GitHub or GitLab URL, the server uses a partial
161
+ clone with bin-packed batched fetching. That gives a small, predictable
162
+ disk footprint regardless of how large the source repo is:
163
+
164
+ - **Source-side scratch space: ~100 MB peak** in `output_path/.source/`
165
+ during analysis. The server fetches each batch of source files
166
+ (totaling ≤ `max_partial_clone_bytes`, default 100 MB), analyzes
167
+ them, deletes them from the working tree, then fetches the next
168
+ batch. Even on a multi-GB monorepo, peak local-disk used for source
169
+ content stays at ~100 MB. Tunable via the `max_partial_clone_bytes`
170
+ option on `index_repo`.
171
+
172
+ - **Output artifacts: roughly 1-2× the analyzed-source size.** Each
173
+ analyzed file produces a JSON artifact under `output_path/files/`
174
+ containing symbols, imports, references, chunks, etc. These persist
175
+ past the job — the consumer reads them incrementally — and are
176
+ the largest *durable* footprint. The retention sweeper auto-purges
177
+ job dirs older than `OUTPUT_EXPIRY_DAYS`.
178
+
179
+ - **Memory: modest.** A `ProcessPoolExecutor` runs roughly
180
+ `2 × CPU count` workers, each holding one file's bytes plus its
181
+ tree-sitter parse tree in memory. Source files are capped at
182
+ `max_file_bytes` (default 2 MB), so worst case is ~16-32 MB of
183
+ resident source + parse trees on a typical 8-core box.
184
+
185
+ - **Network:** one provider-API pre-flight to plan the batches (GitHub
186
+ Trees REST or GitLab REST tree + GraphQL; free for public repos, set
187
+ `GITHUB_TOKEN` / `GITLAB_TOKEN` for higher quotas and private-repo
188
+ access), plus one `git fetch-pack` round-trip per batch. For a
189
+ typical sub-100 MB repo that's two HTTP hits total.
190
+
191
+ For local-path sources nothing is fetched and nothing is cloned —
192
+ the only on-disk cost is the output artifacts.
193
+
194
+ ## Configuration
195
+
196
+ | Env var | Default (daemon) | Default (container) | Purpose |
197
+ |---|---|---|---|
198
+ | `PORT` | `35832` | `35832` | HTTP listen port. |
199
+ | `HOST` | `0.0.0.0` | `0.0.0.0` | HTTP bind address. |
200
+ | `OUTPUT_ROOT` | `./output` | `/data` | Where the server writes job dirs. |
201
+ | `OUTPUT_EXPIRY_DAYS` | `7` | `7` | Auto-purge job dirs older than this. `0` disables. |
202
+ | `JOB_HISTORY_MAX` | `100` | `100` | In-memory job-table cap. |
203
+ | `DEFAULT_MAX_FILE_BYTES` | `2097152` | `2097152` | Default per-file size cap. |
204
+ | `DEFAULT_CHUNK_MAX_TOKENS` | `800` | `800` | Default chunk size for cAST chunking. |
205
+ | `GITHUB_TOKEN` | unset | unset | Optional, raises GitHub Trees API quota from 60 to 5000 req/hr and unlocks private repos. |
206
+ | `GITLAB_TOKEN` | unset | unset | Optional, used for GitLab API auth and private-repo access. |
207
+
208
+ ## Releasing
209
+
210
+ Tag-driven. Bump `version` in `pyproject.toml`, then:
211
+
212
+ ```bash
213
+ git tag vX.Y.Z && git push --tags
214
+ ```
215
+
216
+ The `Release` workflow builds a multi-arch image (linux/amd64 +
217
+ linux/arm64) and pushes it to GHCR with `X.Y.Z`, `X.Y`, and `latest`
218
+ tags, in parallel with publishing wheel + sdist to PyPI via trusted
219
+ publishing. ~3-5 minutes end-to-end.
@@ -0,0 +1,22 @@
1
+ services:
2
+ deco-assaying:
3
+ image: ghcr.io/garycoding/deco-assaying:latest
4
+ container_name: deco-assaying
5
+ restart: unless-stopped
6
+ ports:
7
+ - "35832:35832"
8
+ volumes:
9
+ - deco-assaying-data:/data
10
+ environment:
11
+ OUTPUT_ROOT: /data
12
+ OUTPUT_EXPIRY_DAYS: "7"
13
+ # Uncomment to authenticate to private repos / raise rate limits:
14
+ # GITHUB_TOKEN: ${GITHUB_TOKEN:-}
15
+ # GITLAB_TOKEN: ${GITLAB_TOKEN:-}
16
+ extra_hosts:
17
+ # Lets the server in this container reach a consumer running on the
18
+ # docker host via host.docker.internal on Linux too.
19
+ - "host.docker.internal:host-gateway"
20
+
21
+ volumes:
22
+ deco-assaying-data: