nophi-ui 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nophi_ui-0.1.0/LICENSE +21 -0
- nophi_ui-0.1.0/PKG-INFO +61 -0
- nophi_ui-0.1.0/README.md +43 -0
- nophi_ui-0.1.0/nophi_ui/__init__.py +10 -0
- nophi_ui-0.1.0/nophi_ui/__main__.py +48 -0
- nophi_ui-0.1.0/nophi_ui/api.py +228 -0
- nophi_ui-0.1.0/nophi_ui/engines.py +134 -0
- nophi_ui-0.1.0/nophi_ui/folderpicker.py +55 -0
- nophi_ui-0.1.0/nophi_ui/jobs.py +357 -0
- nophi_ui-0.1.0/nophi_ui/logconf.py +25 -0
- nophi_ui-0.1.0/nophi_ui/server.py +63 -0
- nophi_ui-0.1.0/nophi_ui/static/app.js +300 -0
- nophi_ui-0.1.0/nophi_ui/static/index.html +108 -0
- nophi_ui-0.1.0/nophi_ui/static/style.css +78 -0
- nophi_ui-0.1.0/nophi_ui.egg-info/PKG-INFO +61 -0
- nophi_ui-0.1.0/nophi_ui.egg-info/SOURCES.txt +20 -0
- nophi_ui-0.1.0/nophi_ui.egg-info/dependency_links.txt +1 -0
- nophi_ui-0.1.0/nophi_ui.egg-info/entry_points.txt +2 -0
- nophi_ui-0.1.0/nophi_ui.egg-info/requires.txt +5 -0
- nophi_ui-0.1.0/nophi_ui.egg-info/top_level.txt +1 -0
- nophi_ui-0.1.0/pyproject.toml +32 -0
- nophi_ui-0.1.0/setup.cfg +4 -0
nophi_ui-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 no-phi contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
nophi_ui-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nophi-ui
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Local web review interface for nophi / nophi-av PHI redaction
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/kshen3778/no-phi
|
|
7
|
+
Project-URL: Repository, https://github.com/kshen3778/no-phi
|
|
8
|
+
Keywords: phi,pii,redaction,review,ui
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: nophi>=0.1
|
|
13
|
+
Requires-Dist: nophi-av>=0.1
|
|
14
|
+
Requires-Dist: fastapi>=0.110
|
|
15
|
+
Requires-Dist: uvicorn[standard]>=0.29
|
|
16
|
+
Requires-Dist: python-multipart>=0.0.9
|
|
17
|
+
Dynamic: license-file
|
|
18
|
+
|
|
19
|
+
# nophi-ui
|
|
20
|
+
|
|
21
|
+
A local web review interface for the [`nophi`](../nophi) (documents) and
|
|
22
|
+
[`nophi-av`](../nophi-av) (audio/video) PHI redaction engines.
|
|
23
|
+
|
|
24
|
+
It is a thin FastAPI frontend — the redaction logic stays in the two engines.
|
|
25
|
+
It lets you run detection from the browser, **remove false-positive detections**
|
|
26
|
+
(and, for audio, add a missed segment by hand) before redaction is applied, and
|
|
27
|
+
view the redacted result.
|
|
28
|
+
|
|
29
|
+
## Run
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
nophi-ui # opens http://127.0.0.1:8000
|
|
33
|
+
nophi-ui --port 9000 --no-open
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
You type a server-side **input directory** and **output directory** (raw paths),
|
|
37
|
+
preview the files that will be processed, then start detection.
|
|
38
|
+
|
|
39
|
+
## What it does
|
|
40
|
+
|
|
41
|
+
- **Documents** (`.txt .csv .docx .xlsx .pdf`): detect → review the detection
|
|
42
|
+
list → uncheck false positives → apply. PDF previews inline; docx/xlsx are
|
|
43
|
+
download-only.
|
|
44
|
+
- **Audio**: detect → review (play the original clip per detection) → uncheck
|
|
45
|
+
false positives and/or add missed `start/end` segments → apply (re-scrubs from
|
|
46
|
+
the original; no re-transcription).
|
|
47
|
+
- **Video**: view-only. Redacted in one shot; detections shown for reference.
|
|
48
|
+
|
|
49
|
+
## Security
|
|
50
|
+
|
|
51
|
+
This tool serves PHI, so by design it:
|
|
52
|
+
|
|
53
|
+
- binds to **loopback only** (`127.0.0.1` by default; `localhost` and `::1` are also accepted, any other host is refused),
|
|
54
|
+
- locks **CORS** to its own origin,
|
|
55
|
+
- requires a **per-launch token** on every API call,
|
|
56
|
+
- serves files by **opaque job/file id** (never a client-supplied path),
|
|
57
|
+
- marks PHI responses **`Cache-Control: no-store`** and serves only the clipped
|
|
58
|
+
segment for audio review.
|
|
59
|
+
|
|
60
|
+
State is held **in memory** for the process lifetime; closing the server clears
|
|
61
|
+
it (a restart means re-running detection).
|
nophi_ui-0.1.0/README.md
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# nophi-ui
|
|
2
|
+
|
|
3
|
+
A local web review interface for the [`nophi`](../nophi) (documents) and
|
|
4
|
+
[`nophi-av`](../nophi-av) (audio/video) PHI redaction engines.
|
|
5
|
+
|
|
6
|
+
It is a thin FastAPI frontend — the redaction logic stays in the two engines.
|
|
7
|
+
It lets you run detection from the browser, **remove false-positive detections**
|
|
8
|
+
(and, for audio, add a missed segment by hand) before redaction is applied, and
|
|
9
|
+
view the redacted result.
|
|
10
|
+
|
|
11
|
+
## Run
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
nophi-ui # opens http://127.0.0.1:8000
|
|
15
|
+
nophi-ui --port 9000 --no-open
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
You type a server-side **input directory** and **output directory** (raw paths),
|
|
19
|
+
preview the files that will be processed, then start detection.
|
|
20
|
+
|
|
21
|
+
## What it does
|
|
22
|
+
|
|
23
|
+
- **Documents** (`.txt .csv .docx .xlsx .pdf`): detect → review the detection
|
|
24
|
+
list → uncheck false positives → apply. PDF previews inline; docx/xlsx are
|
|
25
|
+
download-only.
|
|
26
|
+
- **Audio**: detect → review (play the original clip per detection) → uncheck
|
|
27
|
+
false positives and/or add missed `start/end` segments → apply (re-scrubs from
|
|
28
|
+
the original; no re-transcription).
|
|
29
|
+
- **Video**: view-only. Redacted in one shot; detections shown for reference.
|
|
30
|
+
|
|
31
|
+
## Security
|
|
32
|
+
|
|
33
|
+
This tool serves PHI, so by design it:
|
|
34
|
+
|
|
35
|
+
- binds to **loopback only** (`127.0.0.1` by default; `localhost` and `::1` are also accepted, any other host is refused),
|
|
36
|
+
- locks **CORS** to its own origin,
|
|
37
|
+
- requires a **per-launch token** on every API call,
|
|
38
|
+
- serves files by **opaque job/file id** (never a client-supplied path),
|
|
39
|
+
- marks PHI responses **`Cache-Control: no-store`** and serves only the clipped
|
|
40
|
+
segment for audio review.
|
|
41
|
+
|
|
42
|
+
State is held **in memory** for the process lifetime; closing the server clears
|
|
43
|
+
it (a restart means re-running detection).
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""Local web review interface for nophi / nophi-av PHI redaction.
|
|
2
|
+
|
|
3
|
+
A thin FastAPI frontend over the two redaction engines. The redaction logic
|
|
4
|
+
lives in ``nophi`` (documents) and ``nophi_av`` (audio/video); this package only
|
|
5
|
+
routes files to the right engine, normalizes findings for the browser, runs
|
|
6
|
+
detection as a background job, and lets the user remove false-positive detections
|
|
7
|
+
before redaction is applied.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
# Package version string.
__version__ = "0.1.0"
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# `nophi-ui` entry point: mint a per-launch token, build the app, and serve it on
|
|
2
|
+
# loopback only. The user opens the printed URL; the page picks up the token and
|
|
3
|
+
# every API call carries it.
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import argparse
|
|
7
|
+
import secrets
|
|
8
|
+
import threading
|
|
9
|
+
import webbrowser
|
|
10
|
+
|
|
11
|
+
import uvicorn
|
|
12
|
+
|
|
13
|
+
from nophi_ui.logconf import configure, log
|
|
14
|
+
from nophi_ui.server import create_app
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def main() -> None:
    """Parse the CLI options and serve the review UI on a loopback address.

    A fresh token is minted for every launch and handed to ``create_app``
    together with the list of allowed origins. Unless ``--no-open`` is given, a
    browser is opened on the served URL one second after startup. The call
    then blocks inside ``uvicorn.run``.

    Raises:
        SystemExit: with status 2 when ``--host`` is not one of the accepted
            loopback names (``127.0.0.1``, ``localhost``, ``::1``).
    """
    parser = argparse.ArgumentParser(prog="nophi-ui", description="Local PHI redaction review UI")
    parser.add_argument("--host", default="127.0.0.1", help="Bind host (default 127.0.0.1; loopback only)")
    parser.add_argument("--port", type=int, default=8000, help="Bind port (default 8000)")
    parser.add_argument("--no-open", action="store_true", help="Do not open a browser automatically")
    parser.add_argument("--log-level", default="info", help="Console log level: debug, info, warning, error")
    args = parser.parse_args()

    configure(args.log_level)

    if args.host not in ("127.0.0.1", "localhost", "::1"):
        print(f"⚠ Refusing to bind to {args.host!r}: nophi-ui serves PHI and must stay on loopback.")
        print(" Use 127.0.0.1 (the default).")
        raise SystemExit(2)

    token = secrets.token_urlsafe(24)
    origins = [f"http://127.0.0.1:{args.port}", f"http://localhost:{args.port}"]
    if args.host == "::1":
        # An IPv6-only bind is not reachable through 127.0.0.1, so advertise the
        # bracketed IPv6 literal and allow it as an origin as well.
        url_host = "[::1]"
        origins.append(f"http://[::1]:{args.port}")
    else:
        url_host = args.host
    app = create_app(token, origins)

    # Point the printed/opened URL at the address that is actually bound.
    url = f"http://{url_host}:{args.port}/"
    print(f"\n nophi-ui → {url}")
    print(" (loopback only; close this process to stop and clear all in-memory state)\n")
    log.info(f"serving on {url} (loopback only)")

    if not args.no_open:
        # Delay slightly so the server is listening before the browser connects.
        threading.Timer(1.0, lambda: webbrowser.open(url)).start()

    uvicorn.run(app, host=args.host, port=args.port, log_level="warning")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# Run the CLI when this module is executed as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
# HTTP API for the review UI. JSON endpoints for the scan/review/apply flow plus
|
|
2
|
+
# file-serving endpoints (redacted result, original audio clip, report). All
|
|
3
|
+
# routes are mounted under /api and protected by the startup-token middleware in
|
|
4
|
+
# server.py. PHI responses are marked no-store.
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
import tempfile
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from fastapi import APIRouter, HTTPException
|
|
12
|
+
from fastapi.responses import FileResponse
|
|
13
|
+
from pydantic import BaseModel
|
|
14
|
+
from starlette.background import BackgroundTask
|
|
15
|
+
|
|
16
|
+
from nophi_av.audio import clip_segment
|
|
17
|
+
|
|
18
|
+
from nophi_ui import engines, jobs, folderpicker
|
|
19
|
+
from nophi_ui.logconf import log
|
|
20
|
+
|
|
21
|
+
# Router that carries every endpoint in this module; all paths are prefixed with /api.
router = APIRouter(prefix="/api")

# Response headers attached to the file-serving endpoints so results are not cached.
_NO_STORE = {"Cache-Control": "no-store"}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# ── Request models ───────────────────────────────────────────────────────────
|
|
27
|
+
# Body of POST /api/preview.
class PreviewReq(BaseModel):
    input_dir: str  # directory whose supported files are listed; must exist
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Body of POST /api/jobs.
class JobReq(BaseModel):
    input_dir: str  # directory to scan; start_job requires it to exist
    output_dir: str  # destination directory; start_job does not require it to exist yet
    # Handed to jobs.create_job unchanged.
    # NOTE(review): the accepted keys are defined by the jobs module, not here — confirm there.
    settings: dict = {}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# One reviewer-supplied interval carried in an apply request.
# NOTE(review): start_time/end_time look like seconds into the audio — confirm against the engine.
class AddedInterval(BaseModel):
    entity_type: str = "MANUAL"  # label used when the client does not send one
    start_time: float
    end_time: float
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# Body of POST /api/jobs/{job_id}/files/{file_id}/apply.
class ApplyReq(BaseModel):
    # Passed to jobs.apply_file as a set.
    # NOTE(review): presumably the "fid" values of findings to drop — confirm in jobs.apply_file.
    removed: list[int] = []
    # Converted with model_dump() before being handed to jobs.apply_file.
    added: list[AddedInterval] = []
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# ── Helpers ──────────────────────────────────────────────────────────────────
|
|
49
|
+
def _dir(path_str: str, *, must_exist: bool) -> Path:
|
|
50
|
+
p = Path(path_str).expanduser()
|
|
51
|
+
if must_exist and not p.is_dir():
|
|
52
|
+
raise HTTPException(400, f"Not a directory: {p}")
|
|
53
|
+
return p
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _get_job(job_id: str) -> jobs.Job:
    """Look up a job in the ``jobs.JOBS`` registry.

    Raises:
        HTTPException: 404 when no job is registered under ``job_id``.
    """
    found = jobs.JOBS.get(job_id)
    if found is not None:
        return found
    raise HTTPException(404, "job not found")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _get_file(job_id: str, file_id: str):
    """Return ``(job, file_entry)`` for the given ids.

    Raises:
        HTTPException: 404 when the job is unknown or has no file with that id.
    """
    job = _get_job(job_id)
    for entry in job.files:
        if entry.file_id == file_id:
            return job, entry
    raise HTTPException(404, "file not found")
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _findings_view(fe) -> list[dict]:
|
|
72
|
+
"""Strip the heavy raw engine finding before sending to the browser."""
|
|
73
|
+
return [{k: v for k, v in f.items() if k != "raw"} for f in fe.findings]
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# ── Endpoints ────────────────────────────────────────────────────────────────
|
|
77
|
+
@router.get("/capabilities")
async def capabilities():
    """Tell the frontend whether a native folder picker can be used."""
    has_picker = folderpicker.available()
    return {"folder_picker": has_picker}
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@router.get("/pick-folder")
def pick_folder():
    """Show the native folder chooser on the machine running the server.

    The request blocks until the picker call returns. The response is
    ``{"path": <chosen path or null>}``.

    Raises:
        HTTPException: 404 when this platform has no native picker.
    """
    if folderpicker.available():
        return {"path": folderpicker.pick_folder()}
    raise HTTPException(404, "no native folder picker on this platform")
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@router.post("/preview")
async def preview(req: PreviewReq):
    """List the supported files under ``req.input_dir`` without starting a job.

    Returns the directory (as a string) plus one entry per collected file with
    its path relative to the directory, its engine kind and its lower-cased
    extension.

    Raises:
        HTTPException: 400 when the input directory is blank or is not an
            existing directory.
    """
    # Path("") is ".", so a blank field would otherwise list the server's
    # working directory; reject it the same way start_job does.
    if not req.input_dir.strip():
        raise HTTPException(400, "Input directory is required")
    input_dir = _dir(req.input_dir, must_exist=True)
    files = [
        {
            "name": str(p.relative_to(input_dir)),
            "kind": engines.classify(p),
            "ext": p.suffix.lower(),
        }
        for p in engines.collect_files(input_dir)
    ]
    return {"input_dir": str(input_dir), "files": files}
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@router.post("/jobs")
async def start_job(req: JobReq):
    """Validate the two directories and create a job for them.

    Raises:
        HTTPException: 400 when either directory field is blank, the input
            directory does not exist, or it contains no supported files.
    """
    required = (
        (req.input_dir, "Input directory is required"),
        (req.output_dir, "Output directory is required"),
    )
    for value, message in required:
        if not value.strip():
            raise HTTPException(400, message)

    source = _dir(req.input_dir, must_exist=True)
    target = _dir(req.output_dir, must_exist=False)

    supported = engines.collect_files(source)
    if not supported:
        raise HTTPException(400, "No supported files found in input directory")

    created = jobs.create_job(source, target, req.settings)
    return {"job_id": created.job_id}
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
@router.get("/jobs/{job_id}")
async def job_status(job_id: str):
    """Return the job's state plus a summary row for each of its files.

    Raises:
        HTTPException: 404 when the job is unknown.
    """

    def _summarize(entry) -> dict:
        # Findings flagged as removed are left out of the count.
        remaining = sum(1 for finding in entry.findings if not finding.get("removed"))
        return {
            "file_id": entry.file_id,
            "name": entry.rel,
            "kind": entry.kind,
            "ext": entry.ext,
            "state": entry.state,
            "status": entry.status,
            "error": entry.error,
            "n_findings": remaining,
            "viewable": entry.viewable,
            "output_name": entry.output_name,
            "progress": entry.progress,
        }

    job = _get_job(job_id)
    return {
        "job_id": job.job_id,
        "state": job.state,
        "error": job.error,
        "output_dir": str(job.output_dir),
        "files": [_summarize(entry) for entry in job.files],
    }
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
@router.delete("/jobs/{job_id}")
async def cancel_job(job_id: str):
    """Remove a job from the in-memory registry (the UI's "Cancel run").

    Unknown ids are ignored, so the call always answers ``{"ok": True}``.
    NOTE(review): nothing here stops a running detection thread; per the
    original note it finishes on its own and its results are discarded —
    confirm against the jobs module.
    """
    registry = jobs.JOBS
    registry.pop(job_id, None)
    log.info(f"run cancelled — job {job_id}")
    acknowledgement = {"ok": True}
    return acknowledgement
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
@router.get("/jobs/{job_id}/files/{file_id}")
async def file_detail(job_id: str, file_id: str):
    """Return one file's metadata together with its findings (minus ``raw``).

    Raises:
        HTTPException: 404 when the job or the file is unknown.
    """
    _job, entry = _get_file(job_id, file_id)
    saved_to = str(entry.output_path) if entry.output_path else None
    renders_inline = entry.ext in engines.INLINE_VIEWABLE
    detail = {
        "file_id": entry.file_id,
        "name": entry.rel,
        "kind": entry.kind,
        "ext": entry.ext,
        "state": entry.state,
        "viewable": entry.viewable,
        "inline": renders_inline,
        "output_name": entry.output_name,
        "output_path": saved_to,
        "findings": _findings_view(entry),
    }
    return detail
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
@router.post("/jobs/{job_id}/files/{file_id}/apply")
def apply(job_id: str, file_id: str, req: ApplyReq):
    """Apply redaction to one file using the reviewer's decisions.

    ``req.removed`` is passed to ``jobs.apply_file`` as a set and ``req.added``
    as a list of plain dicts. On failure the file entry is marked ``"error"``
    and the message is stored on it before the error is reported.

    Raises:
        HTTPException: 404 for an unknown job/file; 400 for video files or an
            added interval that does not satisfy ``0 <= start_time < end_time``;
            409 when the file is not in the ``ready``/``applied`` state; 500
            when applying fails.
    """
    job, fe = _get_file(job_id, file_id)
    if fe.kind == "video":
        raise HTTPException(400, "Video files are view-only")
    if fe.state not in ("ready", "applied"):
        raise HTTPException(409, f"file is not ready (state={fe.state})")
    # Reject unusable intervals up front so a bad request is reported as a
    # client error and does not flip the file into the "error" state.
    for interval in req.added:
        if not (0 <= interval.start_time < interval.end_time):
            raise HTTPException(400, "added interval must satisfy 0 <= start_time < end_time")
    try:
        jobs.apply_file(job, fe, set(req.removed), [a.model_dump() for a in req.added])
    except Exception as exc:
        fe.state = "error"
        fe.error = str(exc)
        log.warning(f"apply failed — {fe.rel}: {exc}")
        log.debug("traceback", exc_info=True)
        # Chain the cause so the original traceback is kept with the HTTP error.
        raise HTTPException(500, f"apply failed: {exc}") from exc
    return {"ok": True, "output_name": fe.output_name, "viewable": fe.viewable}
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
@router.get("/jobs/{job_id}/files/{file_id}/result")
def result(job_id: str, file_id: str):
    """Serve the redacted output of a file as an inline, non-cached response.

    Raises:
        HTTPException: 404 when the job/file is unknown or no output file
            exists yet.
    """
    _job, entry = _get_file(job_id, file_id)
    produced = entry.output_path
    if produced and Path(produced).exists():
        # "inline" lets the browser render the PDF/audio/video in the page; the
        # file itself already lives in the output directory.
        return FileResponse(
            str(produced),
            filename=entry.output_name,
            headers=_NO_STORE,
            content_disposition_type="inline",
        )
    raise HTTPException(404, "no redacted result yet")
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
@router.get("/jobs/{job_id}/files/{file_id}/clip")
def clip(job_id: str, file_id: str, start: float, end: float):
    """Cut ``[start, end]`` out of the original audio and serve it as MP3.

    The clip is written to a temporary file that is deleted after the response
    has been sent, or immediately if cutting fails.

    Raises:
        HTTPException: 404 for an unknown job/file; 400 when the file is not
            audio or ``end`` is not greater than ``start``.
    """
    _job, fe = _get_file(job_id, file_id)
    if fe.kind != "audio":
        raise HTTPException(400, "clips are only available for audio files")
    # Validate before allocating a temp file or invoking the clipper: a
    # negative start is clamped to the beginning, an empty/reversed (or NaN)
    # range is a client error rather than an unhandled failure.
    start = max(start, 0.0)
    if not end > start:
        raise HTTPException(400, "clip end must be greater than start")
    fd, name = tempfile.mkstemp(suffix=".mp3", prefix="nophi_clip_")
    os.close(fd)  # we only need the path; don't leak the descriptor
    tmp = Path(name)
    try:
        clip_segment(fe.path, start, end, tmp)
    except Exception:
        tmp.unlink(missing_ok=True)  # don't leave the temp file behind on failure
        raise
    return FileResponse(
        str(tmp), media_type="audio/mpeg", headers=_NO_STORE,
        background=BackgroundTask(lambda: tmp.unlink(missing_ok=True)),
    )
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
@router.get("/jobs/{job_id}/report")
def report(job_id: str, kind: str = "doc"):
    """Serve a report workbook from the job's output directory.

    ``kind == "doc"`` selects ``phi_report.xlsx``; every other value selects
    ``phi_av_report.xlsx``.

    Raises:
        HTTPException: 404 when the job is unknown or the report file does not
            exist.
    """
    job = _get_job(job_id)
    if kind == "doc":
        name = "phi_report.xlsx"
    else:
        name = "phi_av_report.xlsx"
    path = job.output_dir / name
    if path.exists():
        return FileResponse(str(path), filename=name, headers=_NO_STORE)
    raise HTTPException(404, "report not generated yet")
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# Routing + normalization glue between the web layer and the two redaction
|
|
2
|
+
# engines. NO redaction logic lives here — it dispatches to nophi's document
|
|
3
|
+
# handlers (detect_*/apply_*) and nophi_av's audio detect/apply, and converts the
|
|
4
|
+
# two finding schemas (document char offsets vs audio time intervals) into one
|
|
5
|
+
# shape the frontend renders.
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from nophi_av.pipeline import AUDIO_EXTENSIONS, VIDEO_EXTENSIONS
|
|
11
|
+
|
|
12
|
+
from nophi.handlers.text import detect_text, detect_csv, apply_text, apply_csv
|
|
13
|
+
from nophi.handlers.docx import detect_docx, apply_docx, write_docx
|
|
14
|
+
from nophi.handlers.xlsx import detect_xlsx, apply_xlsx, write_xlsx
|
|
15
|
+
from nophi.handlers.pdf import detect_pdf, apply_pdf, write_pdf
|
|
16
|
+
|
|
17
|
+
# Extensions dispatched to the nophi document handlers (see detect_document /
# apply_document); audio and video extensions come from nophi_av.pipeline.
DOCUMENT_EXTENSIONS = {".txt", ".csv", ".docx", ".xlsx", ".pdf"}
# Everything collect_files will pick up.
SUPPORTED_EXTENSIONS = DOCUMENT_EXTENSIONS | AUDIO_EXTENSIONS | VIDEO_EXTENSIONS

# Formats the browser can render the redacted result for inline; the rest are
# download-only (docx, xlsx).
INLINE_VIEWABLE = {".pdf"} | AUDIO_EXTENSIONS | VIDEO_EXTENSIONS
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def classify(path: Path) -> str | None:
    """Map a path's extension to ``'document'``, ``'audio'`` or ``'video'``.

    The comparison is case-insensitive; ``None`` means the extension is not
    supported.
    """
    suffix = path.suffix.lower()
    families = (
        ("document", DOCUMENT_EXTENSIONS),
        ("audio", AUDIO_EXTENSIONS),
        ("video", VIDEO_EXTENSIONS),
    )
    for label, extensions in families:
        if suffix in extensions:
            return label
    return None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def collect_files(input_dir: Path) -> list[Path]:
    """Return the supported files below ``input_dir``, recursively and sorted.

    Dot-files and anything with a ``__pycache__`` or ``.git`` path component
    are skipped.
    """

    def _wanted(candidate: Path) -> bool:
        if not candidate.is_file():
            return False
        if candidate.name.startswith("."):
            return False
        if "__pycache__" in candidate.parts or ".git" in candidate.parts:
            return False
        return candidate.suffix.lower() in SUPPORTED_EXTENSIONS

    return [candidate for candidate in sorted(input_dir.rglob("*")) if _wanted(candidate)]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# ── Document detect / apply dispatch ─────────────────────────────────────────
|
|
53
|
+
def detect_document(path: Path, analyzer, entities, exclude) -> list[dict]:
    """Run the nophi detector that matches the file's extension.

    Raises:
        ValueError: when the extension is not a supported document type.
    """
    ext = path.suffix.lower()
    detectors = {
        ".txt": detect_text,
        ".csv": detect_csv,
        ".docx": detect_docx,
        ".xlsx": detect_xlsx,
        ".pdf": detect_pdf,
    }
    detector = detectors.get(ext)
    if detector is None:
        raise ValueError(f"Unsupported document type: {ext}")
    return detector(path, analyzer, entities, exclude)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def apply_document(path: Path, findings, output_path: Path, anonymizer, operators) -> None:
    """Redact ``path`` with the handler for its extension and write ``output_path``.

    The output's parent directory is created first. Text and CSV results are
    written as UTF-8 text; docx/xlsx/pdf results go through the matching
    ``write_*`` helper.

    Raises:
        ValueError: when the extension is not a supported document type.
    """
    ext = path.suffix.lower()
    output_path.parent.mkdir(parents=True, exist_ok=True)
    if ext == ".txt":
        output_path.write_text(apply_text(path, findings, anonymizer, operators), encoding="utf-8")
    elif ext == ".csv":
        # newline="" turns off newline translation so the row terminators in the
        # CSV text are written verbatim; with translation on, "\r\n" terminators
        # come out as "\r\r\n" on Windows.
        # NOTE(review): assumes apply_csv returns text that already carries its
        # own row terminators — confirm against nophi.handlers.text.
        output_path.write_text(
            apply_csv(path, findings, anonymizer, operators), encoding="utf-8", newline=""
        )
    elif ext == ".docx":
        write_docx(apply_docx(path, findings, anonymizer, operators), output_path)
    elif ext == ".xlsx":
        write_xlsx(apply_xlsx(path, findings, anonymizer, operators), output_path)
    elif ext == ".pdf":
        # apply_pdf takes no anonymizer, unlike the other handlers.
        write_pdf(apply_pdf(path, findings, operators), output_path)
    else:
        raise ValueError(f"Unsupported document type: {ext}")
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# ── Finding normalization for the frontend ───────────────────────────────────
|
|
86
|
+
def _position_label(unit: dict) -> str:
|
|
87
|
+
"""Human-readable location of a document finding for the review table."""
|
|
88
|
+
kind = unit.get("kind")
|
|
89
|
+
if kind == "text":
|
|
90
|
+
return "text"
|
|
91
|
+
if kind == "csv":
|
|
92
|
+
return f"row {unit['row'] + 1}, col {unit['col'] + 1}"
|
|
93
|
+
if kind == "docx_para":
|
|
94
|
+
return f"paragraph {unit['index'] + 1}"
|
|
95
|
+
if kind == "xlsx_cell":
|
|
96
|
+
return f"{unit['sheet']}!{unit['coord']}"
|
|
97
|
+
if kind == "pdf_page":
|
|
98
|
+
return f"page {unit['page'] + 1}"
|
|
99
|
+
return ""
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def normalize(finding: dict, kind: str, index: int) -> dict:
    """Build the row the frontend table renders for one engine finding.

    The untouched engine finding travels along under ``raw`` so apply can
    rebuild the exact list of surviving findings; every other key is for
    display. Audio findings additionally carry their time interval, speaker
    and a ``scrubbable`` flag (true when both interval bounds are present).
    """
    view = {
        "fid": index,
        "entity_type": finding.get("entity_type"),
        "original_text": finding.get("original_text"),
        "replacement": finding.get("replacement"),
        "context": finding.get("context", ""),
        "removed": False,
        "added": False,
        "raw": finding,
    }
    if kind != "audio":
        view["position"] = _position_label(finding.get("unit", {}))
        return view

    begin = finding.get("start_time")
    finish = finding.get("end_time")
    view.update(
        start_time=begin,
        end_time=finish,
        speaker=finding.get("speaker"),
        scrubbable=begin is not None and finish is not None,
        position=_fmt_interval(begin, finish),
    )
    return view
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _fmt_interval(start, end) -> str:
|
|
132
|
+
if start is None or end is None:
|
|
133
|
+
return "—"
|
|
134
|
+
return f"{float(start):.2f}s – {float(end):.2f}s"
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Native folder chooser for the input/output fields. The server runs on the
|
|
2
|
+
# user's own machine, so it shells out to the OS picker and returns the chosen
|
|
3
|
+
# absolute path. We shell out (rather than use tkinter) so the dialog runs in its
|
|
4
|
+
# own process — no main-thread/display constraints on our side, safe to call from
|
|
5
|
+
# a worker thread.
|
|
6
|
+
#
|
|
7
|
+
# Supported: macOS (osascript) and Windows (PowerShell FolderBrowserDialog).
|
|
8
|
+
# Elsewhere the picker is unavailable and the UI falls back to the typed field.
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import subprocess
|
|
12
|
+
import sys
|
|
13
|
+
from typing import Optional
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def available() -> bool:
    """Report whether this platform has a native folder picker (macOS or Windows)."""
    supported_platforms = ("darwin", "win32")
    return sys.platform in supported_platforms
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def pick_folder() -> Optional[str]:
    """Show the OS folder chooser and return the selected path.

    Returns ``None`` when the user cancels or when the platform has no picker.
    """
    platform = sys.platform
    if platform == "win32":
        return _pick_windows()
    return _pick_macos() if platform == "darwin" else None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _pick_macos() -> Optional[str]:
|
|
31
|
+
script = 'POSIX path of (choose folder with prompt "Select a folder")'
|
|
32
|
+
proc = subprocess.run(["osascript", "-e", script], capture_output=True, text=True)
|
|
33
|
+
if proc.returncode != 0: # user cancelled -> "User canceled. (-128)"
|
|
34
|
+
return None
|
|
35
|
+
path = proc.stdout.strip()
|
|
36
|
+
if len(path) > 1:
|
|
37
|
+
path = path.rstrip("/") # drop osascript's trailing slash (keep root "/")
|
|
38
|
+
return path or None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _pick_windows() -> Optional[str]:
|
|
42
|
+
# -STA is required by FolderBrowserDialog (single-threaded apartment).
|
|
43
|
+
ps = (
|
|
44
|
+
"Add-Type -AssemblyName System.Windows.Forms; "
|
|
45
|
+
"$d = New-Object System.Windows.Forms.FolderBrowserDialog; "
|
|
46
|
+
"if ($d.ShowDialog() -eq [System.Windows.Forms.DialogResult]::OK) "
|
|
47
|
+
"{ Write-Output $d.SelectedPath }"
|
|
48
|
+
)
|
|
49
|
+
proc = subprocess.run(
|
|
50
|
+
["powershell", "-NoProfile", "-STA", "-Command", ps],
|
|
51
|
+
capture_output=True, text=True,
|
|
52
|
+
)
|
|
53
|
+
if proc.returncode != 0:
|
|
54
|
+
return None
|
|
55
|
+
return proc.stdout.strip() or None
|