conversation-analyser 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- conversation_analyser-0.1.0/PKG-INFO +139 -0
- conversation_analyser-0.1.0/README.md +116 -0
- conversation_analyser-0.1.0/pyproject.toml +40 -0
- conversation_analyser-0.1.0/src/conversation_analyser/__init__.py +16 -0
- conversation_analyser-0.1.0/src/conversation_analyser/analytics.py +127 -0
- conversation_analyser-0.1.0/src/conversation_analyser/api.py +62 -0
- conversation_analyser-0.1.0/src/conversation_analyser/cli.py +129 -0
- conversation_analyser-0.1.0/src/conversation_analyser/config.py +69 -0
- conversation_analyser-0.1.0/src/conversation_analyser/embeddings.py +41 -0
- conversation_analyser-0.1.0/src/conversation_analyser/llm.py +94 -0
- conversation_analyser-0.1.0/src/conversation_analyser/manifest.py +31 -0
- conversation_analyser-0.1.0/src/conversation_analyser/models.py +76 -0
- conversation_analyser-0.1.0/src/conversation_analyser/parsers/__init__.py +7 -0
- conversation_analyser-0.1.0/src/conversation_analyser/parsers/anythingllm.py +55 -0
- conversation_analyser-0.1.0/src/conversation_analyser/parsers/base.py +57 -0
- conversation_analyser-0.1.0/src/conversation_analyser/parsers/llm_segment.py +31 -0
- conversation_analyser-0.1.0/src/conversation_analyser/parsers/markers.py +61 -0
- conversation_analyser-0.1.0/src/conversation_analyser/parsers/registry.py +73 -0
- conversation_analyser-0.1.0/src/conversation_analyser/parsers/role_content.py +55 -0
- conversation_analyser-0.1.0/src/conversation_analyser/pipeline.py +228 -0
- conversation_analyser-0.1.0/src/conversation_analyser/scoring.py +47 -0
- conversation_analyser-0.1.0/src/conversation_analyser/taxonomy.py +208 -0
- conversation_analyser-0.1.0/tests/fixtures/anythingllm.json +4 -0
- conversation_analyser-0.1.0/tests/fixtures/role_content.json +9 -0
- conversation_analyser-0.1.0/tests/fixtures/transcript.txt +7 -0
- conversation_analyser-0.1.0/tests/test_analytics.py +40 -0
- conversation_analyser-0.1.0/tests/test_api.py +31 -0
- conversation_analyser-0.1.0/tests/test_cli_smoke.py +43 -0
- conversation_analyser-0.1.0/tests/test_manifest.py +31 -0
- conversation_analyser-0.1.0/tests/test_parsers.py +66 -0
- conversation_analyser-0.1.0/tests/test_pipeline.py +70 -0
- conversation_analyser-0.1.0/tests/test_scoring.py +33 -0
- conversation_analyser-0.1.0/tests/test_taxonomy.py +66 -0
- conversation_analyser-0.1.0/uv.lock +1932 -0
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: conversation-analyser
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Critical-thinking and analytics for human-AI conversations — a member of the lens analyser family.
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: fastapi>=0.109.0
|
|
7
|
+
Requires-Dist: pydantic>=2.0
|
|
8
|
+
Requires-Dist: pyspellchecker>=0.8
|
|
9
|
+
Requires-Dist: python-multipart>=0.0.9
|
|
10
|
+
Requires-Dist: rich>=13.7.0
|
|
11
|
+
Requires-Dist: textstat>=0.7
|
|
12
|
+
Requires-Dist: uvicorn[standard]>=0.27.0
|
|
13
|
+
Requires-Dist: vadersentiment>=3.3
|
|
14
|
+
Provides-Extra: dev
|
|
15
|
+
Requires-Dist: httpx>=0.27.0; extra == 'dev'
|
|
16
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
17
|
+
Provides-Extra: embeddings
|
|
18
|
+
Requires-Dist: numpy>=1.24; extra == 'embeddings'
|
|
19
|
+
Requires-Dist: sentence-transformers>=2.2; extra == 'embeddings'
|
|
20
|
+
Provides-Extra: llm
|
|
21
|
+
Requires-Dist: anthropic>=0.39; extra == 'llm'
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
# conversation-analyser
|
|
25
|
+
|
|
26
|
+
Critical-thinking and analytics for human–AI conversations — a member of the
|
|
27
|
+
[`lens`](../) analyser family.
|
|
28
|
+
|
|
29
|
+
It scores a single conversation on two tiers:
|
|
30
|
+
|
|
31
|
+
1. **Analytics** (always on, offline): turn/word counts, prompt/response lengths,
|
|
32
|
+
question ratio, pushback hits, readability, sentiment trajectory, prompt
|
|
33
|
+
self-similarity, and temporal metrics when timestamps are present.
|
|
34
|
+
2. **Critical thinking** (opt-in, needs an LLM): classifies every human turn under
|
|
35
|
+
a 7-label prompt taxonomy, derives engagement ratios, an engagement **band**,
|
|
36
|
+
and a composite **0–100 critical-thinking score** with a component breakdown.
|
|
37
|
+
|
|
38
|
+
The taxonomy reuses the validated `NQ/FU/CH/EX/DG/AC/MT` scheme from the ISYS6020
|
|
39
|
+
marking pipeline (copied and forked). Design: `docs/superpowers/specs/2026-05-23-conversation-analyser-design.md`.
|
|
40
|
+
|
|
41
|
+
## Install
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install -e . # core: analytics + CLI + HTTP API
|
|
45
|
+
pip install -e '.[embeddings]' # + prompt self-similarity (sentence-transformers)
|
|
46
|
+
pip install -e '.[llm]' # + taxonomy/CT tier (anthropic)
|
|
47
|
+
pip install -e '.[embeddings,llm,dev]' # everything
|
|
48
|
+
export ANTHROPIC_API_KEY=... # required for the critical-thinking tier
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## CLI
|
|
52
|
+
|
|
53
|
+
Bare positional path to analyse (human summary by default, `--json` for machines);
|
|
54
|
+
`serve` subcommand for the HTTP API — same grammar as the rest of the family.
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
conversation-analyser transcript.txt # human summary, analytics only
|
|
58
|
+
conversation-analyser chat.json --json # full JSON to stdout
|
|
59
|
+
conversation-analyser chat.json --llm # add the critical-thinking tier
|
|
60
|
+
conversation-analyser log.json --idle-gap 45 # split sub-sessions on 45-min gaps
|
|
61
|
+
conversation-analyser raw.txt --parse-mode llm-segment --llm
|
|
62
|
+
conversation-analyser serve --port 8009 # run the HTTP API
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
The critical-thinking tier is **opt-in** (`--llm`) to avoid surprise API costs;
|
|
66
|
+
without it you get the analytics tier only.
|
|
67
|
+
|
|
68
|
+
## HTTP API
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
conversation-analyser serve --port 8009
|
|
72
|
+
curl -F file=@chat.json 'http://127.0.0.1:8009/analyse' # analytics only
|
|
73
|
+
curl -F file=@chat.json -F llm=true 'http://127.0.0.1:8009/analyse'
|
|
74
|
+
curl http://127.0.0.1:8009/health
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
`GET /health` and `POST /analyse` (multipart file upload, optional `llm` form
|
|
78
|
+
field) — the same `/analyse` contract auto-analyser routes to.
|
|
79
|
+
|
|
80
|
+
## Python API
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from conversation_analyser import ConversationAnalyser
|
|
84
|
+
|
|
85
|
+
result = ConversationAnalyser().analyse("transcript.txt", llm=True)
|
|
86
|
+
print(result.model_dump_json(indent=2))
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Input formats
|
|
90
|
+
|
|
91
|
+
A pluggable adapter registry tries, in order: structured adapters → heuristic
|
|
92
|
+
speaker markers → optional LLM segmentation → unsegmented fallback.
|
|
93
|
+
|
|
94
|
+
- **role/content** message list (OpenAI/Anthropic): `[{"role": "user", "content": "..."}, ...]`
|
|
95
|
+
- **AnythingLLM** rows: `[{"prompt": "...", "response": "...", "createdAt": ...}, ...]`
|
|
96
|
+
- **flat text** with speaker markers: `User:` / `Assistant:` / `Me:` / `ChatGPT:` /
|
|
97
|
+
`You said:` / `ChatGPT said:` / `Prompt:` / `Response:`
|
|
98
|
+
- anything else → LLM-segment (needs `[llm]`), else a single-blob fallback
|
|
99
|
+
|
|
100
|
+
`.pdf`/`.docx` inputs are text-extracted first (needs `pdfplumber`/`markitdown`,
|
|
101
|
+
or pre-extract with `document-analyser`).
|
|
102
|
+
|
|
103
|
+
## The taxonomy
|
|
104
|
+
|
|
105
|
+
| Code | Meaning |
|
|
106
|
+
|---|---|
|
|
107
|
+
| `NQ` | New Query — opens a new topic |
|
|
108
|
+
| `FU` | Follow-up — clarification/elaboration |
|
|
109
|
+
| `CH` | Challenge — pushes back, tests, asks why |
|
|
110
|
+
| `EX` | Extension — applies/compares/synthesises in a new direction |
|
|
111
|
+
| `DG` | Delegation — task hand-off, no engagement |
|
|
112
|
+
| `AC` | Acknowledgement — thanks/confirmation |
|
|
113
|
+
| `MT` | Meta — about the conversation itself |
|
|
114
|
+
|
|
115
|
+
`critical_thinking = (CH+EX)/turns`, `delegation = DG/turns`, `filler = (AC+MT)/turns`.
|
|
116
|
+
Bands: One-Shot · Delegator · Directed · Iterative · Critical.
|
|
117
|
+
|
|
118
|
+
## Graceful degradation
|
|
119
|
+
|
|
120
|
+
| Missing | Effect |
|
|
121
|
+
|---|---|
|
|
122
|
+
| `ANTHROPIC_API_KEY` / `[llm]` | `taxonomy`/`critical_thinking` null; analytics still produced; note `llm_unavailable` |
|
|
123
|
+
| `[embeddings]` | `prompt_self_similarity` null; note `embeddings_unavailable` |
|
|
124
|
+
| timestamps | temporal metrics omitted; no sub-session split; note `no timestamps` |
|
|
125
|
+
|
|
126
|
+
## Output
|
|
127
|
+
|
|
128
|
+
`ConversationAnalysis` → an `aggregate` (rolled up over all human turns, the
|
|
129
|
+
headline) plus one `SessionAnalysis` per idle-gap sub-session, each with
|
|
130
|
+
`analytics`, `taxonomy`, `critical_thinking`, and per-turn `turns` (label +
|
|
131
|
+
rationale + preview). See the design spec §8 for the full schema.
|
|
132
|
+
|
|
133
|
+
## Testing
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
pytest # fast, deterministic (LLM mocked, no network)
|
|
137
|
+
pytest -m slow # includes sentence-transformers model download
|
|
138
|
+
pytest -m integration # includes live LLM calls
|
|
139
|
+
```
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# conversation-analyser
|
|
2
|
+
|
|
3
|
+
Critical-thinking and analytics for human–AI conversations — a member of the
|
|
4
|
+
[`lens`](../) analyser family.
|
|
5
|
+
|
|
6
|
+
It scores a single conversation on two tiers:
|
|
7
|
+
|
|
8
|
+
1. **Analytics** (always on, offline): turn/word counts, prompt/response lengths,
|
|
9
|
+
question ratio, pushback hits, readability, sentiment trajectory, prompt
|
|
10
|
+
self-similarity, and temporal metrics when timestamps are present.
|
|
11
|
+
2. **Critical thinking** (opt-in, needs an LLM): classifies every human turn under
|
|
12
|
+
a 7-label prompt taxonomy, derives engagement ratios, an engagement **band**,
|
|
13
|
+
and a composite **0–100 critical-thinking score** with a component breakdown.
|
|
14
|
+
|
|
15
|
+
The taxonomy reuses the validated `NQ/FU/CH/EX/DG/AC/MT` scheme from the ISYS6020
|
|
16
|
+
marking pipeline (copied and forked). Design: `docs/superpowers/specs/2026-05-23-conversation-analyser-design.md`.
|
|
17
|
+
|
|
18
|
+
## Install
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install -e . # core: analytics + CLI + HTTP API
|
|
22
|
+
pip install -e '.[embeddings]' # + prompt self-similarity (sentence-transformers)
|
|
23
|
+
pip install -e '.[llm]' # + taxonomy/CT tier (anthropic)
|
|
24
|
+
pip install -e '.[embeddings,llm,dev]' # everything
|
|
25
|
+
export ANTHROPIC_API_KEY=... # required for the critical-thinking tier
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## CLI
|
|
29
|
+
|
|
30
|
+
Bare positional path to analyse (human summary by default, `--json` for machines);
|
|
31
|
+
`serve` subcommand for the HTTP API — same grammar as the rest of the family.
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
conversation-analyser transcript.txt # human summary, analytics only
|
|
35
|
+
conversation-analyser chat.json --json # full JSON to stdout
|
|
36
|
+
conversation-analyser chat.json --llm # add the critical-thinking tier
|
|
37
|
+
conversation-analyser log.json --idle-gap 45 # split sub-sessions on 45-min gaps
|
|
38
|
+
conversation-analyser raw.txt --parse-mode llm-segment --llm
|
|
39
|
+
conversation-analyser serve --port 8009 # run the HTTP API
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
The critical-thinking tier is **opt-in** (`--llm`) to avoid surprise API costs;
|
|
43
|
+
without it you get the analytics tier only.
|
|
44
|
+
|
|
45
|
+
## HTTP API
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
conversation-analyser serve --port 8009
|
|
49
|
+
curl -F file=@chat.json 'http://127.0.0.1:8009/analyse' # analytics only
|
|
50
|
+
curl -F file=@chat.json -F llm=true 'http://127.0.0.1:8009/analyse'
|
|
51
|
+
curl http://127.0.0.1:8009/health
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
`GET /health`, `GET /manifest`, and `POST /analyse` (multipart file upload, optional `llm` form
|
|
55
|
+
field) — the same `/analyse` contract auto-analyser routes to.
|
|
56
|
+
|
|
57
|
+
## Python API
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from conversation_analyser import ConversationAnalyser
|
|
61
|
+
|
|
62
|
+
result = ConversationAnalyser().analyse("transcript.txt", llm=True)
|
|
63
|
+
print(result.model_dump_json(indent=2))
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Input formats
|
|
67
|
+
|
|
68
|
+
A pluggable adapter registry tries, in order: structured adapters → heuristic
|
|
69
|
+
speaker markers → optional LLM segmentation → unsegmented fallback.
|
|
70
|
+
|
|
71
|
+
- **role/content** message list (OpenAI/Anthropic): `[{"role": "user", "content": "..."}, ...]`
|
|
72
|
+
- **AnythingLLM** rows: `[{"prompt": "...", "response": "...", "createdAt": ...}, ...]`
|
|
73
|
+
- **flat text** with speaker markers: `User:` / `Assistant:` / `Me:` / `ChatGPT:` /
|
|
74
|
+
`You said:` / `ChatGPT said:` / `Prompt:` / `Response:`
|
|
75
|
+
- anything else → LLM-segment (needs `[llm]`), else a single-blob fallback
|
|
76
|
+
|
|
77
|
+
`.pdf`/`.docx` inputs are text-extracted first (needs `pdfplumber`/`markitdown`,
|
|
78
|
+
or pre-extract with `document-analyser`).
|
|
79
|
+
|
|
80
|
+
## The taxonomy
|
|
81
|
+
|
|
82
|
+
| Code | Meaning |
|
|
83
|
+
|---|---|
|
|
84
|
+
| `NQ` | New Query — opens a new topic |
|
|
85
|
+
| `FU` | Follow-up — clarification/elaboration |
|
|
86
|
+
| `CH` | Challenge — pushes back, tests, asks why |
|
|
87
|
+
| `EX` | Extension — applies/compares/synthesises in a new direction |
|
|
88
|
+
| `DG` | Delegation — task hand-off, no engagement |
|
|
89
|
+
| `AC` | Acknowledgement — thanks/confirmation |
|
|
90
|
+
| `MT` | Meta — about the conversation itself |
|
|
91
|
+
|
|
92
|
+
`critical_thinking = (CH+EX)/turns`, `delegation = DG/turns`, `filler = (AC+MT)/turns`.
|
|
93
|
+
Bands: One-Shot · Delegator · Directed · Iterative · Critical.
|
|
94
|
+
|
|
95
|
+
## Graceful degradation
|
|
96
|
+
|
|
97
|
+
| Missing | Effect |
|
|
98
|
+
|---|---|
|
|
99
|
+
| `ANTHROPIC_API_KEY` / `[llm]` | `taxonomy`/`critical_thinking` null; analytics still produced; note `llm_unavailable` |
|
|
100
|
+
| `[embeddings]` | `prompt_self_similarity` null; note `embeddings_unavailable` |
|
|
101
|
+
| timestamps | temporal metrics omitted; no sub-session split; note `no timestamps` |
|
|
102
|
+
|
|
103
|
+
## Output
|
|
104
|
+
|
|
105
|
+
`ConversationAnalysis` → an `aggregate` (rolled up over all human turns, the
|
|
106
|
+
headline) plus one `SessionAnalysis` per idle-gap sub-session, each with
|
|
107
|
+
`analytics`, `taxonomy`, `critical_thinking`, and per-turn `turns` (label +
|
|
108
|
+
rationale + preview). See the design spec §8 for the full schema.
|
|
109
|
+
|
|
110
|
+
## Testing
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
pytest # fast, deterministic (LLM mocked, no network)
|
|
114
|
+
pytest -m slow # includes sentence-transformers model download
|
|
115
|
+
pytest -m integration # includes live LLM calls
|
|
116
|
+
```
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "conversation-analyser"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Critical-thinking and analytics for human-AI conversations — a member of the lens analyser family."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"pydantic>=2.0",
|
|
13
|
+
"textstat>=0.7",
|
|
14
|
+
"vaderSentiment>=3.3",
|
|
15
|
+
"pyspellchecker>=0.8",
|
|
16
|
+
"fastapi>=0.109.0",
|
|
17
|
+
"uvicorn[standard]>=0.27.0",
|
|
18
|
+
"python-multipart>=0.0.9",
|
|
19
|
+
"rich>=13.7.0",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[project.optional-dependencies]
|
|
23
|
+
embeddings = ["sentence-transformers>=2.2", "numpy>=1.24"]
|
|
24
|
+
llm = ["anthropic>=0.39"]
|
|
25
|
+
dev = ["pytest>=8.0", "httpx>=0.27.0"]
|
|
26
|
+
|
|
27
|
+
[project.scripts]
|
|
28
|
+
conversation-analyser = "conversation_analyser.cli:main"
|
|
29
|
+
|
|
30
|
+
[tool.hatch.build.targets.wheel]
|
|
31
|
+
packages = ["src/conversation_analyser"]
|
|
32
|
+
|
|
33
|
+
[tool.pytest.ini_options]
|
|
34
|
+
testpaths = ["tests"]
|
|
35
|
+
pythonpath = ["src"]
|
|
36
|
+
addopts = "-m 'not slow and not integration'"
|
|
37
|
+
markers = [
|
|
38
|
+
"slow: loads real models (sentence-transformers) or downloads weights — opt-in with -m slow",
|
|
39
|
+
"integration: makes live LLM calls — opt-in with -m integration",
|
|
40
|
+
]
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""conversation-analyser: critical-thinking + analytics for human-AI conversations.
|
|
2
|
+
|
|
3
|
+
Public API:
|
|
4
|
+
|
|
5
|
+
from conversation_analyser import ConversationAnalyser, ConversationAnalysis
|
|
6
|
+
|
|
7
|
+
result = ConversationAnalyser().analyse("transcript.txt")
|
|
8
|
+
print(result.model_dump_json(indent=2))
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from .manifest import MANIFEST
|
|
13
|
+
from .models import ConversationAnalysis
|
|
14
|
+
from .pipeline import ConversationAnalyser
|
|
15
|
+
|
|
16
|
+
__all__ = ["ConversationAnalyser", "ConversationAnalysis", "MANIFEST"]
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""Domain-neutral analytics tier (design spec §7).
|
|
2
|
+
|
|
3
|
+
All metrics derive deterministically from a session's turns. Optional/heavy
|
|
4
|
+
metrics (self-similarity) degrade to None when their dependency is absent; the
|
|
5
|
+
pipeline records a single note. Sentiment/readability/typo use the light core
|
|
6
|
+
deps and are wrapped defensively.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from collections import Counter
|
|
12
|
+
from statistics import fmean
|
|
13
|
+
|
|
14
|
+
from . import embeddings
|
|
15
|
+
from .models import AnalyticsMetrics
|
|
16
|
+
from .parsers.base import ParsedTurn
|
|
17
|
+
|
|
18
|
+
# Pushback cue regex, ported verbatim from marking_pipeline/transcript.py.
|
|
19
|
+
_PUSHBACK_RE = re.compile(
|
|
20
|
+
r"\b(no,|actually|but\b|wait\b|are you sure|that's wrong|incorrect|"
|
|
21
|
+
r"i disagree|not right|you're wrong|why\b|why is|why does)\b",
|
|
22
|
+
re.IGNORECASE,
|
|
23
|
+
)
|
|
24
|
+
_WORD_RE = re.compile(r"\w+")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _words(text: str) -> int:
    """Count the word tokens in *text*; a falsy value (``None``/empty) counts as zero."""
    tokens = _WORD_RE.findall(text if text else "")
    return len(tokens)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def pushback_count(human_turns: list[ParsedTurn]) -> int:
    """Return the total number of pushback-cue matches across *human_turns*."""
    hits = 0
    for turn in human_turns:
        hits += len(_PUSHBACK_RE.findall(turn.content))
    return hits
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _question_ratio(prompts: list[str]) -> float:
|
|
36
|
+
if not prompts:
|
|
37
|
+
return 0.0
|
|
38
|
+
asked = sum(1 for p in prompts if "?" in p)
|
|
39
|
+
return round(asked / len(prompts), 2)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _flesch(text: str) -> float | None:
|
|
43
|
+
if not text.strip():
|
|
44
|
+
return None
|
|
45
|
+
try:
|
|
46
|
+
import textstat
|
|
47
|
+
|
|
48
|
+
return round(float(textstat.flesch_reading_ease(text)), 1)
|
|
49
|
+
except Exception:
|
|
50
|
+
return None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _typo_rate(prompts: list[str]) -> float | None:
|
|
54
|
+
try:
|
|
55
|
+
from spellchecker import SpellChecker
|
|
56
|
+
except Exception:
|
|
57
|
+
return None
|
|
58
|
+
checker = SpellChecker()
|
|
59
|
+
rates: list[float] = []
|
|
60
|
+
for p in prompts:
|
|
61
|
+
words = [w.lower() for w in _WORD_RE.findall(p) if w.isalpha()]
|
|
62
|
+
if not words:
|
|
63
|
+
continue
|
|
64
|
+
unknown = checker.unknown(words)
|
|
65
|
+
rates.append(len(unknown) / len(words))
|
|
66
|
+
return round(fmean(rates), 3) if rates else None
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _sentiment(prompts: list[str]) -> tuple[float | None, float | None, float | None]:
|
|
70
|
+
if not prompts:
|
|
71
|
+
return None, None, None
|
|
72
|
+
try:
|
|
73
|
+
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
|
74
|
+
except Exception:
|
|
75
|
+
return None, None, None
|
|
76
|
+
analyzer = SentimentIntensityAnalyzer()
|
|
77
|
+
start = round(analyzer.polarity_scores(prompts[0])["compound"], 3)
|
|
78
|
+
end = round(analyzer.polarity_scores(prompts[-1])["compound"], 3)
|
|
79
|
+
return start, end, round(end - start, 3)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _temporal(turns: list[ParsedTurn]) -> tuple[float | None, int | None, int | None]:
|
|
83
|
+
stamps = [t.timestamp for t in turns if t.timestamp is not None]
|
|
84
|
+
if len(stamps) < 2:
|
|
85
|
+
return None, None, None
|
|
86
|
+
duration = (max(stamps) - min(stamps)).total_seconds() / 60.0
|
|
87
|
+
hour_mode = Counter(s.hour for s in stamps).most_common(1)[0][0]
|
|
88
|
+
weekday_mode = Counter(s.weekday() for s in stamps).most_common(1)[0][0]
|
|
89
|
+
return round(duration, 1), hour_mode, weekday_mode
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def compute_analytics(turns: list[ParsedTurn], *, with_embeddings: bool = True) -> AnalyticsMetrics:
    """Build the ``AnalyticsMetrics`` for one list of turns.

    Args:
        turns: The turns to summarise. Turns whose ``role`` is ``"human"``
            feed the prompt metrics and those whose ``role`` is
            ``"assistant"`` feed the response metrics; every turn counts
            towards ``turn_count`` and the temporal metrics.
        with_embeddings: When false, prompt self-similarity is left as
            ``None`` without consulting the embeddings module.

    Returns:
        A populated ``AnalyticsMetrics``.
    """
    human: list[ParsedTurn] = []
    assistant: list[ParsedTurn] = []
    for turn in turns:
        if turn.role == "human":
            human.append(turn)
        elif turn.role == "assistant":
            assistant.append(turn)

    prompts = [turn.content for turn in human]
    prompt_words = [_words(text) for text in prompts]
    response_words = [_words(turn.content) for turn in assistant]

    # Self-similarity is only attempted when requested and the backend is available.
    if with_embeddings and embeddings.available():
        similarity = embeddings.mean_self_similarity(prompts)
    else:
        similarity = None

    sent_start, sent_end, sent_delta = _sentiment(prompts)
    duration_min, hour_mode, weekday_mode = _temporal(turns)

    if prompt_words:
        mean_prompt = round(fmean(prompt_words), 2)
        longest_prompt = max(prompt_words)
    else:
        mean_prompt = 0.0
        longest_prompt = 0
    mean_response = round(fmean(response_words), 2) if response_words else 0.0

    return AnalyticsMetrics(
        turn_count=len(turns),
        human_turn_count=len(human),
        assistant_turn_count=len(assistant),
        total_words=sum(prompt_words) + sum(response_words),
        mean_prompt_len=mean_prompt,
        max_prompt_len=longest_prompt,
        mean_response_len=mean_response,
        question_ratio=_question_ratio(prompts),
        pushback_count=pushback_count(human),
        prompt_self_similarity=similarity,
        flesch_reading_ease=_flesch("\n\n".join(prompts)),
        mean_typo_rate=_typo_rate(prompts),
        sentiment_start=sent_start,
        sentiment_end=sent_end,
        sentiment_delta=sent_delta,
        duration_min=duration_min,
        hour_of_day_mode=hour_mode,
        weekday_mode=weekday_mode,
    )
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""FastAPI app for conversation-analyser, mirroring the lens family contract.
|
|
2
|
+
|
|
3
|
+
Module-level `app` so the CLI can launch it with
|
|
4
|
+
`uvicorn.run("conversation_analyser.api:app", ...)` and tests can drive it with
|
|
5
|
+
fastapi.testclient.TestClient. Endpoints: GET /health, GET /manifest, POST /analyse (file upload).
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import tempfile
|
|
10
|
+
import time
|
|
11
|
+
from importlib.metadata import version
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
|
|
15
|
+
|
|
16
|
+
from .manifest import MANIFEST
|
|
17
|
+
from .models import ConversationAnalysis
|
|
18
|
+
from .pipeline import ConversationAnalyser
|
|
19
|
+
|
|
20
|
+
_start_time = time.time()
|
|
21
|
+
|
|
22
|
+
app = FastAPI(title="conversation-analyser", version=version("conversation-analyser"))
|
|
23
|
+
|
|
24
|
+
_analyser = ConversationAnalyser()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@app.get("/health")
def health() -> dict:
    """Report service status, seconds elapsed since this module was loaded, and the package version."""
    uptime_seconds = round(time.time() - _start_time, 1)
    package_version = version("conversation-analyser")
    return {"status": "ok", "uptime": uptime_seconds, "version": package_version}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@app.get("/manifest")
def manifest() -> dict:
    """Serve the package's ``MANIFEST`` object unchanged."""
    return MANIFEST
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@app.post("/analyse", response_model=ConversationAnalysis)
async def analyse(
    file: UploadFile = File(...),
    llm: bool = Form(False),
) -> ConversationAnalysis:
    """Analyse an uploaded conversation file.

    The upload is written to a temporary file that keeps the upload's suffix
    (``.txt`` when it has none), handed to the shared analyser, and removed
    afterwards whether or not analysis succeeds.

    Args:
        file: The uploaded conversation.
        llm: Forwarded to the analyser as its ``llm`` flag.

    Raises:
        HTTPException: 422 for an empty upload or a ``ValueError`` from the
            analyser; 500 for any other failure.
    """
    content = await file.read()
    if not content:
        raise HTTPException(status_code=422, detail="Empty file")

    suffix = Path(file.filename or "upload.txt").suffix or ".txt"
    tmp_path: Path | None = None
    try:
        # Create and fill the temp file inside the try so the ``finally``
        # below also removes it when the write itself fails (delete=False
        # means nothing else will).
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            tmp_path = Path(tmp.name)
            tmp.write(content)
        # NOTE(review): this call is synchronous inside an async handler and
        # so appears to hold the event loop for the whole analysis; consider a
        # worker thread once the analyser's thread-safety is confirmed.
        return _analyser.analyse(tmp_path, llm=llm, input_label=file.filename or "<upload>")
    except ValueError as e:
        raise HTTPException(status_code=422, detail=str(e)) from e
    except Exception as e:  # noqa: BLE001
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if tmp_path is not None:
            tmp_path.unlink(missing_ok=True)
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""CLI entry point following the lens family pattern.
|
|
2
|
+
|
|
3
|
+
conversation-analyser <path> [--json] [--llm] [...] # analyse (default)
|
|
4
|
+
conversation-analyser serve [--host H] [--port P]      # run the HTTP API
conversation-analyser manifest                         # print the analyser manifest as JSON
|
|
5
|
+
|
|
6
|
+
Human-readable summary by default; `--json` emits the full ConversationAnalysis
|
|
7
|
+
to stdout (this is what auto-analyser consumes). Diagnostics go to stderr.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import json
|
|
13
|
+
import os
|
|
14
|
+
import sys
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
from .config import DEFAULT_PORT, IDLE_GAP_MIN
|
|
18
|
+
from .manifest import MANIFEST
|
|
19
|
+
from .models import ConversationAnalysis
|
|
20
|
+
from .pipeline import ConversationAnalyser
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def main() -> None:
    """Console entry point: dispatch a subcommand or analyse one file.

    ``serve`` and ``manifest`` are recognised only as the first argument and
    are handled before argparse runs; anything else is parsed as an analyse
    invocation. The result is printed as JSON with ``--json``, otherwise as a
    human-readable summary. Exits with status 1, after writing a message to
    stderr, when the file does not exist or analysis raises.
    """
    first_arg = sys.argv[1] if len(sys.argv) > 1 else None
    if first_arg == "serve":
        _serve(sys.argv[2:])
        return
    if first_arg == "manifest":
        print(json.dumps(MANIFEST, indent=2))
        return

    parser = argparse.ArgumentParser(
        description="Analyse a human-AI conversation: analytics + critical-thinking taxonomy.",
        prog="conversation-analyser",
    )
    parser.add_argument("file", type=Path, help="conversation file (.json/.txt/.md/.pdf)")
    parser.add_argument("--json", dest="as_json", action="store_true", help="JSON output")
    parser.add_argument(
        "--llm",
        action="store_true",
        help="add the taxonomy/critical-thinking tier (needs [llm] + ANTHROPIC_API_KEY)",
    )
    parser.add_argument("--no-embeddings", action="store_true", help="skip prompt self-similarity")
    parser.add_argument(
        "--parse-mode",
        default="auto",
        choices=("auto", "structured", "heuristic", "llm-segment"),
    )
    parser.add_argument(
        "--idle-gap", type=float, default=IDLE_GAP_MIN, help="sub-session split (minutes)"
    )
    options = parser.parse_args()

    target = options.file
    if not target.exists():
        print(f"Error: file not found: {target}", file=sys.stderr)
        sys.exit(1)

    try:
        # The analyser is constructed inside the guard so that a failure
        # there is reported the same way as a failure during analysis.
        result = ConversationAnalyser(idle_gap_min=options.idle_gap).analyse(
            target,
            llm=options.llm,
            with_embeddings=not options.no_embeddings,
            parse_mode=options.parse_mode,
        )
    except Exception as exc:  # noqa: BLE001
        prefix = "Error" if isinstance(exc, ValueError) else "Analysis failed"
        print(f"{prefix}: {exc}", file=sys.stderr)
        sys.exit(1)

    if not options.as_json:
        _print_human(result)
        return
    print(result.model_dump_json(indent=2))
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _print_human(result: ConversationAnalysis) -> None:
    """Print a human-readable summary of *result* to stdout using rich.

    Shows the input/format header and the aggregate turn and word counts, then
    either the critical-thinking score with a label-count table or a notice
    that the tier was skipped, followed by any notes.

    Values taken from the analysis (input label, format, parse mode, band,
    notes) are escaped before being embedded in rich markup, so a file name
    such as ``chat[/b].json`` is shown literally instead of being parsed as a
    style tag, which could drop text or raise ``MarkupError``.
    """
    from rich.console import Console
    from rich.markup import escape
    from rich.table import Table

    def lit(value: object) -> str:
        # Format exactly as the f-string would, then neutralise markup tags.
        return escape(f"{value}")

    console = Console(file=sys.stdout)
    agg = result.aggregate
    a = agg.analytics

    console.print(
        f"[bold]Input:[/bold] {lit(result.input)} "
        f"[bold]Format:[/bold] {lit(result.format_detected)} ({lit(result.parse_mode)}) "
        f"[bold]Sessions:[/bold] {result.session_count} "
        f"[bold]LLM:[/bold] {'yes' if result.llm_used else 'no'}"
    )
    console.print(
        f"[bold]Turns:[/bold] {a.turn_count} "
        f"(human {a.human_turn_count}, ai {a.assistant_turn_count}) "
        f"[bold]Words:[/bold] {a.total_words} "
        f"[bold]Questions:[/bold] {a.question_ratio:.0%} "
        f"[bold]Pushback:[/bold] {a.pushback_count}"
    )

    if agg.critical_thinking is not None and agg.taxonomy is not None:
        ct = agg.critical_thinking
        console.print(
            f"[bold]Critical-thinking score:[/bold] {ct.score:.0f}/100 "
            f"[bold]Band:[/bold] {lit(ct.band)} "
            f"[bold]Longest engaged chain:[/bold] {agg.taxonomy.longest_engaged_chain}"
        )
        # One column per taxonomy label, with a single row of counts.
        table = Table(show_header=True, header_style="bold")
        for code in agg.taxonomy.label_counts:
            table.add_column(code, justify="right")
        table.add_row(*[str(v) for v in agg.taxonomy.label_counts.values()])
        console.print(table)
    else:
        console.print("[dim]Critical-thinking tier skipped (run with --llm).[/dim]")

    if result.notes:
        console.print(f"[dim]Notes: {lit(', '.join(result.notes))}[/dim]")
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _serve(argv: list[str]) -> None:
    """Run the HTTP API under uvicorn.

    Args:
        argv: The arguments following the ``serve`` subcommand
            (``--host`` / ``--port``).

    Defaults come from the ``CONVERSATION_ANALYSER_PORT`` and
    ``CONVERSATION_ANALYSER_HOST`` environment variables, falling back to
    ``DEFAULT_PORT`` and ``127.0.0.1``.
    """
    import uvicorn

    parser = argparse.ArgumentParser(prog="conversation-analyser serve")
    # The environment value is passed as a *string* default. argparse applies
    # ``type=int`` to string defaults itself, and only when ``--port`` is not
    # on the command line. A malformed variable therefore yields a normal
    # argparse usage error instead of an uncaught ValueError, and no longer
    # breaks an explicit ``--port``.
    parser.add_argument(
        "--port", type=int, default=os.getenv("CONVERSATION_ANALYSER_PORT", str(DEFAULT_PORT))
    )
    parser.add_argument(
        "--host", default=os.getenv("CONVERSATION_ANALYSER_HOST", "127.0.0.1")
    )
    args = parser.parse_args(argv)
    uvicorn.run("conversation_analyser.api:app", host=args.host, port=args.port)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
if __name__ == "__main__":
|
|
129
|
+
main()
|