soaking 0.3.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- soaking-0.3.5/PKG-INFO +230 -0
- soaking-0.3.5/README.md +169 -0
- soaking-0.3.5/pyproject.toml +99 -0
- soaking-0.3.5/setup.cfg +4 -0
- soaking-0.3.5/soak/__init__.py +1 -0
- soaking-0.3.5/soak/agreement.py +321 -0
- soaking-0.3.5/soak/agreement_scripts.py +445 -0
- soaking-0.3.5/soak/cli.py +1876 -0
- soaking-0.3.5/soak/comparators/__init__.py +1 -0
- soaking-0.3.5/soak/comparators/base.py +51 -0
- soaking-0.3.5/soak/comparators/similarity_comparator.py +2357 -0
- soaking-0.3.5/soak/coverage/__init__.py +13 -0
- soaking-0.3.5/soak/coverage/analyzer.py +1309 -0
- soaking-0.3.5/soak/coverage/models.py +99 -0
- soaking-0.3.5/soak/document_utils.py +362 -0
- soaking-0.3.5/soak/error_handlers.py +420 -0
- soaking-0.3.5/soak/export_utils.py +203 -0
- soaking-0.3.5/soak/ground_truth_metrics.py +636 -0
- soaking-0.3.5/soak/helpers.py +261 -0
- soaking-0.3.5/soak/models/__init__.py +85 -0
- soaking-0.3.5/soak/models/alignment.py +350 -0
- soaking-0.3.5/soak/models/base.py +683 -0
- soaking-0.3.5/soak/models/cost_tracker.py +72 -0
- soaking-0.3.5/soak/models/dag.py +1013 -0
- soaking-0.3.5/soak/models/nodes/__init__.py +40 -0
- soaking-0.3.5/soak/models/nodes/base.py +1052 -0
- soaking-0.3.5/soak/models/nodes/batch.py +122 -0
- soaking-0.3.5/soak/models/nodes/classifier.py +632 -0
- soaking-0.3.5/soak/models/nodes/cluster.py +642 -0
- soaking-0.3.5/soak/models/nodes/filter.py +497 -0
- soaking-0.3.5/soak/models/nodes/groupby.py +235 -0
- soaking-0.3.5/soak/models/nodes/map.py +262 -0
- soaking-0.3.5/soak/models/nodes/reduce.py +304 -0
- soaking-0.3.5/soak/models/nodes/scrub.py +724 -0
- soaking-0.3.5/soak/models/nodes/transform.py +337 -0
- soaking-0.3.5/soak/models/nodes/ungroup.py +48 -0
- soaking-0.3.5/soak/models/nodes/verify.py +1507 -0
- soaking-0.3.5/soak/models/pipeline.py +227 -0
- soaking-0.3.5/soak/models/progress.py +79 -0
- soaking-0.3.5/soak/models/text_utils.py +240 -0
- soaking-0.3.5/soak/models/utils.py +714 -0
- soaking-0.3.5/soak/pipelines/check_quote_fair_use.sd +62 -0
- soaking-0.3.5/soak/pipelines/classifier.soak +127 -0
- soaking-0.3.5/soak/pipelines/classifier_tabular.soak +99 -0
- soaking-0.3.5/soak/pipelines/consolidate_codes.sd +58 -0
- soaking-0.3.5/soak/pipelines/demo.soak +91 -0
- soaking-0.3.5/soak/pipelines/filter_chunks.sd +16 -0
- soaking-0.3.5/soak/pipelines/final_themes.sd +184 -0
- soaking-0.3.5/soak/pipelines/initial_codes.sd +62 -0
- soaking-0.3.5/soak/pipelines/make_theme_group.sd +85 -0
- soaking-0.3.5/soak/pipelines/meta_narrative.sd +52 -0
- soaking-0.3.5/soak/pipelines/narrative_report.sd +84 -0
- soaking-0.3.5/soak/pipelines/pre_extract_relevant.sd +36 -0
- soaking-0.3.5/soak/pipelines/test.soak +60 -0
- soaking-0.3.5/soak/pipelines/verify.soak +34 -0
- soaking-0.3.5/soak/pipelines/zs.soak +99 -0
- soaking-0.3.5/soak/soak-data/annomi.csv +13553 -0
- soaking-0.3.5/soak/soak-data/annomi_with_ctx_train.csv +2654 -0
- soaking-0.3.5/soak/soak-data/cfs/README.txt +2 -0
- soaking-0.3.5/soak/soak-data/cfs/aGKbypa8fhI.txt +51 -0
- soaking-0.3.5/soak/soak-data/cfs/aOUUTEeIiS0.txt +61 -0
- soaking-0.3.5/soak/soak-data/cfs/aS8CQtc9DmA.txt +35 -0
- soaking-0.3.5/soak/soak-data/cfs/aTvSX_toNL4.txt +87 -0
- soaking-0.3.5/soak/soak-data/cfs/acWL9FBKr3o.txt +87 -0
- soaking-0.3.5/soak/soak-data/cfs/as7I55hY29k.txt +193 -0
- soaking-0.3.5/soak/soak-data/cfs/bRidO1PiZJs.txt +123 -0
- soaking-0.3.5/soak/soak-data/cfs/ba2LcetNybI.txt +151 -0
- soaking-0.3.5/soak/soak-data/cfsb/bRidO1PiZJs.txt +123 -0
- soaking-0.3.5/soak/soak-data/cfsb/ba2LcetNybI.txt +151 -0
- soaking-0.3.5/soak/soak-data/cfsb.zip +0 -0
- soaking-0.3.5/soak/soak-data/constitution.txt +5836 -0
- soaking-0.3.5/soak/soak-data/dream.txt +5 -0
- soaking-0.3.5/soak/soak-data/examplethemes.xlsx +0 -0
- soaking-0.3.5/soak/soak-data/interview_sample.txt +19 -0
- soaking-0.3.5/soak/soak-data/matthew-57.txt +76 -0
- soaking-0.3.5/soak/soak-data/pii.txt +39 -0
- soaking-0.3.5/soak/soak-data/test_data.csv +5 -0
- soaking-0.3.5/soak/soak-data/wheel.txt +110 -0
- soaking-0.3.5/soak/specs.py +165 -0
- soaking-0.3.5/soak/template_resolution.py +143 -0
- soaking-0.3.5/soak/template_validator.py +259 -0
- soaking-0.3.5/soak/templates/comparison.html +1017 -0
- soaking-0.3.5/soak/templates/coverage.html +485 -0
- soaking-0.3.5/soak/templates/does_quote_exist.sd +34 -0
- soaking-0.3.5/soak/templates/narrative.html +58 -0
- soaking-0.3.5/soak/templates/nodes/batch.html +10 -0
- soaking-0.3.5/soak/templates/nodes/classifier.html +31 -0
- soaking-0.3.5/soak/templates/nodes/cluster.html +24 -0
- soaking-0.3.5/soak/templates/nodes/default.html +16 -0
- soaking-0.3.5/soak/templates/nodes/filter.html +33 -0
- soaking-0.3.5/soak/templates/nodes/groupby.html +33 -0
- soaking-0.3.5/soak/templates/nodes/map.html +21 -0
- soaking-0.3.5/soak/templates/nodes/reduce.html +19 -0
- soaking-0.3.5/soak/templates/nodes/scrub.html +33 -0
- soaking-0.3.5/soak/templates/nodes/split.html +46 -0
- soaking-0.3.5/soak/templates/nodes/transform.html +44 -0
- soaking-0.3.5/soak/templates/nodes/transformreduce.html +34 -0
- soaking-0.3.5/soak/templates/nodes/ungroup.html +26 -0
- soaking-0.3.5/soak/templates/nodes/verifyquotes.html +34 -0
- soaking-0.3.5/soak/templates/pipeline.html +242 -0
- soaking-0.3.5/soak/templates/simple.html +277 -0
- soaking-0.3.5/soak/templates/summarise_quote_use.sd +50 -0
- soaking-0.3.5/soak/templates/verify_theme_quotes.sd +47 -0
- soaking-0.3.5/soak/utils/__init__.py +42 -0
- soaking-0.3.5/soak/visualization.py +151 -0
- soaking-0.3.5/soaking.egg-info/PKG-INFO +230 -0
- soaking-0.3.5/soaking.egg-info/SOURCES.txt +119 -0
- soaking-0.3.5/soaking.egg-info/dependency_links.txt +1 -0
- soaking-0.3.5/soaking.egg-info/entry_points.txt +2 -0
- soaking-0.3.5/soaking.egg-info/requires.txt +53 -0
- soaking-0.3.5/soaking.egg-info/top_level.txt +1 -0
- soaking-0.3.5/tests/test_agreement.py +21 -0
- soaking-0.3.5/tests/test_agreement_validation.py +231 -0
- soaking-0.3.5/tests/test_error_handling.py +227 -0
- soaking-0.3.5/tests/test_ground_truth.py +194 -0
- soaking-0.3.5/tests/test_integration_pipelines.py +242 -0
- soaking-0.3.5/tests/test_nodes.py +408 -0
- soaking-0.3.5/tests/test_overlap_exclusion.py +77 -0
- soaking-0.3.5/tests/test_spreadsheet_input.py +174 -0
- soaking-0.3.5/tests/test_template_resolution.py +242 -0
- soaking-0.3.5/tests/test_verify_comprehensive.py +225 -0
soaking-0.3.5/PKG-INFO
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: soaking
|
|
3
|
+
Version: 0.3.5
|
|
4
|
+
Summary: soak: graph-based pipelines and tools for LLM-assisted qualitative text analysis
|
|
5
|
+
Author-email: Ben Whalley <ben.whalley@plymouth.ac.uk>
|
|
6
|
+
License: AGPL-3.0-or-later
|
|
7
|
+
Requires-Python: <3.13,>=3.12
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: instructor>=1.10.0
|
|
10
|
+
Requires-Dist: jinja2>=3.1.6
|
|
11
|
+
Requires-Dist: lark>=1.2.2
|
|
12
|
+
Requires-Dist: matplotlib>=3.10.3
|
|
13
|
+
Requires-Dist: networkx>=3.5
|
|
14
|
+
Requires-Dist: pandas>=2.3.1
|
|
15
|
+
Requires-Dist: pdfplumber>=0.11.7
|
|
16
|
+
Requires-Dist: pydantic>=2.11.7
|
|
17
|
+
Requires-Dist: python-box>=7.3.2
|
|
18
|
+
Requires-Dist: python-decouple>=3.8
|
|
19
|
+
Requires-Dist: python-docx>=1.2.0
|
|
20
|
+
Requires-Dist: python-magic>=0.4.27
|
|
21
|
+
Requires-Dist: scikit-learn>=1.7.1
|
|
22
|
+
Requires-Dist: scipy>=1.14
|
|
23
|
+
Requires-Dist: seaborn>=0.13.2
|
|
24
|
+
Requires-Dist: tiktoken>=0.9.0
|
|
25
|
+
Requires-Dist: typer>=0.16.0
|
|
26
|
+
Requires-Dist: umap-learn
|
|
27
|
+
Requires-Dist: asyncpg>=0.30.0
|
|
28
|
+
Requires-Dist: jinja-markdown>=1.210911
|
|
29
|
+
Requires-Dist: struckdown
|
|
30
|
+
Requires-Dist: trogon>=0.6.0
|
|
31
|
+
Requires-Dist: nltk>=3.9.2
|
|
32
|
+
Requires-Dist: rank-bm25>=0.2.2
|
|
33
|
+
Requires-Dist: openpyxl>=3.1.0
|
|
34
|
+
Requires-Dist: statsmodels>=0.14.0
|
|
35
|
+
Requires-Dist: krippendorff>=0.6.0
|
|
36
|
+
Requires-Dist: pyirr>=0.84.1.2
|
|
37
|
+
Requires-Dist: setuptools>=80.9.0
|
|
38
|
+
Requires-Dist: irrcac
|
|
39
|
+
Requires-Dist: pysbd>=0.3.4
|
|
40
|
+
Requires-Dist: tqdm>=4.67.0
|
|
41
|
+
Requires-Dist: simpleeval>=1.0.3
|
|
42
|
+
Requires-Dist: mkdocs>=1.6.0
|
|
43
|
+
Requires-Dist: mkdocs-material>=9.5.0
|
|
44
|
+
Requires-Dist: pymdown-extensions>=10.11.0
|
|
45
|
+
Requires-Dist: graphviz>=0.20.0
|
|
46
|
+
Requires-Dist: scrubadub>=2.0.1
|
|
47
|
+
Requires-Dist: spacy<3.9,>=3.8.4
|
|
48
|
+
Requires-Dist: spacy-transformers
|
|
49
|
+
Requires-Dist: scrubadub-spacy
|
|
50
|
+
Requires-Dist: transformers>=4.51.0
|
|
51
|
+
Requires-Dist: sentence-transformers>=2.5.1
|
|
52
|
+
Requires-Dist: pot>=0.9.6.post1
|
|
53
|
+
Requires-Dist: pyphen>=0.16.0
|
|
54
|
+
Requires-Dist: plotly>=5.18.0
|
|
55
|
+
Requires-Dist: tenacity>=8.2.0
|
|
56
|
+
Requires-Dist: hdbscan>=0.8.33
|
|
57
|
+
Provides-Extra: scrub
|
|
58
|
+
Requires-Dist: scrubadub>=2.0.0; extra == "scrub"
|
|
59
|
+
Requires-Dist: scrubadub-spacy>=2.0.0; extra == "scrub"
|
|
60
|
+
Requires-Dist: spacy<3.9,>=3.8.4; extra == "scrub"
|
|
61
|
+
|
|
62
|
+
# Get to saturation faster!
|
|
63
|
+
|
|
64
|
+
<img src="docs/logo-sm.png" width="15%">
|
|
65
|
+
|
|
66
|
+
**`soak` is a tool to enable qualitative researchers to rapidly define and run llm-assisted text analysis pipelines and thematic analysis.**
|
|
67
|
+
|
|
68
|
+
The easiest way to see what `soak` does is to see sample outputs from the system.
|
|
69
|
+
|
|
70
|
+
The Zero-shot pipeline diagram shows the various stages the analysis involves:
|
|
71
|
+
|
|
72
|
+

|
|
73
|
+
|
|
74
|
+
Input text from [patient interviews](soak/data/cfs/):
|
|
75
|
+
|
|
76
|
+

|
|
77
|
+
|
|
78
|
+
Sample theme extracted:
|
|
79
|
+
|
|
80
|
+

|
|
81
|
+
|
|
82
|
+
Matching LLM extracted quotes to source text to detect hallucinations:
|
|
83
|
+
|
|
84
|
+

|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
A classification prompt, extracting structured data from transcripts. The green element is the templated input. The blue elements like `[[this]]` indicate the LLM-completions. Prompts are written in [struckdown](https://github.com/benwhalley/struckdown), a simple text-based format used to constrain the LLM output to a specific data type/structure.
|
|
88
|
+
|
|
89
|
+

|
|
90
|
+
|
|
91
|
+
Inter-rater agreement and ground truth validation statistics, calculated for structured data extracted from transcripts:
|
|
92
|
+
|
|
93
|
+

|
|
94
|
+
|
|
95
|
+
**Ground truth validation:** Classifier nodes can automatically validate LLM outputs against ground truth labels, calculating precision, recall, F1, and confusion matrices:
|
|
96
|
+
|
|
97
|
+
```yaml
|
|
98
|
+
ground_truths:
|
|
99
|
+
reflection:
|
|
100
|
+
existing: reflection_exists # Ground truth column
|
|
101
|
+
mapping: {yes: 1, no: 0} # Map LLM outputs to GT values
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
See [Ground Truth Validation](docs/how-to/ground-truth-validation.md) for details.
|
|
105
|
+
|
|
106
|
+
Plots and similarity statistics quantify the similarity between sets of themes created by different analyses. For example we might compare different LLMs, different datasets (patients vs doctors) or different prompts (amending the research question posed to the LLM). The heatmap reveals common themes between different analyses or datasets:
|
|
107
|
+
|
|
108
|
+

|
|
109
|
+
|
|
110
|
+
Similarity statistics quantify the similarity between sets of themes created by different analyses.
|
|
111
|
+
|
|
112
|
+

|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
### Sample outputs
|
|
116
|
+
|
|
117
|
+
- [cfs1_simple.html](https://benwhalley.github.io/soak/samples/cfs1_simple.html) shows a thematic analysis of transcripts of 8 patients with ME/CFS or Long COVID.
|
|
118
|
+
|
|
119
|
+
- [cfs2_pipeline.html](https://benwhalley.github.io/soak/samples/cfs2_simple.html) shows the same analysis using a different LLM model, and in extended HTML format.
|
|
120
|
+
|
|
121
|
+
- [comparison.html](https://benwhalley.github.io/soak/samples/comparison.html) shows the comparison of these two analyses.
|
|
122
|
+
|
|
123
|
+
- [20251008_085446_5db6_pipeline.html](https://benwhalley.github.io/soak/samples/classifier/20251008_085446_5db6_pipeline.html) shows the result of a different pipeline extracting structured data from the transcripts (results are also available as json and csv).
|
|
124
|
+
|
|
125
|
+
### Example pipeline specifications
|
|
126
|
+
|
|
127
|
+
- [soak/pipelines/zs.soak](soak/pipelines/zs.soak) is the Zero-shot pipeline used in the sample outputs above.
|
|
128
|
+
|
|
129
|
+
- [classifier.soak](docs/samples/classifier/classifier.soak) is the classifier pipeline used in the sample output above.
|
|
130
|
+
|
|
131
|
+
## Quick Start
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
# install
|
|
135
|
+
git clone https://github.com/benwhalley/soak
|
|
136
|
+
uv tool install .
|
|
137
|
+
|
|
138
|
+
# set credentials, using openai for simplicity
|
|
139
|
+
export LLM_API_KEY=your_api_key
|
|
140
|
+
export LLM_API_BASE=https://api.openai.com/v1
|
|
141
|
+
|
|
142
|
+
# Run analysis
|
|
143
|
+
soak zs soak/data/cfs/*.txt -t simple -o cfs-simple-1
|
|
144
|
+
|
|
145
|
+
# Open results in a browser
|
|
146
|
+
open cfs-simple-1_simple.html
|
|
147
|
+
|
|
148
|
+
# Re-run with a different/better model
|
|
149
|
+
soak zs -o cfs-simple-2 --model-name="openai/gpt-4o" soak/data/cfs/*.txt
|
|
150
|
+
|
|
151
|
+
# Compare results
|
|
152
|
+
soak compare cfs-simple-1.json cfs-simple-2.json -o comparison.html
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
## More usage
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
# Basic pattern
|
|
160
|
+
uv run soak <pipeline> <files> --output <name>
|
|
161
|
+
|
|
162
|
+
# Run demo pipeline on sample text files
|
|
163
|
+
uv run soak demo --output demo_analysis soak/data/cfs/*.txt
|
|
164
|
+
|
|
165
|
+
# Use the 'simple' html output template
|
|
166
|
+
uv run soak zs -t simple --output analysis_simple soak/data/cfs/*.txt
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### Working with CSV/XLSX Spreadsheets
|
|
170
|
+
|
|
171
|
+
CSV and XLSX files are fully supported. Each row becomes a separate document, with column values accessible in templates as `{{column_name}}`.
|
|
172
|
+
|
|
173
|
+
**Example data** (`soak/data/test_data.csv`):
|
|
174
|
+
```csv
|
|
175
|
+
participant_id,age,condition,response
|
|
176
|
+
P001,25,control,I felt very relaxed during the session
|
|
177
|
+
P002,32,treatment,The intervention helped me focus better
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
**Run classifier on CSV:**
|
|
181
|
+
```bash
|
|
182
|
+
uv run soak classifier_tabular --output csv_analysis soak/data/test_data.csv
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
**Pipeline template accessing columns:**
|
|
186
|
+
|
|
187
|
+
```yaml
|
|
188
|
+
# pipeline.soak
|
|
189
|
+
nodes:
|
|
190
|
+
- name: analyze
|
|
191
|
+
type: Map
|
|
192
|
+
inputs: [documents]
|
|
193
|
+
---#analyze
|
|
194
|
+
Participant {{participant_id}} (age {{age}}, {{condition}} group):
|
|
195
|
+
{{response}}
|
|
196
|
+
|
|
197
|
+
Summarize the response: [[summary:str]]
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
**Sampling options:**
|
|
201
|
+
```bash
|
|
202
|
+
# Process first 10 rows only (useful for testing)
|
|
203
|
+
uv run soak classifier_tabular --head 10 --output test_run survey.csv
|
|
204
|
+
|
|
205
|
+
# Randomly sample 50 rows
|
|
206
|
+
uv run soak classifier_tabular --sample 50 --output pilot survey.csv
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
See [Working with Spreadsheet Data](docs/how-to/working-with-spreadsheet-data.md) for more details.
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
**Common Options:**
|
|
213
|
+
- `--output, -o`: Output filename (generates .json dump file and .html)
|
|
214
|
+
- `--model-name`: LLM model (default: gpt-4o-mini)
|
|
215
|
+
- `-c, --context`: Pipeline context variables (e.g., `-c research_question="Experiences of patients with COVID-19"`)
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
## Documentation
|
|
219
|
+
|
|
220
|
+
- [Docs index](docs/index.md)
|
|
221
|
+
- [Getting started](docs/tutorials/getting-started.md)
|
|
222
|
+
|
|
223
|
+
See [CLAUDE.md](CLAUDE.md) for architecture details.
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
## License
|
|
227
|
+
|
|
228
|
+
AGPL v3 or later
|
|
229
|
+
|
|
230
|
+
Please cite: Ben Whalley. (2025). benwhalley/soak: Initial public release (v0.3.0). Zenodo. https://doi.org/10.5281/zenodo.17293023
|
soaking-0.3.5/README.md
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
# Get to saturation faster!
|
|
2
|
+
|
|
3
|
+
<img src="docs/logo-sm.png" width="15%">
|
|
4
|
+
|
|
5
|
+
**`soak` is a tool to enable qualitative researchers to rapidly define and run llm-assisted text analysis pipelines and thematic analysis.**
|
|
6
|
+
|
|
7
|
+
The easiest way to see what `soak` does is to see sample outputs from the system.
|
|
8
|
+
|
|
9
|
+
The Zero-shot pipeline diagram shows the various stages the analysis involves:
|
|
10
|
+
|
|
11
|
+

|
|
12
|
+
|
|
13
|
+
Input text from [patient interviews](soak/data/cfs/):
|
|
14
|
+
|
|
15
|
+

|
|
16
|
+
|
|
17
|
+
Sample theme extracted:
|
|
18
|
+
|
|
19
|
+

|
|
20
|
+
|
|
21
|
+
Matching LLM extracted quotes to source text to detect hallucinations:
|
|
22
|
+
|
|
23
|
+

|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
A classification prompt, extracting structured data from transcripts. The green element is the templated input. The blue elements like `[[this]]` indicate the LLM-completions. Prompts are written in [struckdown](https://github.com/benwhalley/struckdown), a simple text-based format used to constrain the LLM output to a specific data type/structure.
|
|
27
|
+
|
|
28
|
+

|
|
29
|
+
|
|
30
|
+
Inter-rater agreement and ground truth validation statistics, calculated for structured data extracted from transcripts:
|
|
31
|
+
|
|
32
|
+

|
|
33
|
+
|
|
34
|
+
**Ground truth validation:** Classifier nodes can automatically validate LLM outputs against ground truth labels, calculating precision, recall, F1, and confusion matrices:
|
|
35
|
+
|
|
36
|
+
```yaml
|
|
37
|
+
ground_truths:
|
|
38
|
+
reflection:
|
|
39
|
+
existing: reflection_exists # Ground truth column
|
|
40
|
+
mapping: {yes: 1, no: 0} # Map LLM outputs to GT values
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
See [Ground Truth Validation](docs/how-to/ground-truth-validation.md) for details.
|
|
44
|
+
|
|
45
|
+
Plots and similarity statistics quantify the similarity between sets of themes created by different analyses. For example we might compare different LLMs, different datasets (patients vs doctors) or different prompts (amending the research question posed to the LLM). The heatmap reveals common themes between different analyses or datasets:
|
|
46
|
+
|
|
47
|
+

|
|
48
|
+
|
|
49
|
+
Similarity statistics quantify the similarity between sets of themes created by different analyses.
|
|
50
|
+
|
|
51
|
+

|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
### Sample outputs
|
|
55
|
+
|
|
56
|
+
- [cfs1_simple.html](https://benwhalley.github.io/soak/samples/cfs1_simple.html) shows a thematic analysis of transcripts of 8 patients with ME/CFS or Long COVID.
|
|
57
|
+
|
|
58
|
+
- [cfs2_pipeline.html](https://benwhalley.github.io/soak/samples/cfs2_simple.html) shows the same analysis using a different LLM model, and in extended HTML format.
|
|
59
|
+
|
|
60
|
+
- [comparison.html](https://benwhalley.github.io/soak/samples/comparison.html) shows the comparison of these two analyses.
|
|
61
|
+
|
|
62
|
+
- [20251008_085446_5db6_pipeline.html](https://benwhalley.github.io/soak/samples/classifier/20251008_085446_5db6_pipeline.html) shows the result of a different pipeline extracting structured data from the transcripts (results are also available as json and csv).
|
|
63
|
+
|
|
64
|
+
### Example pipeline specifications
|
|
65
|
+
|
|
66
|
+
- [soak/pipelines/zs.soak](soak/pipelines/zs.soak) is the Zero-shot pipeline used in the sample outputs above.
|
|
67
|
+
|
|
68
|
+
- [classifier.soak](docs/samples/classifier/classifier.soak) is the classifier pipeline used in the sample output above.
|
|
69
|
+
|
|
70
|
+
## Quick Start
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
# install
|
|
74
|
+
git clone https://github.com/benwhalley/soak
|
|
75
|
+
uv tool install .
|
|
76
|
+
|
|
77
|
+
# set credentials, using openai for simplicity
|
|
78
|
+
export LLM_API_KEY=your_api_key
|
|
79
|
+
export LLM_API_BASE=https://api.openai.com/v1
|
|
80
|
+
|
|
81
|
+
# Run analysis
|
|
82
|
+
soak zs soak/data/cfs/*.txt -t simple -o cfs-simple-1
|
|
83
|
+
|
|
84
|
+
# Open results in a browser
|
|
85
|
+
open cfs-simple-1_simple.html
|
|
86
|
+
|
|
87
|
+
# Re-run with a different/better model
|
|
88
|
+
soak zs -o cfs-simple-2 --model-name="openai/gpt-4o" soak/data/cfs/*.txt
|
|
89
|
+
|
|
90
|
+
# Compare results
|
|
91
|
+
soak compare cfs-simple-1.json cfs-simple-2.json -o comparison.html
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
## More usage
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
# Basic pattern
|
|
99
|
+
uv run soak <pipeline> <files> --output <name>
|
|
100
|
+
|
|
101
|
+
# Run demo pipeline on sample text files
|
|
102
|
+
uv run soak demo --output demo_analysis soak/data/cfs/*.txt
|
|
103
|
+
|
|
104
|
+
# Use the 'simple' html output template
|
|
105
|
+
uv run soak zs -t simple --output analysis_simple soak/data/cfs/*.txt
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### Working with CSV/XLSX Spreadsheets
|
|
109
|
+
|
|
110
|
+
CSV and XLSX files are fully supported. Each row becomes a separate document, with column values accessible in templates as `{{column_name}}`.
|
|
111
|
+
|
|
112
|
+
**Example data** (`soak/data/test_data.csv`):
|
|
113
|
+
```csv
|
|
114
|
+
participant_id,age,condition,response
|
|
115
|
+
P001,25,control,I felt very relaxed during the session
|
|
116
|
+
P002,32,treatment,The intervention helped me focus better
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
**Run classifier on CSV:**
|
|
120
|
+
```bash
|
|
121
|
+
uv run soak classifier_tabular --output csv_analysis soak/data/test_data.csv
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
**Pipeline template accessing columns:**
|
|
125
|
+
|
|
126
|
+
```yaml
|
|
127
|
+
# pipeline.soak
|
|
128
|
+
nodes:
|
|
129
|
+
- name: analyze
|
|
130
|
+
type: Map
|
|
131
|
+
inputs: [documents]
|
|
132
|
+
---#analyze
|
|
133
|
+
Participant {{participant_id}} (age {{age}}, {{condition}} group):
|
|
134
|
+
{{response}}
|
|
135
|
+
|
|
136
|
+
Summarize the response: [[summary:str]]
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
**Sampling options:**
|
|
140
|
+
```bash
|
|
141
|
+
# Process first 10 rows only (useful for testing)
|
|
142
|
+
uv run soak classifier_tabular --head 10 --output test_run survey.csv
|
|
143
|
+
|
|
144
|
+
# Randomly sample 50 rows
|
|
145
|
+
uv run soak classifier_tabular --sample 50 --output pilot survey.csv
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
See [Working with Spreadsheet Data](docs/how-to/working-with-spreadsheet-data.md) for more details.
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
**Common Options:**
|
|
152
|
+
- `--output, -o`: Output filename (generates .json dump file and .html)
|
|
153
|
+
- `--model-name`: LLM model (default: gpt-4o-mini)
|
|
154
|
+
- `-c, --context`: Pipeline context variables (e.g., `-c research_question="Experiences of patients with COVID-19"`)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
## Documentation
|
|
158
|
+
|
|
159
|
+
- [Docs index](docs/index.md)
|
|
160
|
+
- [Getting started](docs/tutorials/getting-started.md)
|
|
161
|
+
|
|
162
|
+
See [CLAUDE.md](CLAUDE.md) for architecture details.
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
## License
|
|
166
|
+
|
|
167
|
+
AGPL v3 or later
|
|
168
|
+
|
|
169
|
+
Please cite: Ben Whalley. (2025). benwhalley/soak: Initial public release (v0.3.0). Zenodo. https://doi.org/10.5281/zenodo.17293023
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=69", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "soaking"
|
|
7
|
+
version = "0.3.5"
|
|
8
|
+
description = "soak: graph-based pipelines and tools for LLM-assisted qualitative text analysis"
|
|
9
|
+
authors = [
|
|
10
|
+
{ name = "Ben Whalley", email = "ben.whalley@plymouth.ac.uk" }
|
|
11
|
+
]
|
|
12
|
+
license = { text = "AGPL-3.0-or-later" }
|
|
13
|
+
readme = "README.md"
|
|
14
|
+
requires-python = ">=3.12,<3.13"
|
|
15
|
+
dependencies = [
|
|
16
|
+
"instructor>=1.10.0",
|
|
17
|
+
"jinja2>=3.1.6",
|
|
18
|
+
"lark>=1.2.2",
|
|
19
|
+
"matplotlib>=3.10.3",
|
|
20
|
+
"networkx>=3.5",
|
|
21
|
+
"pandas>=2.3.1",
|
|
22
|
+
"pdfplumber>=0.11.7",
|
|
23
|
+
"pydantic>=2.11.7",
|
|
24
|
+
"python-box>=7.3.2",
|
|
25
|
+
"python-decouple>=3.8",
|
|
26
|
+
"python-docx>=1.2.0",
|
|
27
|
+
"python-magic>=0.4.27",
|
|
28
|
+
"scikit-learn>=1.7.1",
|
|
29
|
+
"scipy>=1.14",
|
|
30
|
+
"seaborn>=0.13.2",
|
|
31
|
+
"tiktoken>=0.9.0",
|
|
32
|
+
"typer>=0.16.0",
|
|
33
|
+
"umap-learn",
|
|
34
|
+
"asyncpg>=0.30.0",
|
|
35
|
+
"jinja-markdown>=1.210911",
|
|
36
|
+
"struckdown",
|
|
37
|
+
"trogon>=0.6.0",
|
|
38
|
+
"nltk>=3.9.2",
|
|
39
|
+
"rank-bm25>=0.2.2",
|
|
40
|
+
"openpyxl>=3.1.0",
|
|
41
|
+
"statsmodels>=0.14.0",
|
|
42
|
+
"krippendorff>=0.6.0",
|
|
43
|
+
"pyirr>=0.84.1.2",
|
|
44
|
+
"setuptools>=80.9.0",
|
|
45
|
+
"irrcac",
|
|
46
|
+
"pysbd>=0.3.4",
|
|
47
|
+
"tqdm>=4.67.0",
|
|
48
|
+
"simpleeval>=1.0.3",
|
|
49
|
+
"mkdocs>=1.6.0",
|
|
50
|
+
"mkdocs-material>=9.5.0",
|
|
51
|
+
"pymdown-extensions>=10.11.0",
|
|
52
|
+
"graphviz>=0.20.0",
|
|
53
|
+
"scrubadub>=2.0.1",
|
|
54
|
+
"spacy>=3.8.4,<3.9",
|
|
55
|
+
"spacy-transformers",
|
|
56
|
+
"scrubadub-spacy",
|
|
57
|
+
"transformers>=4.51.0",
|
|
58
|
+
"sentence-transformers>=2.5.1",
|
|
59
|
+
"pot>=0.9.6.post1",
|
|
60
|
+
"pyphen>=0.16.0",
|
|
61
|
+
"plotly>=5.18.0",
|
|
62
|
+
"tenacity>=8.2.0",
|
|
63
|
+
"hdbscan>=0.8.33",
|
|
64
|
+
]
|
|
65
|
+
|
|
66
|
+
[project.optional-dependencies]
|
|
67
|
+
scrub = [
|
|
68
|
+
"scrubadub>=2.0.0",
|
|
69
|
+
"scrubadub-spacy>=2.0.0",
|
|
70
|
+
"spacy>=3.8.4,<3.9",
|
|
71
|
+
]
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
[tool.setuptools.package-data]
|
|
75
|
+
soak = ["templates/*.html", "templates/*.md", "templates/**/*.html", "templates/**/*.md", "templates/**/*.sd", "pipelines/*.soak", "pipelines/*.sd", "soak-data/**/*"]
|
|
76
|
+
|
|
77
|
+
[tool.setuptools.packages.find]
|
|
78
|
+
where = ["."]
|
|
79
|
+
include = ["soak*",]
|
|
80
|
+
|
|
81
|
+
[project.scripts]
|
|
82
|
+
soak = "soak.cli:main_with_default_command"
|
|
83
|
+
|
|
84
|
+
[tool.uv]
|
|
85
|
+
package = true
|
|
86
|
+
|
|
87
|
+
[tool.uv.sources]
|
|
88
|
+
# Local development version
|
|
89
|
+
# struckdown = { path = "/Users/benwhalley/dev/struckdown", editable = true }
|
|
90
|
+
struckdown = { git = "https://github.com/benwhalley/struckdown", branch = "main" }
|
|
91
|
+
irrcac = { git = "https://github.com/benwhalley/irrCAC.git" }
|
|
92
|
+
spacy-transformers = { git = "https://github.com/explosion/spacy-transformers.git", rev = "bf2fe03" }
|
|
93
|
+
umap-learn = { git = "https://github.com/lmcinnes/umap", branch = "master" }
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
[tool.pytest.ini_options]
|
|
97
|
+
pythonpath = ["."]
|
|
98
|
+
|
|
99
|
+
|
soaking-0.3.5/setup.cfg
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Automated qualitative analysis using language models."""
|