soaking 0.3.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. soaking-0.3.5/PKG-INFO +230 -0
  2. soaking-0.3.5/README.md +169 -0
  3. soaking-0.3.5/pyproject.toml +99 -0
  4. soaking-0.3.5/setup.cfg +4 -0
  5. soaking-0.3.5/soak/__init__.py +1 -0
  6. soaking-0.3.5/soak/agreement.py +321 -0
  7. soaking-0.3.5/soak/agreement_scripts.py +445 -0
  8. soaking-0.3.5/soak/cli.py +1876 -0
  9. soaking-0.3.5/soak/comparators/__init__.py +1 -0
  10. soaking-0.3.5/soak/comparators/base.py +51 -0
  11. soaking-0.3.5/soak/comparators/similarity_comparator.py +2357 -0
  12. soaking-0.3.5/soak/coverage/__init__.py +13 -0
  13. soaking-0.3.5/soak/coverage/analyzer.py +1309 -0
  14. soaking-0.3.5/soak/coverage/models.py +99 -0
  15. soaking-0.3.5/soak/document_utils.py +362 -0
  16. soaking-0.3.5/soak/error_handlers.py +420 -0
  17. soaking-0.3.5/soak/export_utils.py +203 -0
  18. soaking-0.3.5/soak/ground_truth_metrics.py +636 -0
  19. soaking-0.3.5/soak/helpers.py +261 -0
  20. soaking-0.3.5/soak/models/__init__.py +85 -0
  21. soaking-0.3.5/soak/models/alignment.py +350 -0
  22. soaking-0.3.5/soak/models/base.py +683 -0
  23. soaking-0.3.5/soak/models/cost_tracker.py +72 -0
  24. soaking-0.3.5/soak/models/dag.py +1013 -0
  25. soaking-0.3.5/soak/models/nodes/__init__.py +40 -0
  26. soaking-0.3.5/soak/models/nodes/base.py +1052 -0
  27. soaking-0.3.5/soak/models/nodes/batch.py +122 -0
  28. soaking-0.3.5/soak/models/nodes/classifier.py +632 -0
  29. soaking-0.3.5/soak/models/nodes/cluster.py +642 -0
  30. soaking-0.3.5/soak/models/nodes/filter.py +497 -0
  31. soaking-0.3.5/soak/models/nodes/groupby.py +235 -0
  32. soaking-0.3.5/soak/models/nodes/map.py +262 -0
  33. soaking-0.3.5/soak/models/nodes/reduce.py +304 -0
  34. soaking-0.3.5/soak/models/nodes/scrub.py +724 -0
  35. soaking-0.3.5/soak/models/nodes/transform.py +337 -0
  36. soaking-0.3.5/soak/models/nodes/ungroup.py +48 -0
  37. soaking-0.3.5/soak/models/nodes/verify.py +1507 -0
  38. soaking-0.3.5/soak/models/pipeline.py +227 -0
  39. soaking-0.3.5/soak/models/progress.py +79 -0
  40. soaking-0.3.5/soak/models/text_utils.py +240 -0
  41. soaking-0.3.5/soak/models/utils.py +714 -0
  42. soaking-0.3.5/soak/pipelines/check_quote_fair_use.sd +62 -0
  43. soaking-0.3.5/soak/pipelines/classifier.soak +127 -0
  44. soaking-0.3.5/soak/pipelines/classifier_tabular.soak +99 -0
  45. soaking-0.3.5/soak/pipelines/consolidate_codes.sd +58 -0
  46. soaking-0.3.5/soak/pipelines/demo.soak +91 -0
  47. soaking-0.3.5/soak/pipelines/filter_chunks.sd +16 -0
  48. soaking-0.3.5/soak/pipelines/final_themes.sd +184 -0
  49. soaking-0.3.5/soak/pipelines/initial_codes.sd +62 -0
  50. soaking-0.3.5/soak/pipelines/make_theme_group.sd +85 -0
  51. soaking-0.3.5/soak/pipelines/meta_narrative.sd +52 -0
  52. soaking-0.3.5/soak/pipelines/narrative_report.sd +84 -0
  53. soaking-0.3.5/soak/pipelines/pre_extract_relevant.sd +36 -0
  54. soaking-0.3.5/soak/pipelines/test.soak +60 -0
  55. soaking-0.3.5/soak/pipelines/verify.soak +34 -0
  56. soaking-0.3.5/soak/pipelines/zs.soak +99 -0
  57. soaking-0.3.5/soak/soak-data/annomi.csv +13553 -0
  58. soaking-0.3.5/soak/soak-data/annomi_with_ctx_train.csv +2654 -0
  59. soaking-0.3.5/soak/soak-data/cfs/README.txt +2 -0
  60. soaking-0.3.5/soak/soak-data/cfs/aGKbypa8fhI.txt +51 -0
  61. soaking-0.3.5/soak/soak-data/cfs/aOUUTEeIiS0.txt +61 -0
  62. soaking-0.3.5/soak/soak-data/cfs/aS8CQtc9DmA.txt +35 -0
  63. soaking-0.3.5/soak/soak-data/cfs/aTvSX_toNL4.txt +87 -0
  64. soaking-0.3.5/soak/soak-data/cfs/acWL9FBKr3o.txt +87 -0
  65. soaking-0.3.5/soak/soak-data/cfs/as7I55hY29k.txt +193 -0
  66. soaking-0.3.5/soak/soak-data/cfs/bRidO1PiZJs.txt +123 -0
  67. soaking-0.3.5/soak/soak-data/cfs/ba2LcetNybI.txt +151 -0
  68. soaking-0.3.5/soak/soak-data/cfsb/bRidO1PiZJs.txt +123 -0
  69. soaking-0.3.5/soak/soak-data/cfsb/ba2LcetNybI.txt +151 -0
  70. soaking-0.3.5/soak/soak-data/cfsb.zip +0 -0
  71. soaking-0.3.5/soak/soak-data/constitution.txt +5836 -0
  72. soaking-0.3.5/soak/soak-data/dream.txt +5 -0
  73. soaking-0.3.5/soak/soak-data/examplethemes.xlsx +0 -0
  74. soaking-0.3.5/soak/soak-data/interview_sample.txt +19 -0
  75. soaking-0.3.5/soak/soak-data/matthew-57.txt +76 -0
  76. soaking-0.3.5/soak/soak-data/pii.txt +39 -0
  77. soaking-0.3.5/soak/soak-data/test_data.csv +5 -0
  78. soaking-0.3.5/soak/soak-data/wheel.txt +110 -0
  79. soaking-0.3.5/soak/specs.py +165 -0
  80. soaking-0.3.5/soak/template_resolution.py +143 -0
  81. soaking-0.3.5/soak/template_validator.py +259 -0
  82. soaking-0.3.5/soak/templates/comparison.html +1017 -0
  83. soaking-0.3.5/soak/templates/coverage.html +485 -0
  84. soaking-0.3.5/soak/templates/does_quote_exist.sd +34 -0
  85. soaking-0.3.5/soak/templates/narrative.html +58 -0
  86. soaking-0.3.5/soak/templates/nodes/batch.html +10 -0
  87. soaking-0.3.5/soak/templates/nodes/classifier.html +31 -0
  88. soaking-0.3.5/soak/templates/nodes/cluster.html +24 -0
  89. soaking-0.3.5/soak/templates/nodes/default.html +16 -0
  90. soaking-0.3.5/soak/templates/nodes/filter.html +33 -0
  91. soaking-0.3.5/soak/templates/nodes/groupby.html +33 -0
  92. soaking-0.3.5/soak/templates/nodes/map.html +21 -0
  93. soaking-0.3.5/soak/templates/nodes/reduce.html +19 -0
  94. soaking-0.3.5/soak/templates/nodes/scrub.html +33 -0
  95. soaking-0.3.5/soak/templates/nodes/split.html +46 -0
  96. soaking-0.3.5/soak/templates/nodes/transform.html +44 -0
  97. soaking-0.3.5/soak/templates/nodes/transformreduce.html +34 -0
  98. soaking-0.3.5/soak/templates/nodes/ungroup.html +26 -0
  99. soaking-0.3.5/soak/templates/nodes/verifyquotes.html +34 -0
  100. soaking-0.3.5/soak/templates/pipeline.html +242 -0
  101. soaking-0.3.5/soak/templates/simple.html +277 -0
  102. soaking-0.3.5/soak/templates/summarise_quote_use.sd +50 -0
  103. soaking-0.3.5/soak/templates/verify_theme_quotes.sd +47 -0
  104. soaking-0.3.5/soak/utils/__init__.py +42 -0
  105. soaking-0.3.5/soak/visualization.py +151 -0
  106. soaking-0.3.5/soaking.egg-info/PKG-INFO +230 -0
  107. soaking-0.3.5/soaking.egg-info/SOURCES.txt +119 -0
  108. soaking-0.3.5/soaking.egg-info/dependency_links.txt +1 -0
  109. soaking-0.3.5/soaking.egg-info/entry_points.txt +2 -0
  110. soaking-0.3.5/soaking.egg-info/requires.txt +53 -0
  111. soaking-0.3.5/soaking.egg-info/top_level.txt +1 -0
  112. soaking-0.3.5/tests/test_agreement.py +21 -0
  113. soaking-0.3.5/tests/test_agreement_validation.py +231 -0
  114. soaking-0.3.5/tests/test_error_handling.py +227 -0
  115. soaking-0.3.5/tests/test_ground_truth.py +194 -0
  116. soaking-0.3.5/tests/test_integration_pipelines.py +242 -0
  117. soaking-0.3.5/tests/test_nodes.py +408 -0
  118. soaking-0.3.5/tests/test_overlap_exclusion.py +77 -0
  119. soaking-0.3.5/tests/test_spreadsheet_input.py +174 -0
  120. soaking-0.3.5/tests/test_template_resolution.py +242 -0
  121. soaking-0.3.5/tests/test_verify_comprehensive.py +225 -0
soaking-0.3.5/PKG-INFO ADDED
@@ -0,0 +1,230 @@
1
+ Metadata-Version: 2.4
2
+ Name: soaking
3
+ Version: 0.3.5
4
+ Summary: soak: graph-based pipelines and tools for LLM-assisted qualitative text analysis
5
+ Author-email: Ben Whalley <ben.whalley@plymouth.ac.uk>
6
+ License: AGPL-3.0-or-later
7
+ Requires-Python: <3.13,>=3.12
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: instructor>=1.10.0
10
+ Requires-Dist: jinja2>=3.1.6
11
+ Requires-Dist: lark>=1.2.2
12
+ Requires-Dist: matplotlib>=3.10.3
13
+ Requires-Dist: networkx>=3.5
14
+ Requires-Dist: pandas>=2.3.1
15
+ Requires-Dist: pdfplumber>=0.11.7
16
+ Requires-Dist: pydantic>=2.11.7
17
+ Requires-Dist: python-box>=7.3.2
18
+ Requires-Dist: python-decouple>=3.8
19
+ Requires-Dist: python-docx>=1.2.0
20
+ Requires-Dist: python-magic>=0.4.27
21
+ Requires-Dist: scikit-learn>=1.7.1
22
+ Requires-Dist: scipy>=1.14
23
+ Requires-Dist: seaborn>=0.13.2
24
+ Requires-Dist: tiktoken>=0.9.0
25
+ Requires-Dist: typer>=0.16.0
26
+ Requires-Dist: umap-learn
27
+ Requires-Dist: asyncpg>=0.30.0
28
+ Requires-Dist: jinja-markdown>=1.210911
29
+ Requires-Dist: struckdown
30
+ Requires-Dist: trogon>=0.6.0
31
+ Requires-Dist: nltk>=3.9.2
32
+ Requires-Dist: rank-bm25>=0.2.2
33
+ Requires-Dist: openpyxl>=3.1.0
34
+ Requires-Dist: statsmodels>=0.14.0
35
+ Requires-Dist: krippendorff>=0.6.0
36
+ Requires-Dist: pyirr>=0.84.1.2
37
+ Requires-Dist: setuptools>=80.9.0
38
+ Requires-Dist: irrcac
39
+ Requires-Dist: pysbd>=0.3.4
40
+ Requires-Dist: tqdm>=4.67.0
41
+ Requires-Dist: simpleeval>=1.0.3
42
+ Requires-Dist: mkdocs>=1.6.0
43
+ Requires-Dist: mkdocs-material>=9.5.0
44
+ Requires-Dist: pymdown-extensions>=10.11.0
45
+ Requires-Dist: graphviz>=0.20.0
46
+ Requires-Dist: scrubadub>=2.0.1
47
+ Requires-Dist: spacy<3.9,>=3.8.4
48
+ Requires-Dist: spacy-transformers
49
+ Requires-Dist: scrubadub-spacy
50
+ Requires-Dist: transformers>=4.51.0
51
+ Requires-Dist: sentence-transformers>=2.5.1
52
+ Requires-Dist: pot>=0.9.6.post1
53
+ Requires-Dist: pyphen>=0.16.0
54
+ Requires-Dist: plotly>=5.18.0
55
+ Requires-Dist: tenacity>=8.2.0
56
+ Requires-Dist: hdbscan>=0.8.33
57
+ Provides-Extra: scrub
58
+ Requires-Dist: scrubadub>=2.0.0; extra == "scrub"
59
+ Requires-Dist: scrubadub-spacy>=2.0.0; extra == "scrub"
60
+ Requires-Dist: spacy<3.9,>=3.8.4; extra == "scrub"
61
+
62
+ # Get to saturation faster!
63
+
64
+ <img src="docs/logo-sm.png" width="15%">
65
+
66
+ **`soak` is a tool to enable qualitative researchers to rapidly define and run llm-assisted text analysis pipelines and thematic analysis.**
67
+
68
+ The easiest way to see what `soak` does is to see sample outputs from the system.
69
+
70
+ The Zero-shot pipeline diagram shows the various stages the analysis involves:
71
+
72
+ ![an analysis pipeline](docs/images/zsmermaid.png)
73
+
74
+ Input text from [patient interviews](soak/data/cfs/):
75
+
76
+ ![raw data](docs/images/cfstext.png)
77
+
78
+ Sample theme extracted:
79
+
80
+ ![themes extracted](docs/images/theme.png)
81
+
82
+ Matching LLM extracted quotes to source text to detect hallucinations:
83
+
84
+ ![alt text](docs/images/quotematching.png)
85
+
86
+
87
+ A classification prompt, extracting structured data from transcripts. The green element is the templated input. The blue elements like `[[this]]` indicate the LLM-completions. Prompts are written in [struckdown](https://github.com/benwhalley/struckdown), which is a simple text-based format used to constrain the LLM output to a specific data type/structure.
88
+
89
+ ![A struckdown prompt](docs/images/classifyprompt.png)
90
+
91
+ Inter-rater agreement and ground truth validation statistics, calculated for structured data extracted from transcripts:
92
+
93
+ ![IRR statistics](docs/images/rateragreement.png)
94
+
95
+ **Ground truth validation:** Classifier nodes can automatically validate LLM outputs against ground truth labels, calculating precision, recall, F1, and confusion matrices:
96
+
97
+ ```yaml
98
+ ground_truths:
99
+ reflection:
100
+ existing: reflection_exists # Ground truth column
101
+ mapping: {yes: 1, no: 0} # Map LLM outputs to GT values
102
+ ```
103
+
104
+ See [Ground Truth Validation](docs/how-to/ground-truth-validation.md) for details.
105
+
106
+ Plots and similarity statistics quantify the similarity between sets of themes created by different analyses. For example we might compare different LLMs, different datasets (patients vs doctors) or different prompts (amending the research question posed to the LLM). The heatmap reveals common themes between different analyses or datasets:
107
+
108
+ ![heatmap](docs/images/plot.png)
109
+
110
+ Similarity statistics quantify the similarity between sets of themes created by different analyses.
111
+
112
+ ![similarity statistics](docs/images/simstats.png)
113
+
114
+
115
+ ### Sample outputs
116
+
117
+ - [cfs1_simple.html](https://benwhalley.github.io/soak/samples/cfs1_simple.html) shows a thematic analysis of transcripts of 8 patients with ME/CFS or Long COVID.
118
+
119
+ - [cfs2_pipeline.html](https://benwhalley.github.io/soak/samples/cfs2_simple.html) shows the same analysis using a different LLM model, and in extended HTML format.
120
+
121
+ - [comparison.html](https://benwhalley.github.io/soak/samples/comparison.html) shows the comparison of these two analyses.
122
+
123
+ - [20251008_085446_5db6_pipeline.html](https://benwhalley.github.io/soak/samples/classifier/20251008_085446_5db6_pipeline.html) shows the result of a different pipeline extracting structured data from the transcripts (results are also available as json and csv).
124
+
125
+ ### Example pipeline specifications
126
+
127
+ - [soak/pipelines/zs.soak](soak/pipelines/zs.soak) is the Zero-shot pipeline used in the sample outputs above.
128
+
129
+ - [classifier.soak](docs/samples/classifier/classifier.soak) is the classifier pipeline used in the sample output above.
130
+
131
+ ## Quick Start
132
+
133
+ ```bash
134
+ # install
135
+ git clone https://github.com/benwhalley/soak
136
+ uv tool install .
137
+
138
+ # set credentials, using openai for simplicity
139
+ export LLM_API_KEY=your_api_key
140
+ export LLM_API_BASE=https://api.openai.com/v1
141
+
142
+ # Run analysis
143
+ soak zs soak/data/cfs/*.txt -t simple -o cfs-simple-1
144
+
145
+ # Open results in a browser
146
+ open cfs-simple-1_simple.html
147
+
148
+ # Re-run with a different/better model
149
+ soak zs -o cfs-simple-2 --model-name="openai/gpt-4o" soak/data/cfs/*.txt
150
+
151
+ # Compare results
152
+ soak compare cfs-simple-1.json cfs-simple-2.json -o comparison.html
153
+ ```
154
+
155
+
156
+ ## More usage
157
+
158
+ ```bash
159
+ # Basic pattern
160
+ uv run soak <pipeline> <files> --output <name>
161
+
162
+ # Run demo pipeline on sample text files
163
+ uv run soak demo --output demo_analysis soak/data/cfs/*.txt
164
+
165
+ # Use the 'simple' html output template
166
+ uv run soak zs -t simple --output analysis_simple soak/data/cfs/*.txt
167
+ ```
168
+
169
+ ### Working with CSV/XLSX Spreadsheets
170
+
171
+ CSV and XLSX files are fully supported. Each row becomes a separate document, with column values accessible in templates as `{{column_name}}`.
172
+
173
+ **Example data** (`soak/data/test_data.csv`):
174
+ ```csv
175
+ participant_id,age,condition,response
176
+ P001,25,control,I felt very relaxed during the session
177
+ P002,32,treatment,The intervention helped me focus better
178
+ ```
179
+
180
+ **Run classifier on CSV:**
181
+ ```bash
182
+ uv run soak classifier_tabular --output csv_analysis soak/data/test_data.csv
183
+ ```
184
+
185
+ **Pipeline template accessing columns:**
186
+
187
+ ```yaml
188
+ # pipeline.soak
189
+ nodes:
190
+ - name: analyze
191
+ type: Map
192
+ inputs: [documents]
193
+ ---#analyze
194
+ Participant {{participant_id}} (age {{age}}, {{condition}} group):
195
+ {{response}}
196
+
197
+ Summarize the response: [[summary:str]]
198
+ ```
199
+
200
+ **Sampling options:**
201
+ ```bash
202
+ # Process first 10 rows only (useful for testing)
203
+ uv run soak classifier_tabular --head 10 --output test_run survey.csv
204
+
205
+ # Randomly sample 50 rows
206
+ uv run soak classifier_tabular --sample 50 --output pilot survey.csv
207
+ ```
208
+
209
+ See [Working with Spreadsheet Data](docs/how-to/working-with-spreadsheet-data.md) for more details.
210
+
211
+
212
+ **Common Options:**
213
+ - `--output, -o`: Output filename (generates .json dump file and .html)
214
+ - `--model-name`: LLM model (default: gpt-4o-mini)
215
+ - `-c, --context`: Pipeline context variables (e.g., `-c research_question="Experiences of patients with COVID-19"`)
216
+
217
+
218
+ ## Documentation
219
+
220
+ - [Docs index](docs/index.md)
221
+ - [Getting started](docs/tutorials/getting-started.md)
222
+
223
+ See [CLAUDE.md](CLAUDE.md) for architecture details.
224
+
225
+
226
+ ## License
227
+
228
+ AGPL v3 or later
229
+
230
+ Please cite: Ben Whalley. (2025). benwhalley/soak: Initial public release (v0.3.0). Zenodo. https://doi.org/10.5281/zenodo.17293023
@@ -0,0 +1,169 @@
1
+ # Get to saturation faster!
2
+
3
+ <img src="docs/logo-sm.png" width="15%">
4
+
5
+ **`soak` is a tool to enable qualitative researchers to rapidly define and run llm-assisted text analysis pipelines and thematic analysis.**
6
+
7
+ The easiest way to see what `soak` does is to see sample outputs from the system.
8
+
9
+ The Zero-shot pipeline diagram shows the various stages the analysis involves:
10
+
11
+ ![an analysis pipeline](docs/images/zsmermaid.png)
12
+
13
+ Input text from [patient interviews](soak/data/cfs/):
14
+
15
+ ![raw data](docs/images/cfstext.png)
16
+
17
+ Sample theme extracted:
18
+
19
+ ![themes extracted](docs/images/theme.png)
20
+
21
+ Matching LLM extracted quotes to source text to detect hallucinations:
22
+
23
+ ![alt text](docs/images/quotematching.png)
24
+
25
+
26
+ A classification prompt, extracting structured data from transcripts. The green element is the templated input. The blue elements like `[[this]]` indicate the LLM-completions. Prompts are written in [struckdown](https://github.com/benwhalley/struckdown), which is a simple text-based format used to constrain the LLM output to a specific data type/structure.
27
+
28
+ ![A struckdown prompt](docs/images/classifyprompt.png)
29
+
30
+ Inter-rater agreement and ground truth validation statistics, calculated for structured data extracted from transcripts:
31
+
32
+ ![IRR statistics](docs/images/rateragreement.png)
33
+
34
+ **Ground truth validation:** Classifier nodes can automatically validate LLM outputs against ground truth labels, calculating precision, recall, F1, and confusion matrices:
35
+
36
+ ```yaml
37
+ ground_truths:
38
+ reflection:
39
+ existing: reflection_exists # Ground truth column
40
+ mapping: {yes: 1, no: 0} # Map LLM outputs to GT values
41
+ ```
42
+
43
+ See [Ground Truth Validation](docs/how-to/ground-truth-validation.md) for details.
44
+
45
+ Plots and similarity statistics quantify the similarity between sets of themes created by different analyses. For example we might compare different LLMs, different datasets (patients vs doctors) or different prompts (amending the research question posed to the LLM). The heatmap reveals common themes between different analyses or datasets:
46
+
47
+ ![heatmap](docs/images/plot.png)
48
+
49
+ Similarity statistics quantify the similarity between sets of themes created by different analyses.
50
+
51
+ ![similarity statistics](docs/images/simstats.png)
52
+
53
+
54
+ ### Sample outputs
55
+
56
+ - [cfs1_simple.html](https://benwhalley.github.io/soak/samples/cfs1_simple.html) shows a thematic analysis of transcripts of 8 patients with ME/CFS or Long COVID.
57
+
58
+ - [cfs2_pipeline.html](https://benwhalley.github.io/soak/samples/cfs2_simple.html) shows the same analysis using a different LLM model, and in extended HTML format.
59
+
60
+ - [comparison.html](https://benwhalley.github.io/soak/samples/comparison.html) shows the comparison of these two analyses.
61
+
62
+ - [20251008_085446_5db6_pipeline.html](https://benwhalley.github.io/soak/samples/classifier/20251008_085446_5db6_pipeline.html) shows the result of a different pipeline extracting structured data from the transcripts (results are also available as json and csv).
63
+
64
+ ### Example pipeline specifications
65
+
66
+ - [soak/pipelines/zs.soak](soak/pipelines/zs.soak) is the Zero-shot pipeline used in the sample outputs above.
67
+
68
+ - [classifier.soak](docs/samples/classifier/classifier.soak) is the classifier pipeline used in the sample output above.
69
+
70
+ ## Quick Start
71
+
72
+ ```bash
73
+ # install
74
+ git clone https://github.com/benwhalley/soak
75
+ uv tool install .
76
+
77
+ # set credentials, using openai for simplicity
78
+ export LLM_API_KEY=your_api_key
79
+ export LLM_API_BASE=https://api.openai.com/v1
80
+
81
+ # Run analysis
82
+ soak zs soak/data/cfs/*.txt -t simple -o cfs-simple-1
83
+
84
+ # Open results in a browser
85
+ open cfs-simple-1_simple.html
86
+
87
+ # Re-run with a different/better model
88
+ soak zs -o cfs-simple-2 --model-name="openai/gpt-4o" soak/data/cfs/*.txt
89
+
90
+ # Compare results
91
+ soak compare cfs-simple-1.json cfs-simple-2.json -o comparison.html
92
+ ```
93
+
94
+
95
+ ## More usage
96
+
97
+ ```bash
98
+ # Basic pattern
99
+ uv run soak <pipeline> <files> --output <name>
100
+
101
+ # Run demo pipeline on sample text files
102
+ uv run soak demo --output demo_analysis soak/data/cfs/*.txt
103
+
104
+ # Use the 'simple' html output template
105
+ uv run soak zs -t simple --output analysis_simple soak/data/cfs/*.txt
106
+ ```
107
+
108
+ ### Working with CSV/XLSX Spreadsheets
109
+
110
+ CSV and XLSX files are fully supported. Each row becomes a separate document, with column values accessible in templates as `{{column_name}}`.
111
+
112
+ **Example data** (`soak/data/test_data.csv`):
113
+ ```csv
114
+ participant_id,age,condition,response
115
+ P001,25,control,I felt very relaxed during the session
116
+ P002,32,treatment,The intervention helped me focus better
117
+ ```
118
+
119
+ **Run classifier on CSV:**
120
+ ```bash
121
+ uv run soak classifier_tabular --output csv_analysis soak/data/test_data.csv
122
+ ```
123
+
124
+ **Pipeline template accessing columns:**
125
+
126
+ ```yaml
127
+ # pipeline.soak
128
+ nodes:
129
+ - name: analyze
130
+ type: Map
131
+ inputs: [documents]
132
+ ---#analyze
133
+ Participant {{participant_id}} (age {{age}}, {{condition}} group):
134
+ {{response}}
135
+
136
+ Summarize the response: [[summary:str]]
137
+ ```
138
+
139
+ **Sampling options:**
140
+ ```bash
141
+ # Process first 10 rows only (useful for testing)
142
+ uv run soak classifier_tabular --head 10 --output test_run survey.csv
143
+
144
+ # Randomly sample 50 rows
145
+ uv run soak classifier_tabular --sample 50 --output pilot survey.csv
146
+ ```
147
+
148
+ See [Working with Spreadsheet Data](docs/how-to/working-with-spreadsheet-data.md) for more details.
149
+
150
+
151
+ **Common Options:**
152
+ - `--output, -o`: Output filename (generates .json dump file and .html)
153
+ - `--model-name`: LLM model (default: gpt-4o-mini)
154
+ - `-c, --context`: Pipeline context variables (e.g., `-c research_question="Experiences of patients with COVID-19"`)
155
+
156
+
157
+ ## Documentation
158
+
159
+ - [Docs index](docs/index.md)
160
+ - [Getting started](docs/tutorials/getting-started.md)
161
+
162
+ See [CLAUDE.md](CLAUDE.md) for architecture details.
163
+
164
+
165
+ ## License
166
+
167
+ AGPL v3 or later
168
+
169
+ Please cite: Ben Whalley. (2025). benwhalley/soak: Initial public release (v0.3.0). Zenodo. https://doi.org/10.5281/zenodo.17293023
@@ -0,0 +1,99 @@
1
+ [build-system]
2
+ requires = ["setuptools>=69", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "soaking"
7
+ version = "0.3.5"
8
+ description = "soak: graph-based pipelines and tools for LLM-assisted qualitative text analysis"
9
+ authors = [
10
+ { name = "Ben Whalley", email = "ben.whalley@plymouth.ac.uk" }
11
+ ]
12
+ license = { text = "AGPL-3.0-or-later" }
13
+ readme = "README.md"
14
+ requires-python = ">=3.12,<3.13"
15
+ dependencies = [
16
+ "instructor>=1.10.0",
17
+ "jinja2>=3.1.6",
18
+ "lark>=1.2.2",
19
+ "matplotlib>=3.10.3",
20
+ "networkx>=3.5",
21
+ "pandas>=2.3.1",
22
+ "pdfplumber>=0.11.7",
23
+ "pydantic>=2.11.7",
24
+ "python-box>=7.3.2",
25
+ "python-decouple>=3.8",
26
+ "python-docx>=1.2.0",
27
+ "python-magic>=0.4.27",
28
+ "scikit-learn>=1.7.1",
29
+ "scipy>=1.14",
30
+ "seaborn>=0.13.2",
31
+ "tiktoken>=0.9.0",
32
+ "typer>=0.16.0",
33
+ "umap-learn",
34
+ "asyncpg>=0.30.0",
35
+ "jinja-markdown>=1.210911",
36
+ "struckdown",
37
+ "trogon>=0.6.0",
38
+ "nltk>=3.9.2",
39
+ "rank-bm25>=0.2.2",
40
+ "openpyxl>=3.1.0",
41
+ "statsmodels>=0.14.0",
42
+ "krippendorff>=0.6.0",
43
+ "pyirr>=0.84.1.2",
44
+ "setuptools>=80.9.0",
45
+ "irrcac",
46
+ "pysbd>=0.3.4",
47
+ "tqdm>=4.67.0",
48
+ "simpleeval>=1.0.3",
49
+ "mkdocs>=1.6.0",
50
+ "mkdocs-material>=9.5.0",
51
+ "pymdown-extensions>=10.11.0",
52
+ "graphviz>=0.20.0",
53
+ "scrubadub>=2.0.1",
54
+ "spacy>=3.8.4,<3.9",
55
+ "spacy-transformers",
56
+ "scrubadub-spacy",
57
+ "transformers>=4.51.0",
58
+ "sentence-transformers>=2.5.1",
59
+ "pot>=0.9.6.post1",
60
+ "pyphen>=0.16.0",
61
+ "plotly>=5.18.0",
62
+ "tenacity>=8.2.0",
63
+ "hdbscan>=0.8.33",
64
+ ]
65
+
66
+ [project.optional-dependencies]
67
+ scrub = [
68
+ "scrubadub>=2.0.0",
69
+ "scrubadub-spacy>=2.0.0",
70
+ "spacy>=3.8.4,<3.9",
71
+ ]
72
+
73
+
74
+ [tool.setuptools.package-data]
75
+ soak = ["templates/*.html", "templates/*.md", "templates/**/*.html", "templates/**/*.md", "templates/**/*.sd", "pipelines/*.soak", "pipelines/*.sd", "soak-data/**/*"]
76
+
77
+ [tool.setuptools.packages.find]
78
+ where = ["."]
79
+ include = ["soak*",]
80
+
81
+ [project.scripts]
82
+ soak = "soak.cli:main_with_default_command"
83
+
84
+ [tool.uv]
85
+ package = true
86
+
87
+ [tool.uv.sources]
88
+ # Local development version
89
+ # struckdown = { path = "/Users/benwhalley/dev/struckdown", editable = true }
90
+ struckdown = { git = "https://github.com/benwhalley/struckdown", branch = "main" }
91
+ irrcac = { git = "https://github.com/benwhalley/irrCAC.git" }
92
+ spacy-transformers = { git = "https://github.com/explosion/spacy-transformers.git", rev = "bf2fe03" }
93
+ umap-learn = { git = "https://github.com/lmcinnes/umap", branch = "master" }
94
+
95
+
96
+ [tool.pytest.ini_options]
97
+ pythonpath = ["."]
98
+
99
+
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1 @@
1
+ """Automated qualitative analysis using language models."""