rMatch 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ .DS_Store
2
+ .env
3
+ outputs
4
+ downloads
5
+ data/cyoa
6
+ .vscode
7
+ __pycache__
8
+ data
9
+ .cursor
@@ -0,0 +1,21 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v6.0.0
4
+ hooks:
5
+ # list of supported hooks: https://pre-commit.com/hooks.html
6
+ - id: trailing-whitespace
7
+ - id: end-of-file-fixer
8
+ - id: check-yaml
9
+ - id: check-added-large-files
10
+ - id: debug-statements
11
+ - id: detect-private-key
12
+
13
+ - repo: https://github.com/astral-sh/ruff-pre-commit
14
+ # Ruff version.
15
+ rev: v0.14.11
16
+ hooks:
17
+ # Run the linter.
18
+ - id: ruff
19
+ args: [--fix]
20
+ # Run the formatter.
21
+ - id: ruff-format
rmatch-0.2.0/PKG-INFO ADDED
@@ -0,0 +1,291 @@
1
+ Metadata-Version: 2.4
2
+ Name: rMatch
3
+ Version: 0.2.0
4
+ Summary: Match recall segments with story segments.
5
+ Author-email: Gabriel Kressin Palacios <gkressi1@jhu.edu>
6
+ License-Expression: MIT
7
+ Requires-Python: <3.14,>=3.12
8
+ Requires-Dist: accelerate<2,>=1.10.1
9
+ Requires-Dist: anthropic>=0.84.0
10
+ Requires-Dist: bitsandbytes>=0.45.0
11
+ Requires-Dist: chardet>=5.2.0
12
+ Requires-Dist: codecarbon>=2.8.3
13
+ Requires-Dist: einops>=0.8.1
14
+ Requires-Dist: huggingface>=0.0.1
15
+ Requires-Dist: kaleido>=1.2.0
16
+ Requires-Dist: krippendorff>=0.8.2
17
+ Requires-Dist: matplotlib<4,>=3.10.7
18
+ Requires-Dist: nltk>=3.9.2
19
+ Requires-Dist: numpy<3,>=2.3.3
20
+ Requires-Dist: openai>=2.15.0
21
+ Requires-Dist: openpyxl<4,>=3.1.5
22
+ Requires-Dist: pandas<3,>=2.3.3
23
+ Requires-Dist: plotly>=6.6.0
24
+ Requires-Dist: pre-commit<5,>=4.3.0
25
+ Requires-Dist: python-dotenv<2,>=1.1.1
26
+ Requires-Dist: rich<15,>=14.2.0
27
+ Requires-Dist: ruff<0.15,>=0.14.0
28
+ Requires-Dist: scikit-learn>=1.7.2
29
+ Requires-Dist: sentence-transformers<6,>=5.1.1
30
+ Requires-Dist: spacy<4,>=3.8.7
31
+ Requires-Dist: tiktoken>=0.12.0
32
+ Requires-Dist: torch<2.10,>=2.9
33
+ Requires-Dist: torchvision<0.25,>=0.24
34
+ Requires-Dist: tqdm<5,>=4.67.1
35
+ Requires-Dist: transformers<4.57.0
36
+ Description-Content-Type: text/markdown
37
+
38
+ <h1 align="center">rMatch</h1>
39
+
40
+ <p align="center">Automatic recall & story matching tool.</p>
41
+
42
+ <p align="center">
43
+ <a href="https://www.python.org/"><img alt="" src="https://img.shields.io/badge/code-Python-blue?logo=Python"></a>
44
+ <a href="https://docs.astral.sh/ruff/"><img alt="Ruff" src="https://img.shields.io/badge/code%20style-Ruff-green?logo=Ruff"></a>
45
+ <a href="https://docs.astral.sh/uv/"><img alt="packaging framework: uv" src="https://img.shields.io/badge/packaging-uv-lightblue?logo=uv"></a>
46
+ <a href="https://pre-commit.com/"><img alt="pre-commit" src="https://img.shields.io/badge/tool-Pre%20Commit-yellow?logo=Pre-Commit"></a>
46
+ </p>
48
+
49
+ ## Quick start
50
+
51
+ ### Command line
52
+
53
+ ```sh
54
+ pip install rmatch
55
+
56
+ # single recall file
57
+ rmatch story.txt recall.txt --matcher anthropic
58
+
59
+ # directory of recall files (one per subject)
60
+ rmatch story.txt recalls/ --matcher anthropic
61
+
62
+ # estimate API cost without sending requests
63
+ rmatch story.txt recalls/ --matcher openai --dry-run
64
+ ```
65
+
66
+ ### Python API
67
+
68
+ ```python
69
+ from rmatch import Matcher
70
+
71
+ matcher = Matcher(matcher_name="anthropic", api_key="your_api_key")
72
+ matches = matcher.match(
73
+ story_segments=["The cat sat on the mat.", "It purred softly."],
74
+ recall_segments=["A cat was on a mat."],
75
+ )
76
+ # [(0, [0])] — recall segment 0 matched story segment 0
77
+ ```
78
+
79
+ Or use `run_matching` to load files, run matching, and save results in one call:
80
+
81
+ ```python
82
+ from rmatch.match import run_matching
83
+
84
+ results = run_matching(
85
+ story_file="story.txt",
86
+ recall_file="recalls/",
87
+ matcher_name="anthropic",
88
+ api_key="your_api_key",
89
+ )
90
+ ```
91
+
92
+ ## Setup API keys
93
+
94
+ API keys are resolved in this order (first match wins):
95
+
96
+ 1. **`api_key` argument** passed directly in Python
97
+ 2. **`.env` file** in the current working directory
98
+ 3. **Environment variables** already set in your shell
99
+
100
+ Set them as environment variables:
101
+
102
+ ```sh
103
+ export ANTHROPIC_API_KEY="your_api_key" # for --matcher anthropic (default)
104
+ export OPENAI_API_KEY="your_api_key" # for --matcher openai
105
+ export HF_TOKEN="your_hf_token" # for --matcher huggingface
106
+ ```
107
+
108
+ Or put a `.env` file in your working directory:
109
+
110
+ ```sh
111
+ ANTHROPIC_API_KEY="your_api_key"
112
+ OPENAI_API_KEY="your_api_key"
113
+ HF_TOKEN="your_hf_token"
114
+ ```
115
+
116
+ ## Output format
117
+
118
+ A JSON file with:
119
+
120
+ ```json
121
+ {
122
+ "matcher_name": "anthropic",
123
+ "story_name": "story",
124
+ "story_segmentation": "lines",
125
+ "recall_segmentation": "lines",
126
+ "matches": {
127
+ "sub-001": [[0, [3, 7]], [1, [12]]],
128
+ "sub-002": [[0, [1]], [1, [5, 6]]]
129
+ }
130
+ }
131
+ ```
132
+
133
+ Each entry in `matches` maps a subject ID to a list of `[recall_segment_id, [matched_story_segment_ids...]]` pairs.
134
+
135
+ ## Benchmarking
136
+
137
+ Requires [rBench](https://github.com/GabrielKP/rBench):
138
+
139
+ ```sh
140
+ # outside of this dir
141
+ git clone git@github.com:GabrielKP/rBench.git
142
+ ```
143
+
144
+ Add to `.env` or environment:
145
+ ```sh
146
+ BENCHMARK_ROOT="path/to/rBench"
147
+ ```
148
+
149
+ Run:
150
+ ```sh
151
+ uv run src/rmatch/evaluate.py {alice,monthiversary,memsearch}
152
+ ```
153
+
154
+ ---
155
+
156
+ ## API / Documentation
157
+
158
+ ### Input formats
159
+
160
+ **Story file** — a `.txt` or `.json` file containing the story segments to match against.
161
+
162
+ - **`.txt`**: one segment per line (blank lines are ignored).
163
+ - **`.json`**: must contain a `"segments"` array of strings. Optionally includes `"segmentation_method"`.
164
+
165
+ ```json
166
+ {
167
+ "segmentation_method": "sentences",
168
+ "segments": [
169
+ "The cat sat on the mat.",
170
+ "It purred softly."
171
+ ]
172
+ }
173
+ ```
174
+
175
+ **Recall file** — a `.txt` file, a `.json` file, or a **directory** of either.
176
+
177
+ - **`.txt` file**: one recall segment per line. The filename stem is used as the subject ID.
178
+ - **`.json` file**: must contain a `"recalls"` object mapping subject IDs to segment arrays.
179
+ - **Directory**: all `.txt` or all `.json` files inside are loaded (mixing formats is not allowed). Each `.txt` file becomes one subject; `.json` files are merged.
180
+
181
+ ```json
182
+ {
183
+ "segmentation_method": "clauses",
184
+ "recalls": {
185
+ "sub-001": ["A cat was on a mat.", "It was purring."],
186
+ "sub-002": ["There was a cat on something."]
187
+ }
188
+ }
189
+ ```
190
+
191
+ ### CLI reference
192
+
193
+ ```
194
+ rmatch STORY_FILE RECALL_FILE [options]
195
+ ```
196
+
197
+ #### General options
198
+
199
+ - **`STORY_FILE`** *(positional, required)* — Path to the story `.txt` or `.json` file.
200
+ - **`RECALL_FILE`** *(positional, required)* — Path to a recall `.txt`/`.json` file or a directory of them.
201
+ - **`-M`, `--matcher`** *(str)* — Which matcher backend to use. One of: `anthropic`, `openai`, `reranker`, `huggingface`. Default: `anthropic`.
202
+ - **`-m`, `--model-name`** *(str)* — Override the matcher's default model (see defaults below).
203
+ - **`--track-emissions`** — Enable [CodeCarbon](https://github.com/mlco2/codecarbon) carbon-emissions tracking. Results are saved beside the output file.
204
+ - **`-f`, `--overwrite`** — Overwrite the output file if it already exists.
205
+
206
+ #### LLM matcher options (anthropic, openai, huggingface)
207
+
208
+ - **`--window-size`** *(int)* — Number of surrounding recall segments (before and after) to include as context for each target segment. Set to `0` to disable context. Default: `5`.
209
+ - **`--dry-run`** — *anthropic & openai only.* Estimate token usage and cost without making API calls.
210
+
211
+ #### Self-hosted / HuggingFace options
212
+
213
+ - **`-q`, `--quantization`** *(str)* — Load the model in reduced precision: `4bit` (NF4) or `8bit`. Requires `bitsandbytes`.
214
+ - **`-bs`, `--batch-size`** *(int)* — Number of prompts to process in parallel. Default: `4`.
215
+ - **`--max-new-tokens`** *(int)* — Maximum tokens the model may generate per prompt. Default: `64`.
216
+ - **`--verbose-errors`** — Print the raw model output when parsing fails. Useful for debugging prompt issues.
217
+
218
+ #### Reranker options
219
+
220
+ - **`--device`** *(str)* — PyTorch device for the reranker model (e.g. `cpu`, `cuda`, `mps`). Default: auto.
221
+ - **`--threshold`** *(float)* — Minimum similarity score for a story segment to be considered a match. Default: `0.09`.
222
+ - **`--top-k`** *(int)* — Number of top-scoring story candidates to evaluate per recall segment. Default: `5`.
223
+
224
+ ### Default models
225
+
226
+ - **anthropic** — `claude-opus-4-6`
227
+ - **openai** — `gpt-4.1`
228
+ - **reranker** — `BAAI/bge-reranker-v2-m3`
229
+ - **huggingface** — `meta-llama/Llama-3.2-1B-Instruct`
230
+
231
+ ### Python API
232
+
233
+ #### `Matcher` (main entry point)
234
+
235
+ ```python
236
+ from rmatch import Matcher
237
+
238
+ matcher = Matcher(matcher_name="anthropic", model_name=None, **kwargs)
239
+ matches = matcher.match(story_segments, recall_segments)
240
+ ```
241
+
242
+ `Matcher(matcher_name, **kwargs)` is a factory — it returns the appropriate subclass based on `matcher_name`. All keyword arguments are forwarded to the subclass constructor.
243
+
244
+ **Constructor arguments:**
245
+
246
+ - **`model_name`** *(str)* — Override the default model. Applies to all matchers.
247
+ - **`window_size`** *(int)* — Context window radius around the target recall segment. Default: `5`. Applies to: `anthropic`, `openai`, `huggingface`.
248
+ - **`dry_run`** *(bool)* — Estimate cost without calling the API. Applies to: `anthropic`, `openai`.
249
+ - **`api_key`** *(str)* — API key. Falls back to `.env`, then environment variables. Applies to: `anthropic`, `openai`, `huggingface`.
250
+ - **`device`** *(str)* — PyTorch device string. Applies to: `reranker`.
251
+ - **`threshold`** *(float)* — Score threshold for matches. Default: `0.09`. Applies to: `reranker`.
252
+ - **`top_k`** *(int)* — Top-k candidates per recall segment. Default: `5`. Applies to: `reranker`.
253
+ - **`quantization`** *(str)* — `"4bit"` or `"8bit"`. Applies to: `huggingface`.
254
+ - **`batch_size`** *(int)* — Batch size for inference. Default: `4`. Applies to: `huggingface`.
255
+ - **`max_new_tokens`** *(int)* — Max generated tokens. Default: `64`. Applies to: `huggingface`.
256
+ - **`verbose_errors`** *(bool)* — Log raw output on parse failures. Applies to: `huggingface`.
257
+
258
+ **`matcher.match(story_segments, recall_segments)`**
259
+
260
+ - **`story_segments`** *(list[str])* — Ordered list of story segments (the ground-truth story elements).
261
+ - **`recall_segments`** *(list[str])* — Ordered list of a single participant's recall segments.
262
+
263
+ Returns `list[tuple[int, list[int]]]` — one entry per recall segment:
264
+
265
+ ```python
266
+ [
267
+ (0, [2, 5]), # recall segment 0 matched story segments 2 and 5
268
+ (1, []), # recall segment 1 had no matches
269
+ (2, [0]), # recall segment 2 matched story segment 0
270
+ ]
271
+ ```
272
+
273
+ #### `run_matching` (file-level convenience)
274
+
275
+ ```python
276
+ from rmatch.match import run_matching
277
+
278
+ results = run_matching(
279
+ story_file, # Path — story .txt or .json
280
+ recall_file, # Path — recall file or directory
281
+ matcher_name, # str — "anthropic", "openai", "reranker", "huggingface"
282
+ track_emissions, # bool — enable CodeCarbon tracking
283
+ story_name=None, # str | None — override auto-detected story name
284
+ story_segmentation=None, # str | None — override detected segmentation method
285
+ recall_segmentation=None, # str | None — override detected segmentation method
286
+ overwrite=False, # bool — overwrite existing output file
287
+ **kwargs, # forwarded to the Matcher constructor (model_name, window_size, etc.)
288
+ )
289
+ ```
290
+
291
+ Loads story and recall files, runs matching for every subject, and saves a JSON results file. Returns the output dictionary.
rmatch-0.2.0/README.md ADDED
@@ -0,0 +1,254 @@
1
+ <h1 align="center">rMatch</h1>
2
+
3
+ <p align="center">Automatic recall & story matching tool.</p>
4
+
5
+ <p align="center">
6
+ <a href="https://www.python.org/"><img alt="" src="https://img.shields.io/badge/code-Python-blue?logo=Python"></a>
7
+ <a href="https://docs.astral.sh/ruff/"><img alt="Ruff" src="https://img.shields.io/badge/code%20style-Ruff-green?logo=Ruff"></a>
8
+ <a href="https://docs.astral.sh/uv/"><img alt="packaging framework: uv" src="https://img.shields.io/badge/packaging-uv-lightblue?logo=uv"></a>
9
+ <a href="https://pre-commit.com/"><img alt="pre-commit" src="https://img.shields.io/badge/tool-Pre%20Commit-yellow?logo=Pre-Commit"></a>
10
+ </p>
11
+
12
+ ## Quick start
13
+
14
+ ### Command line
15
+
16
+ ```sh
17
+ pip install rmatch
18
+
19
+ # single recall file
20
+ rmatch story.txt recall.txt --matcher anthropic
21
+
22
+ # directory of recall files (one per subject)
23
+ rmatch story.txt recalls/ --matcher anthropic
24
+
25
+ # estimate API cost without sending requests
26
+ rmatch story.txt recalls/ --matcher openai --dry-run
27
+ ```
28
+
29
+ ### Python API
30
+
31
+ ```python
32
+ from rmatch import Matcher
33
+
34
+ matcher = Matcher(matcher_name="anthropic", api_key="your_api_key")
35
+ matches = matcher.match(
36
+ story_segments=["The cat sat on the mat.", "It purred softly."],
37
+ recall_segments=["A cat was on a mat."],
38
+ )
39
+ # [(0, [0])] — recall segment 0 matched story segment 0
40
+ ```
41
+
42
+ Or use `run_matching` to load files, run matching, and save results in one call:
43
+
44
+ ```python
45
+ from rmatch.match import run_matching
46
+
47
+ results = run_matching(
48
+ story_file="story.txt",
49
+ recall_file="recalls/",
50
+ matcher_name="anthropic",
51
+ api_key="your_api_key",
52
+ )
53
+ ```
54
+
55
+ ## Setup API keys
56
+
57
+ API keys are resolved in this order (first match wins):
58
+
59
+ 1. **`api_key` argument** passed directly in Python
60
+ 2. **`.env` file** in the current working directory
61
+ 3. **Environment variables** already set in your shell
62
+
63
+ Set them as environment variables:
64
+
65
+ ```sh
66
+ export ANTHROPIC_API_KEY="your_api_key" # for --matcher anthropic (default)
67
+ export OPENAI_API_KEY="your_api_key" # for --matcher openai
68
+ export HF_TOKEN="your_hf_token" # for --matcher huggingface
69
+ ```
70
+
71
+ Or put a `.env` file in your working directory:
72
+
73
+ ```sh
74
+ ANTHROPIC_API_KEY="your_api_key"
75
+ OPENAI_API_KEY="your_api_key"
76
+ HF_TOKEN="your_hf_token"
77
+ ```
78
+
79
+ ## Output format
80
+
81
+ A JSON file with:
82
+
83
+ ```json
84
+ {
85
+ "matcher_name": "anthropic",
86
+ "story_name": "story",
87
+ "story_segmentation": "lines",
88
+ "recall_segmentation": "lines",
89
+ "matches": {
90
+ "sub-001": [[0, [3, 7]], [1, [12]]],
91
+ "sub-002": [[0, [1]], [1, [5, 6]]]
92
+ }
93
+ }
94
+ ```
95
+
96
+ Each entry in `matches` maps a subject ID to a list of `[recall_segment_id, [matched_story_segment_ids...]]` pairs.
97
+
98
+ ## Benchmarking
99
+
100
+ Requires [rBench](https://github.com/GabrielKP/rBench):
101
+
102
+ ```sh
103
+ # outside of this dir
104
+ git clone git@github.com:GabrielKP/rBench.git
105
+ ```
106
+
107
+ Add to `.env` or environment:
108
+ ```sh
109
+ BENCHMARK_ROOT="path/to/rBench"
110
+ ```
111
+
112
+ Run:
113
+ ```sh
114
+ uv run src/rmatch/evaluate.py {alice,monthiversary,memsearch}
115
+ ```
116
+
117
+ ---
118
+
119
+ ## API / Documentation
120
+
121
+ ### Input formats
122
+
123
+ **Story file** — a `.txt` or `.json` file containing the story segments to match against.
124
+
125
+ - **`.txt`**: one segment per line (blank lines are ignored).
126
+ - **`.json`**: must contain a `"segments"` array of strings. Optionally includes `"segmentation_method"`.
127
+
128
+ ```json
129
+ {
130
+ "segmentation_method": "sentences",
131
+ "segments": [
132
+ "The cat sat on the mat.",
133
+ "It purred softly."
134
+ ]
135
+ }
136
+ ```
137
+
138
+ **Recall file** — a `.txt` file, a `.json` file, or a **directory** of either.
139
+
140
+ - **`.txt` file**: one recall segment per line. The filename stem is used as the subject ID.
141
+ - **`.json` file**: must contain a `"recalls"` object mapping subject IDs to segment arrays.
142
+ - **Directory**: all `.txt` or all `.json` files inside are loaded (mixing formats is not allowed). Each `.txt` file becomes one subject; `.json` files are merged.
143
+
144
+ ```json
145
+ {
146
+ "segmentation_method": "clauses",
147
+ "recalls": {
148
+ "sub-001": ["A cat was on a mat.", "It was purring."],
149
+ "sub-002": ["There was a cat on something."]
150
+ }
151
+ }
152
+ ```
153
+
154
+ ### CLI reference
155
+
156
+ ```
157
+ rmatch STORY_FILE RECALL_FILE [options]
158
+ ```
159
+
160
+ #### General options
161
+
162
+ - **`STORY_FILE`** *(positional, required)* — Path to the story `.txt` or `.json` file.
163
+ - **`RECALL_FILE`** *(positional, required)* — Path to a recall `.txt`/`.json` file or a directory of them.
164
+ - **`-M`, `--matcher`** *(str)* — Which matcher backend to use. One of: `anthropic`, `openai`, `reranker`, `huggingface`. Default: `anthropic`.
165
+ - **`-m`, `--model-name`** *(str)* — Override the matcher's default model (see defaults below).
166
+ - **`--track-emissions`** — Enable [CodeCarbon](https://github.com/mlco2/codecarbon) carbon-emissions tracking. Results are saved beside the output file.
167
+ - **`-f`, `--overwrite`** — Overwrite the output file if it already exists.
168
+
169
+ #### LLM matcher options (anthropic, openai, huggingface)
170
+
171
+ - **`--window-size`** *(int)* — Number of surrounding recall segments (before and after) to include as context for each target segment. Set to `0` to disable context. Default: `5`.
172
+ - **`--dry-run`** — *anthropic & openai only.* Estimate token usage and cost without making API calls.
173
+
174
+ #### Self-hosted / HuggingFace options
175
+
176
+ - **`-q`, `--quantization`** *(str)* — Load the model in reduced precision: `4bit` (NF4) or `8bit`. Requires `bitsandbytes`.
177
+ - **`-bs`, `--batch-size`** *(int)* — Number of prompts to process in parallel. Default: `4`.
178
+ - **`--max-new-tokens`** *(int)* — Maximum tokens the model may generate per prompt. Default: `64`.
179
+ - **`--verbose-errors`** — Print the raw model output when parsing fails. Useful for debugging prompt issues.
180
+
181
+ #### Reranker options
182
+
183
+ - **`--device`** *(str)* — PyTorch device for the reranker model (e.g. `cpu`, `cuda`, `mps`). Default: auto.
184
+ - **`--threshold`** *(float)* — Minimum similarity score for a story segment to be considered a match. Default: `0.09`.
185
+ - **`--top-k`** *(int)* — Number of top-scoring story candidates to evaluate per recall segment. Default: `5`.
186
+
187
+ ### Default models
188
+
189
+ - **anthropic** — `claude-opus-4-6`
190
+ - **openai** — `gpt-4.1`
191
+ - **reranker** — `BAAI/bge-reranker-v2-m3`
192
+ - **huggingface** — `meta-llama/Llama-3.2-1B-Instruct`
193
+
194
+ ### Python API
195
+
196
+ #### `Matcher` (main entry point)
197
+
198
+ ```python
199
+ from rmatch import Matcher
200
+
201
+ matcher = Matcher(matcher_name="anthropic", model_name=None, **kwargs)
202
+ matches = matcher.match(story_segments, recall_segments)
203
+ ```
204
+
205
+ `Matcher(matcher_name, **kwargs)` is a factory — it returns the appropriate subclass based on `matcher_name`. All keyword arguments are forwarded to the subclass constructor.
206
+
207
+ **Constructor arguments:**
208
+
209
+ - **`model_name`** *(str)* — Override the default model. Applies to all matchers.
210
+ - **`window_size`** *(int)* — Context window radius around the target recall segment. Default: `5`. Applies to: `anthropic`, `openai`, `huggingface`.
211
+ - **`dry_run`** *(bool)* — Estimate cost without calling the API. Applies to: `anthropic`, `openai`.
212
+ - **`api_key`** *(str)* — API key. Falls back to `.env`, then environment variables. Applies to: `anthropic`, `openai`, `huggingface`.
213
+ - **`device`** *(str)* — PyTorch device string. Applies to: `reranker`.
214
+ - **`threshold`** *(float)* — Score threshold for matches. Default: `0.09`. Applies to: `reranker`.
215
+ - **`top_k`** *(int)* — Top-k candidates per recall segment. Default: `5`. Applies to: `reranker`.
216
+ - **`quantization`** *(str)* — `"4bit"` or `"8bit"`. Applies to: `huggingface`.
217
+ - **`batch_size`** *(int)* — Batch size for inference. Default: `4`. Applies to: `huggingface`.
218
+ - **`max_new_tokens`** *(int)* — Max generated tokens. Default: `64`. Applies to: `huggingface`.
219
+ - **`verbose_errors`** *(bool)* — Log raw output on parse failures. Applies to: `huggingface`.
220
+
221
+ **`matcher.match(story_segments, recall_segments)`**
222
+
223
+ - **`story_segments`** *(list[str])* — Ordered list of story segments (the ground-truth story elements).
224
+ - **`recall_segments`** *(list[str])* — Ordered list of a single participant's recall segments.
225
+
226
+ Returns `list[tuple[int, list[int]]]` — one entry per recall segment:
227
+
228
+ ```python
229
+ [
230
+ (0, [2, 5]), # recall segment 0 matched story segments 2 and 5
231
+ (1, []), # recall segment 1 had no matches
232
+ (2, [0]), # recall segment 2 matched story segment 0
233
+ ]
234
+ ```
235
+
236
+ #### `run_matching` (file-level convenience)
237
+
238
+ ```python
239
+ from rmatch.match import run_matching
240
+
241
+ results = run_matching(
242
+ story_file, # Path — story .txt or .json
243
+ recall_file, # Path — recall file or directory
244
+ matcher_name, # str — "anthropic", "openai", "reranker", "huggingface"
245
+ track_emissions, # bool — enable CodeCarbon tracking
246
+ story_name=None, # str | None — override auto-detected story name
247
+ story_segmentation=None, # str | None — override detected segmentation method
248
+ recall_segmentation=None, # str | None — override detected segmentation method
249
+ overwrite=False, # bool — overwrite existing output file
250
+ **kwargs, # forwarded to the Matcher constructor (model_name, window_size, etc.)
251
+ )
252
+ ```
253
+
254
+ Loads story and recall files, runs matching for every subject, and saves a JSON results file. Returns the output dictionary.
@@ -0,0 +1,80 @@
1
+ from argparse import ArgumentParser
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+
6
+
7
+ def import_cyoa(cyoa_path: Path | str):
8
+ cyoa_path = Path(cyoa_path)
9
+ if not cyoa_path.exists():
10
+ raise FileNotFoundError(f"Cyoa path {cyoa_path} does not exist")
11
+
12
+ for base_story in ["alice", "monthiversary"]:
13
+ # 1. recall data
14
+ recall_dir = cyoa_path / base_story / "3_pasv"
15
+ recall_paths = sorted(list(recall_dir.glob("*recall.xlsx")))
16
+
17
+ for recall_path in recall_paths:
18
+ # filename/storyname
19
+ filestem_splits = recall_path.stem.split("_")
20
+ story_version = filestem_splits[0][-(len(filestem_splits[0]) - 2) :]
21
+ story_name = f"{base_story}_{story_version}"
22
+ sub_id = filestem_splits[1]
23
+ assert filestem_splits[2].startswith("rate-recall")
24
+
25
+ output_dir_recalls_segmented = (
26
+ Path("data") / "cyoa" / story_name / "recalls" / "segmentation"
27
+ )
28
+ output_dir_recalls_segmented.mkdir(parents=True, exist_ok=True)
29
+
30
+ recall_df = pd.read_excel(recall_path)
31
+
32
+ recall_df = recall_df.rename(
33
+ columns={
34
+ "recalled_events": "events",
35
+ "recall_in_temporal_order": "text",
36
+ }
37
+ )
38
+ recall_df["segment"] = list(range(1, len(recall_df) + 1))
39
+ recall_df = recall_df[["segment", "events", "text"]]
40
+
41
+ recall_df.to_csv(
42
+ output_dir_recalls_segmented / f"{sub_id}.csv",
43
+ index=False,
44
+ )
45
+
46
+ # 2. story transcript data
47
+ transcript_paths = sorted(list(recall_dir.glob("*events.xlsx")))
48
+ for transcript_path in transcript_paths:
49
+ if transcript_path.stem.startswith("~$"):
50
+ continue
51
+ # filename/storyname
52
+ filestem_splits = transcript_path.stem.split("_")
53
+ story_version = filestem_splits[0][-(len(filestem_splits[0]) - 2) :]
54
+ story_name = f"{base_story}_{story_version}"
55
+ assert filestem_splits[2].startswith("events")
56
+
57
+ # output dir
58
+ output_dir_transcripts = Path("data") / "cyoa" / story_name / "transcripts"
59
+ output_dir_transcripts.mkdir(parents=True, exist_ok=True)
60
+
61
+ # load data
62
+ transcript_df = pd.read_excel(transcript_path)
63
+ transcript_df = transcript_df.rename(columns={"story_texts": "text"})
64
+
65
+ transcript_df = transcript_df[["event", "text"]]
66
+
67
+ transcript_df.to_csv(
68
+ output_dir_transcripts / f"{story_name}.csv",
69
+ index=False,
70
+ )
71
+
72
+
73
if __name__ == "__main__":
    # CLI entry point: parse the download location and run the import.
    arg_parser = ArgumentParser()
    arg_parser.add_argument("--cyoa-path", type=str, default="downloads/cyoa")
    cli_args = arg_parser.parse_args()

    import_cyoa(cyoa_path=cli_args.cyoa_path)