sudregex 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sudregex-0.1.0/LICENSE ADDED
@@ -0,0 +1,5 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 QuantitativeNurse Lab | Vanderbilt Medical Center
4
+
5
+
@@ -0,0 +1,203 @@
1
+ Metadata-Version: 2.4
2
+ Name: sudregex
3
+ Version: 0.1.0
4
+ Summary: Regex-driven extraction with negation for clinical text (SUD-focused).
5
+ Author-email: Quantitative Nurse Lab <quantitativenurse@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/quantitativenurse/sud-regex
8
+ Project-URL: Issues, https://github.com/quantitativenurse/sud-regex/issues
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3 :: Only
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
19
+ Classifier: Operating System :: OS Independent
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: pandas>=1.5
24
+ Requires-Dist: numpy>=1.21
25
+ Provides-Extra: dev
26
+ Requires-Dist: black==25.1.0; extra == "dev"
27
+ Requires-Dist: flake8==7.3.0; extra == "dev"
28
+ Requires-Dist: isort==6.0.1; extra == "dev"
29
+ Requires-Dist: pytest; extra == "dev"
30
+ Requires-Dist: build; extra == "dev"
31
+ Requires-Dist: twine; extra == "dev"
32
+ Provides-Extra: viz
33
+ Requires-Dist: matplotlib>=3.6; extra == "viz"
34
+ Provides-Extra: yaml
35
+ Requires-Dist: pyyaml>=6; extra == "yaml"
36
+ Provides-Extra: parallel
37
+ Requires-Dist: pandarallel>=1.6; extra == "parallel"
38
+ Provides-Extra: all
39
+ Requires-Dist: matplotlib>=3.6; extra == "all"
40
+ Requires-Dist: pyyaml>=6; extra == "all"
41
+ Requires-Dist: pandarallel>=1.6; extra == "all"
42
+ Dynamic: license-file
43
+
44
+ [![CI](https://github.com/quantitativenurse/sud-regex/actions/workflows/lint.yml/badge.svg)](https://github.com/quantitativenurse/sud-regex/actions)
45
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
46
+
47
+ # sudregex
48
+
49
+ > **Version:** 0.1.0
50
+
51
+ A lightweight, high-throughput pipeline for regex-driven extraction with negation and false-positive pruning—built for Substance Use Disorder (SUD) research, but flexible enough for general clinical text mining.
52
+
53
+ ---
54
+
55
+ ## ✨ Features
56
+
57
+ - **Negation detection** – Filter matches when preceded by cues (e.g., “no”, “denies”, “not”).
58
+ - **False-positive pruning** – Drop matches in noisy contexts (e.g., **discharge instructions**, **family history**).
59
+ - **Substance context window** – Confirm that matches occur near a user-supplied vocabulary (e.g., opioid, alcohol terms).
60
+ - **Line-break normalization** – Remove literal markers (default `"$+$"`) and collapse whitespace.
61
+ - **Batteries included** – A ready-to-use “ABC” checklist for common SUD signals.
62
+ - **CLI & Python API** – Use from shell scripts or notebooks.
63
+ - **Deterministic previews** – Sampling uses a fixed seed for reproducible tests.
64
+
65
+ ---
66
+
67
+
68
+ ## 📦 Installation
69
+
70
+ ```bash
71
+ # From PyPI (enable after publish)
72
+ pip install sudregex
73
+
74
+
75
+ # From source (dev)
76
+ git clone https://github.com/quantitativenurse/sud-regex.git
77
+ cd sud-regex
78
+ python -m venv .venv && source .venv/bin/activate
79
+ pip install -U pip
80
+ pip install -e .[dev] # installs sudregex + black, isort, flake8, pytest, etc.
81
+ ```
82
+ ---
83
+
84
+ ## Usage
85
+ - For interactive usage on notebooks refer to our tutorial <link>
86
+
87
+ ### Quick Start (CLI)
88
+
89
+ ```bash
90
+ sudregex --help
91
+ # Run extraction (CSV with commas) using the default pruning behavior:
92
+
93
+ sudregex --extract \
94
+ --in_file path/to/notes.csv \
95
+ --out_file path/to/results.csv \
96
+ --checklist path/to/checklist.py \
97
+ --termslist path/to/termslist.py \
98
+ --terms_active alcohol_terms,opioid_terms \
99
+ --separator , \
100
+ --parallel --n-workers 2
101
+ ```
102
+ ### Discharge-instruction pruning
103
+
104
+ By default, sudregex **excludes** matches that occur in discharge-instruction contexts.
105
+
106
+ - **Default:** no flag needed, or explicit:
107
+ ```bash
108
+ sudregex --extract ... --exclude-discharge-mentions
109
+
110
+ # Turn pruning OFF (keep discharge-context hits):
111
+
112
+ sudregex --extract \
113
+ --in_file path/to/notes.csv \
114
+ --out_file path/to/results_raw.csv \
115
+ --checklist path/to/checklist.py \
116
+ --termslist path/to/termslist.py \
117
+ --terms_active alcohol_terms \
118
+ --no-exclude-discharge-mentions
119
+ ```
120
+
121
+ ### Use a custom separator (example: a unique token unlikely to appear in notes):
122
+
123
+ Clinical notes often contain commas, semicolons, tabs and other common punctuation marks as part of natural language. Using these as delimiters can lead to unintended splits and parsing errors, especially when extracting structured information from note text fields.
124
+ In our work, we use the custom marker `|^|` because:
125
+
126
+ - It is highly unlikely to appear naturally in clinical documentation.
127
+ - It provides a clear, unambiguous boundary between segments.
128
+ - It avoids conflicts with commonly used punctuation, improving extraction accuracy.
129
+ - It simplifies line-break normalization and downstream processing.
130
+
131
+ This choice ensures that our pipeline remains robust across diverse note formats.
132
+ ```bash
133
+ sudregex --extract \
134
+ --in_file path/to/notes.txt \
135
+ --out_file path/to/results.csv \
136
+ --checklist path/to/checklist.py \
137
+ --termslist path/to/termslist.py \
138
+ --terms_active opioid_terms \
139
+ --separator $'|^|' # or any safe custom delimiter
140
+ ```
141
+ ---
142
+
143
+ ### Quickstart (Python API)
144
+ ```python
145
+ import sudregex as sud
146
+
147
+ # Use the packaged defaults if desired
148
+ checklist = sud.checklist_abc
149
+ terms = sud.default_termslist
150
+
151
+ # DataFrame API
152
+ df_results = sud.extract_df(
153
+ df=my_notes_df, # columns: note_id, note_text (and optional grid)
154
+ checklist=checklist,
155
+ termslist=terms,
156
+ terms_active="alcohol_terms,opioid_terms",
157
+ parallel=True, # <— enable parallel apply (if pandarallel is installed)
158
+ n_workers=2,
159
+ include_note_text=False,
160
+ exclude_discharge_mentions=True, # default True; set False to disable pruning
161
+ )
162
+
163
+ # File API (CSV/TSV/…)
164
+ result = sud.extract(
165
+ in_file="notes.csv",
166
+ out_file="results.csv",
167
+ checklist="path/to/checklist.py",
168
+ separator=",",
169
+ termslist="path/to/termslist.py",
170
+ terms_active="opioid_terms",
171
+ parallel=True,
172
+ n_workers=2,
173
+ include_note_text=False,
174
+ exclude_discharge_mentions=False, # keep raw matches even in discharge contexts
175
+ )
176
+
177
+ ```
178
+ ---
179
+
180
+ The packaged default checklist and termslist can be loaded and inspected as shown below.
181
+
182
+ checklist = sud.checklist_abc
183
+
184
+ checklist
185
+
186
+ termslist = sud.default_termslist
187
+
188
+ termslist
189
+
190
+ ---
191
+
192
+ ## License
193
+ MIT – see LICENSE for details.
194
+
195
+ ## 📣 Citation / Acknowledgements
196
+
197
+ If **sudregex** is useful in your work, please cite:
198
+
199
+ Quantitative Nurse Lab. (2025). *sudregex* (Version 0.1.0). GitHub. https://github.com/quantitativenurse/sud-regex
200
+
201
+ **Acknowledgements:**
202
+ Thanks to all contributors and collaborators for feedback and testing.
203
+ ---
@@ -0,0 +1,160 @@
1
+ [![CI](https://github.com/quantitativenurse/sud-regex/actions/workflows/lint.yml/badge.svg)](https://github.com/quantitativenurse/sud-regex/actions)
2
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
3
+
4
+ # sudregex
5
+
6
+ > **Version:** 0.1.0
7
+
8
+ A lightweight, high-throughput pipeline for regex-driven extraction with negation and false-positive pruning—built for Substance Use Disorder (SUD) research, but flexible enough for general clinical text mining.
9
+
10
+ ---
11
+
12
+ ## ✨ Features
13
+
14
+ - **Negation detection** – Filter matches when preceded by cues (e.g., “no”, “denies”, “not”).
15
+ - **False-positive pruning** – Drop matches in noisy contexts (e.g., **discharge instructions**, **family history**).
16
+ - **Substance context window** – Confirm that matches occur near a user-supplied vocabulary (e.g., opioid, alcohol terms).
17
+ - **Line-break normalization** – Remove literal markers (default `"$+$"`) and collapse whitespace.
18
+ - **Batteries included** – A ready-to-use “ABC” checklist for common SUD signals.
19
+ - **CLI & Python API** – Use from shell scripts or notebooks.
20
+ - **Deterministic previews** – Sampling uses a fixed seed for reproducible tests.
21
+
22
+ ---
23
+
24
+
25
+ ## 📦 Installation
26
+
27
+ ```bash
28
+ # From PyPI (enable after publish)
29
+ pip install sudregex
30
+
31
+
32
+ # From source (dev)
33
+ git clone https://github.com/quantitativenurse/sud-regex.git
34
+ cd sud-regex
35
+ python -m venv .venv && source .venv/bin/activate
36
+ pip install -U pip
37
+ pip install -e .[dev] # installs sudregex + black, isort, flake8, pytest, etc.
38
+ ```
39
+ ---
40
+
41
+ ## Usage
42
+ - For interactive usage on notebooks refer to our tutorial <link>
43
+
44
+ ### Quick Start (CLI)
45
+
46
+ ```bash
47
+ sudregex --help
48
+ # Run extraction (CSV with commas) using the default pruning behavior:
49
+
50
+ sudregex --extract \
51
+ --in_file path/to/notes.csv \
52
+ --out_file path/to/results.csv \
53
+ --checklist path/to/checklist.py \
54
+ --termslist path/to/termslist.py \
55
+ --terms_active alcohol_terms,opioid_terms \
56
+ --separator , \
57
+ --parallel --n-workers 2
58
+ ```
59
+ ### Discharge-instruction pruning
60
+
61
+ By default, sudregex **excludes** matches that occur in discharge-instruction contexts.
62
+
63
+ - **Default:** no flag needed, or explicit:
64
+ ```bash
65
+ sudregex --extract ... --exclude-discharge-mentions
66
+
67
+ # Turn pruning OFF (keep discharge-context hits):
68
+
69
+ sudregex --extract \
70
+ --in_file path/to/notes.csv \
71
+ --out_file path/to/results_raw.csv \
72
+ --checklist path/to/checklist.py \
73
+ --termslist path/to/termslist.py \
74
+ --terms_active alcohol_terms \
75
+ --no-exclude-discharge-mentions
76
+ ```
77
+
78
+ ### Use a custom separator (example: a unique token unlikely to appear in notes):
79
+
80
+ Clinical notes often contain commas, semicolons, tabs and other common punctuation marks as part of natural language. Using these as delimiters can lead to unintended splits and parsing errors, especially when extracting structured information from note text fields.
81
+ In our work, we use the custom marker `|^|` because:
82
+
83
+ - It is highly unlikely to appear naturally in clinical documentation.
84
+ - It provides a clear, unambiguous boundary between segments.
85
+ - It avoids conflicts with commonly used punctuation, improving extraction accuracy.
86
+ - It simplifies line-break normalization and downstream processing.
87
+
88
+ This choice ensures that our pipeline remains robust across diverse note formats.
89
+ ```bash
90
+ sudregex --extract \
91
+ --in_file path/to/notes.txt \
92
+ --out_file path/to/results.csv \
93
+ --checklist path/to/checklist.py \
94
+ --termslist path/to/termslist.py \
95
+ --terms_active opioid_terms \
96
+ --separator $'|^|' # or any safe custom delimiter
97
+ ```
98
+ ---
99
+
100
+ ### Quickstart (Python API)
101
+ ```python
102
+ import sudregex as sud
103
+
104
+ # Use the packaged defaults if desired
105
+ checklist = sud.checklist_abc
106
+ terms = sud.default_termslist
107
+
108
+ # DataFrame API
109
+ df_results = sud.extract_df(
110
+ df=my_notes_df, # columns: note_id, note_text (and optional grid)
111
+ checklist=checklist,
112
+ termslist=terms,
113
+ terms_active="alcohol_terms,opioid_terms",
114
+ parallel=True, # <— enable parallel apply (if pandarallel is installed)
115
+ n_workers=2,
116
+ include_note_text=False,
117
+ exclude_discharge_mentions=True, # default True; set False to disable pruning
118
+ )
119
+
120
+ # File API (CSV/TSV/…)
121
+ result = sud.extract(
122
+ in_file="notes.csv",
123
+ out_file="results.csv",
124
+ checklist="path/to/checklist.py",
125
+ separator=",",
126
+ termslist="path/to/termslist.py",
127
+ terms_active="opioid_terms",
128
+ parallel=True,
129
+ n_workers=2,
130
+ include_note_text=False,
131
+ exclude_discharge_mentions=False, # keep raw matches even in discharge contexts
132
+ )
133
+
134
+ ```
135
+ ---
136
+
137
+ The packaged default checklist and termslist can be loaded and inspected as shown below.
138
+
139
+ checklist = sud.checklist_abc
140
+
141
+ checklist
142
+
143
+ termslist = sud.default_termslist
144
+
145
+ termslist
146
+
147
+ ---
148
+
149
+ ## License
150
+ MIT – see LICENSE for details.
151
+
152
+ ## 📣 Citation / Acknowledgements
153
+
154
+ If **sudregex** is useful in your work, please cite:
155
+
156
+ Quantitative Nurse Lab. (2025). *sudregex* (Version 0.1.0). GitHub. https://github.com/quantitativenurse/sud-regex
157
+
158
+ **Acknowledgements:**
159
+ Thanks to all contributors and collaborators for feedback and testing.
160
+ ---
@@ -0,0 +1,61 @@
1
+ [build-system]
2
+ requires = ["setuptools>=69", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "sudregex"
7
+ version = "0.1.0"
8
+ description = "Regex-driven extraction with negation for clinical text (SUD-focused)."
9
+ readme = { file = "README.md", content-type = "text/markdown" }
10
+ requires-python = ">=3.9"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "Quantitative Nurse Lab", email = "quantitativenurse@gmail.com" }]
13
+ dependencies = [
14
+ "pandas>=1.5",
15
+ "numpy>=1.21",
16
+ ]
17
+ classifiers = [
18
+ "Development Status :: 3 - Alpha",
19
+ "Intended Audience :: Science/Research",
20
+ "License :: OSI Approved :: MIT License",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3 :: Only",
23
+ "Programming Language :: Python :: 3.9",
24
+ "Programming Language :: Python :: 3.10",
25
+ "Programming Language :: Python :: 3.11",
26
+ "Programming Language :: Python :: 3.12",
27
+ "Topic :: Scientific/Engineering :: Information Analysis",
28
+ "Operating System :: OS Independent",
29
+ ]
30
+
31
+ [project.optional-dependencies]
32
+ dev = [
33
+ "black==25.1.0",
34
+ "flake8==7.3.0",
35
+ "isort==6.0.1",
36
+ "pytest",
37
+ "build",
38
+ "twine",
39
+ ]
40
+ viz = ["matplotlib>=3.6"]
41
+ yaml = ["pyyaml>=6"]
42
+ parallel = ["pandarallel>=1.6"] # optional: for --parallel / n_workers
43
+ all = ["matplotlib>=3.6", "pyyaml>=6", "pandarallel>=1.6"] # include parallel
44
+
45
+ [project.scripts]
46
+ sudregex = "sudregex.cli:main"
47
+
48
+ [project.urls]
49
+ Homepage = "https://github.com/quantitativenurse/sud-regex"
50
+ Issues = "https://github.com/quantitativenurse/sud-regex/issues"
51
+
52
+ [tool.setuptools.packages.find]
53
+ include = ["sudregex*"]
54
+
55
+ [tool.black]
56
+ line-length = 120
57
+ target-version = ["py311"]
58
+
59
+ [tool.isort]
60
+ profile = "black"
61
+ line_length = 120
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+