PyPI - sudregex - Versions diffs - 0.1.0__tar.gz - Mend

sudregex 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

sudregex-0.1.0/LICENSE +5 -0
sudregex-0.1.0/PKG-INFO +203 -0
sudregex-0.1.0/README.md +160 -0
sudregex-0.1.0/pyproject.toml +61 -0
sudregex-0.1.0/setup.cfg +4 -0
sudregex-0.1.0/sudregex/__init__.py +392 -0
sudregex-0.1.0/sudregex/checklist.py +234 -0
sudregex-0.1.0/sudregex/cli.py +162 -0
sudregex-0.1.0/sudregex/helper.py +663 -0
sudregex-0.1.0/sudregex/termslist.py +199 -0
sudregex-0.1.0/sudregex/validation.py +187 -0
sudregex-0.1.0/sudregex.egg-info/PKG-INFO +203 -0
sudregex-0.1.0/sudregex.egg-info/SOURCES.txt +15 -0
sudregex-0.1.0/sudregex.egg-info/dependency_links.txt +1 -0
sudregex-0.1.0/sudregex.egg-info/entry_points.txt +2 -0
sudregex-0.1.0/sudregex.egg-info/requires.txt +24 -0
sudregex-0.1.0/sudregex.egg-info/top_level.txt +1 -0

sudregex-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,5 @@
+MIT License
+Copyright (c) 2025 QuantitativeNurse Lab | Vanderbilt Medical Center

sudregex-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,203 @@
+Metadata-Version: 2.4
+Name: sudregex
+Version: 0.1.0
+Summary: Regex-driven extraction with negation for clinical text (SUD-focused).
+Author-email: Quantitative Nurse Lab <quantitativenurse@gmail.com>
+License: MIT
+Project-URL: Homepage, https://github.com/quantitativenurse/sud-regex
+Project-URL: Issues, https://github.com/quantitativenurse/sud-regex/issues
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pandas>=1.5
+Requires-Dist: numpy>=1.21
+Provides-Extra: dev
+Requires-Dist: black==25.1.0; extra == "dev"
+Requires-Dist: flake8==7.3.0; extra == "dev"
+Requires-Dist: isort==6.0.1; extra == "dev"
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: build; extra == "dev"
+Requires-Dist: twine; extra == "dev"
+Provides-Extra: viz
+Requires-Dist: matplotlib>=3.6; extra == "viz"
+Provides-Extra: yaml
+Requires-Dist: pyyaml>=6; extra == "yaml"
+Provides-Extra: parallel
+Requires-Dist: pandarallel>=1.6; extra == "parallel"
+Provides-Extra: all
+Requires-Dist: matplotlib>=3.6; extra == "all"
+Requires-Dist: pyyaml>=6; extra == "all"
+Requires-Dist: pandarallel>=1.6; extra == "all"
+Dynamic: license-file
+[![CI](https://github.com/quantitativenurse/sud-regex/actions/workflows/lint.yml/badge.svg)](https://github.com/quantitativenurse/sud-regex/actions)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
+# sudregex
+> **Version:** 0.1.0
+A lightweight, high-throughput pipeline for regex-driven extraction with negation and false-positive pruning—built for Substance Use Disorder (SUD) research, but flexible enough for general clinical text mining.
+---
+## ✨ Features
+- **Negation detection** – Filter matches when preceded by cues (e.g., “no”, “denies”, “not”).
+- **False-positive pruning** – Drop matches in noisy contexts (e.g., **discharge instructions**, **family history**).
+- **Substance context window** – Confirm that matches occur near a user-supplied vocabulary (e.g., opioid, alcohol terms).
+- **Line-break normalization** – Remove literal markers (default `"$+$"`) and collapse whitespace.
+- **Batteries included** – A ready-to-use “ABC” checklist for common SUD signals.
+- **CLI & Python API** – Use from shell scripts or notebooks.
+- **Deterministic previews** – Sampling uses a fixed seed for reproducible tests.
+---
+## 📦 Installation
+```bash
+# From PyPI (enable after publish)
+pip install sudregex
+# From source (dev)
+git clone https://github.com/quantitativenurse/sud-regex.git
+cd sud-regex
+python -m venv .venv && source .venv/bin/activate
+pip install -U pip
+pip install -e .[dev]   # installs sudregex + black, isort, flake8, pytest, etc.
+```
+---
+## Usage
+- For interactive usage on notebooks refer to our tutorial <link>
+### Quick Start (CLI)
+```bash
+sudregex --help
+# Run extraction (CSV with commas) using the default pruning behavior:
+sudregex --extract \
+  --in_file path/to/notes.csv \
+  --out_file path/to/results.csv \
+  --checklist path/to/checklist.py \
+  --termslist path/to/termslist.py \
+  --terms_active alcohol_terms,opioid_terms \
+  --separator , \
+  --parallel --n-workers 2
+```
+### Discharge-instruction pruning
+By default, sudregex **excludes** matches that occur in discharge-instruction contexts.
+- **Default:** no flag needed, or explicit:
+```bash
+  sudregex --extract ... --exclude-discharge-mentions
+# Turn pruning OFF (keep discharge-context hits):
+sudregex --extract \
+  --in_file path/to/notes.csv \
+  --out_file path/to/results_raw.csv \
+  --checklist path/to/checklist.py \
+  --termslist path/to/termslist.py \
+  --terms_active alcohol_terms \
+  --no-exclude-discharge-mentions
+```
+### Use a custom separator (example: a unique token unlikely to appear in notes):
+Clinical notes often contain commas, semicolons, tabs and other common punctuation marks as part of natural language. Using these as delimiters can lead to unintended splits and parsing errors, especially when extracting structured information from note text fields.
+In our work, we use the custom marker |^| because:
+  It is highly unlikely to appear naturally in clinical documentation.
+  It provides a clear, unambiguous boundary between segments.
+  It avoids conflicts with commonly used punctuation, improving extraction accuracy.
+  It simplifies line-break normalization and downstream processing.
+This choice ensures that our pipeline remains robust across diverse note formats.
+```bash
+sudregex --extract \
+  --in_file path/to/notes.txt \
+  --out_file path/to/results.csv \
+  --checklist path/to/checklist.py \
+  --termslist path/to/termslist.py \
+  --terms_active opioid_terms \
+  --separator $'|^|'    # or any safe custom delimiter
+```
+---
+### Quickstart (Python API)
+```python
+import sudregex as sud
+# Use the packaged defaults if desired
+checklist = sud.checklist_abc
+terms = sud.default_termslist
+# DataFrame API
+df_results = sud.extract_df(
+    df=my_notes_df,                  # columns: note_id, note_text (and optional grid)
+    checklist=checklist,
+    termslist=terms,
+    terms_active="alcohol_terms,opioid_terms",
+    parallel=True,                   # <— enable parallel apply (if pandarallel is installed)
+    n_workers=2,
+    include_note_text=False,
+    exclude_discharge_mentions=True, # default True; set False to disable pruning
+)
+# File API (CSV/TSV/…)
+result = sud.extract(
+    in_file="notes.csv",
+    out_file="results.csv",
+    checklist="path/to/checklist.py",
+    separator=",",
+    termslist="path/to/termslist.py",
+    terms_active="opioid_terms",
+    parallel=True,
+    n_workers=2,
+    include_note_text=False,
+    exclude_discharge_mentions=False, # keep raw matches even in discharge contexts
+)
+```
+---
+The packaged default checklist and termslist are exposed as module attributes:
+checklist = sud.checklist_abc
+checklist
+termslist = sud.default_termslist
+termslist
+---
+## License
+MIT – see LICENSE for details.
+## 📣 Citation / Acknowledgements
+If **sudregex** is useful in your work, please cite:
+Quantitative Nurse Lab. (2025). *sudregex* (Version 0.1.0). GitHub. https://github.com/quantitativenurse/sud-regex
+**Acknowledgements:**
+Thanks to all contributors and collaborators for feedback and testing.
+---

sudregex-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,160 @@
+[![CI](https://github.com/quantitativenurse/sud-regex/actions/workflows/lint.yml/badge.svg)](https://github.com/quantitativenurse/sud-regex/actions)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
+# sudregex
+> **Version:** 0.1.0
+A lightweight, high-throughput pipeline for regex-driven extraction with negation and false-positive pruning—built for Substance Use Disorder (SUD) research, but flexible enough for general clinical text mining.
+---
+## ✨ Features
+- **Negation detection** – Filter matches when preceded by cues (e.g., “no”, “denies”, “not”).
+- **False-positive pruning** – Drop matches in noisy contexts (e.g., **discharge instructions**, **family history**).
+- **Substance context window** – Confirm that matches occur near a user-supplied vocabulary (e.g., opioid, alcohol terms).
+- **Line-break normalization** – Remove literal markers (default `"$+$"`) and collapse whitespace.
+- **Batteries included** – A ready-to-use “ABC” checklist for common SUD signals.
+- **CLI & Python API** – Use from shell scripts or notebooks.
+- **Deterministic previews** – Sampling uses a fixed seed for reproducible tests.
+---
+## 📦 Installation
+```bash
+# From PyPI (enable after publish)
+pip install sudregex
+# From source (dev)
+git clone https://github.com/quantitativenurse/sud-regex.git
+cd sud-regex
+python -m venv .venv && source .venv/bin/activate
+pip install -U pip
+pip install -e .[dev]   # installs sudregex + black, isort, flake8, pytest, etc.
+```
+---
+## Usage
+- For interactive usage on notebooks refer to our tutorial <link>
+### Quick Start (CLI)
+```bash
+sudregex --help
+# Run extraction (CSV with commas) using the default pruning behavior:
+sudregex --extract \
+  --in_file path/to/notes.csv \
+  --out_file path/to/results.csv \
+  --checklist path/to/checklist.py \
+  --termslist path/to/termslist.py \
+  --terms_active alcohol_terms,opioid_terms \
+  --separator , \
+  --parallel --n-workers 2
+```
+### Discharge-instruction pruning
+By default, sudregex **excludes** matches that occur in discharge-instruction contexts.
+- **Default:** no flag needed, or explicit:
+```bash
+  sudregex --extract ... --exclude-discharge-mentions
+# Turn pruning OFF (keep discharge-context hits):
+sudregex --extract \
+  --in_file path/to/notes.csv \
+  --out_file path/to/results_raw.csv \
+  --checklist path/to/checklist.py \
+  --termslist path/to/termslist.py \
+  --terms_active alcohol_terms \
+  --no-exclude-discharge-mentions
+```
+### Use a custom separator (example: a unique token unlikely to appear in notes):
+Clinical notes often contain commas, semicolons, tabs and other common punctuation marks as part of natural language. Using these as delimiters can lead to unintended splits and parsing errors, especially when extracting structured information from note text fields.
+In our work, we use the custom marker |^| because:
+  It is highly unlikely to appear naturally in clinical documentation.
+  It provides a clear, unambiguous boundary between segments.
+  It avoids conflicts with commonly used punctuation, improving extraction accuracy.
+  It simplifies line-break normalization and downstream processing.
+This choice ensures that our pipeline remains robust across diverse note formats.
+```bash
+sudregex --extract \
+  --in_file path/to/notes.txt \
+  --out_file path/to/results.csv \
+  --checklist path/to/checklist.py \
+  --termslist path/to/termslist.py \
+  --terms_active opioid_terms \
+  --separator $'|^|'    # or any safe custom delimiter
+```
+---
+### Quickstart (Python API)
+```python
+import sudregex as sud
+# Use the packaged defaults if desired
+checklist = sud.checklist_abc
+terms = sud.default_termslist
+# DataFrame API
+df_results = sud.extract_df(
+    df=my_notes_df,                  # columns: note_id, note_text (and optional grid)
+    checklist=checklist,
+    termslist=terms,
+    terms_active="alcohol_terms,opioid_terms",
+    parallel=True,                   # <— enable parallel apply (if pandarallel is installed)
+    n_workers=2,
+    include_note_text=False,
+    exclude_discharge_mentions=True, # default True; set False to disable pruning
+)
+# File API (CSV/TSV/…)
+result = sud.extract(
+    in_file="notes.csv",
+    out_file="results.csv",
+    checklist="path/to/checklist.py",
+    separator=",",
+    termslist="path/to/termslist.py",
+    terms_active="opioid_terms",
+    parallel=True,
+    n_workers=2,
+    include_note_text=False,
+    exclude_discharge_mentions=False, # keep raw matches even in discharge contexts
+)
+```
+---
+The packaged default checklist and termslist are exposed as module attributes:
+checklist = sud.checklist_abc
+checklist
+termslist = sud.default_termslist
+termslist
+---
+## License
+MIT – see LICENSE for details.
+## 📣 Citation / Acknowledgements
+If **sudregex** is useful in your work, please cite:
+Quantitative Nurse Lab. (2025). *sudregex* (Version 0.1.0). GitHub. https://github.com/quantitativenurse/sud-regex
+**Acknowledgements:**
+Thanks to all contributors and collaborators for feedback and testing.
+---

sudregex-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,61 @@
+[build-system]
+requires = ["setuptools>=69", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "sudregex"
+version = "0.1.0"
+description = "Regex-driven extraction with negation for clinical text (SUD-focused)."
+readme = { file = "README.md", content-type = "text/markdown" }
+requires-python = ">=3.9"
+license = { text = "MIT" }
+authors = [{ name = "Quantitative Nurse Lab", email = "quantitativenurse@gmail.com" }]
+dependencies = [
+  "pandas>=1.5",
+  "numpy>=1.21",
+]
+classifiers = [
+  "Development Status :: 3 - Alpha",
+  "Intended Audience :: Science/Research",
+  "License :: OSI Approved :: MIT License",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3 :: Only",
+  "Programming Language :: Python :: 3.9",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Topic :: Scientific/Engineering :: Information Analysis",
+  "Operating System :: OS Independent",
+]
+[project.optional-dependencies]
+dev = [
+  "black==25.1.0",
+  "flake8==7.3.0",
+  "isort==6.0.1",
+  "pytest",
+  "build",
+  "twine",
+]
+viz = ["matplotlib>=3.6"]
+yaml = ["pyyaml>=6"]
+parallel = ["pandarallel>=1.6"]     # optional: for --parallel / n_workers
+all = ["matplotlib>=3.6", "pyyaml>=6", "pandarallel>=1.6"]  # include parallel
+[project.scripts]
+sudregex = "sudregex.cli:main"
+[project.urls]
+Homepage = "https://github.com/quantitativenurse/sud-regex"
+Issues = "https://github.com/quantitativenurse/sud-regex/issues"
+[tool.setuptools.packages.find]
+include = ["sudregex*"]
+[tool.black]
+line-length = 120
+target-version = ["py311"]
+[tool.isort]
+profile = "black"
+line_length = 120

sudregex-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0