konsepy 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konsepy/__init__.py +2 -0
- konsepy/bio_tag.py +106 -0
- konsepy/cli.py +54 -0
- konsepy/constants.py +4 -0
- konsepy/get_text_snippets.py +86 -0
- konsepy/importer.py +49 -0
- konsepy/regex.py +118 -0
- konsepy/run_all.py +57 -0
- konsepy/rxutils.py +30 -0
- konsepy/textio.py +87 -0
- konsepy/types.py +9 -0
- konsepy-0.0.1.dist-info/METADATA +34 -0
- konsepy-0.0.1.dist-info/RECORD +14 -0
- konsepy-0.0.1.dist-info/WHEEL +4 -0
konsepy/__init__.py
ADDED
konsepy/bio_tag.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Usage: python bio_tag.py --outdir /path/to/output --infiles infile.csv infile.sas7bdat
|
|
3
|
+
|
|
4
|
+
Post-processing:
|
|
5
|
+
* Identify which `count`/index values should be removed based on noise rules
|
|
6
|
+
* Remove these from the output jsonl file
|
|
7
|
+
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import csv
|
|
11
|
+
import json
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
import spacy
|
|
15
|
+
|
|
16
|
+
from konsepy.cli import add_outdir_and_infiles
|
|
17
|
+
from konsepy.constants import NOTETEXT_LABEL, NOTEDATE_LABEL, NOTEID_LABEL, ID_LABEL
|
|
18
|
+
from konsepy.importer import get_all_concepts
|
|
19
|
+
from konsepy.textio import iterate_csv_file
|
|
20
|
+
from konsepy.types import RegexDict
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def format_for_spacy(row_iter):
    """Reshape ``(count, studyid, note_id, note_date, text)`` rows into the
    ``(text, context)`` pairs expected by ``nlp.pipe(..., as_tuples=True)``."""
    for row in row_iter:
        *metadata, text = row
        yield text, tuple(metadata)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def get_pipeline(sentence_model, spacy_model='en_core_web_sm'):
|
|
29
|
+
if sentence_model == 'senter':
|
|
30
|
+
nlp = spacy.load(spacy_model,
|
|
31
|
+
disable=('tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer', 'ner', 'parser'))
|
|
32
|
+
nlp.enable_pipe('senter')
|
|
33
|
+
elif sentence_model == 'parser':
|
|
34
|
+
nlp = spacy.load(spacy_model, disable=('tagger', 'attribute_ruler', 'lemmatizer', 'ner'))
|
|
35
|
+
else:
|
|
36
|
+
raise ValueError(f'Unrecognized `sentence_model`: {sentence_model}.')
|
|
37
|
+
return nlp
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def build_regex_dict(package_name):
    """Map each concept name in *package_name* to its regex-running function."""
    if not package_name:
        raise ValueError('Specify name of package.')
    mapping = {}
    for concept in get_all_concepts(package_name):
        mapping[concept.name] = concept.run_func
    return mapping
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def get_bio_tags(input_files, outdir: Path, *, package_name: str = None, regexes: RegexDict = None,
                 sentence_model='senter',
                 id_label=ID_LABEL, noteid_label=NOTEID_LABEL,
                 notedate_label=NOTEDATE_LABEL, notetext_label=NOTETEXT_LABEL):
    """Run concept regexes sentence-by-sentence and write match data for BIO tagging.

    Splits each note into sentences with spacy, runs every regex function in
    ``regexes`` (or those discovered in ``package_name``) against each sentence,
    and writes two files to ``outdir``: ``bio_tag_data.csv`` (one row per regex
    match) and ``bio_tag_data.jsonl`` (one line per sentence, with its matches).
    """
    outdir.mkdir(exist_ok=True)

    # prepare sentence splitter
    nlp = get_pipeline(sentence_model)

    # explicit `regexes` wins; otherwise discover them from the package
    regexes = regexes or build_regex_dict(package_name)

    with (
            open(outdir / 'bio_tag_data.csv', 'w', newline='') as out,
            open(outdir / 'bio_tag_data.jsonl', 'w') as jsonl,
    ):
        writer = csv.DictWriter(
            out,
            fieldnames=['index', 'studyid', 'note_id', 'sentence_id', 'domain', 'category', 'capture', 'start', 'end']
        )
        writer.writeheader()
        i = 0  # global match counter; used as the removable `index` for post-processing
        for doc, (count, studyid, note_id, note_date) in nlp.pipe(format_for_spacy(
                iterate_csv_file(
                    input_files, id_label=id_label, noteid_label=noteid_label, notedate_label=notedate_label,
                    notetext_label=notetext_label,
                )
        ), as_tuples=True):
            # metadata repeated on every output row for this note
            constant_meta = {
                'studyid': studyid,
                'note_id': note_id,
            }
            for sent_id, sentence in enumerate(doc.sents):
                sentence = str(sentence)
                curr_note = {  # records for this text note
                    'results': [],
                    'text': sentence,
                    'sentence_id': sent_id,
                }
                for domain, regex_func in regexes.items():
                    # regex_func yields (category, matched_text, start, end)
                    for category, capture, start, end in regex_func(sentence):
                        data = {
                            'index': i,
                            'domain': domain,
                            'category': category,
                            'capture': capture,
                            'start': start,
                            'end': end,
                        }
                        writer.writerow(data | constant_meta | {'sentence_id': sent_id})
                        curr_note['results'].append(data)
                        i += 1
                # write 1 line per input sentence
                curr_note['results'] = sorted(curr_note['results'], key=lambda x: x['start'])
                jsonl.write(json.dumps(constant_meta | curr_note) + '\n')
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
if __name__ == '__main__':
    # CLI entry point: parse --outdir/--input-files/column-label options and run tagging.
    get_bio_tags(**vars(add_outdir_and_infiles().parse_args()))
|
konsepy/cli.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from konsepy.constants import NOTETEXT_LABEL, NOTEDATE_LABEL, NOTEID_LABEL, ID_LABEL
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def concept_cli(func):
    """Parse the common konsepy command line and immediately call *func* with it.

    Note: this invokes *func* at call time (it is not a decorator that returns
    a wrapped function).
    """
    arg_parser = argparse.ArgumentParser(fromfile_prefix_chars='@!')
    add_common_cli(arg_parser)
    parsed_kwargs = vars(arg_parser.parse_args())
    func(**parsed_kwargs)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def snippet_cli():
    """Build and parse the snippet-extraction command line; return kwargs as a dict."""
    arg_parser = argparse.ArgumentParser(fromfile_prefix_chars='@!')
    arg_parser.add_argument('--concept-name', dest='concept_name', required=True,
                            help='Name of concept to run regexes for.')
    arg_parser.add_argument('--regexes', nargs='+',
                            help=r'REGEX_NAME==(?:re(?:gex)\sto\s(?:search|look)\sfor')
    arg_parser.add_argument('--stop-after-regex-count', dest='stop_after_regex_count', default=None,
                            help='change to number if you want to limit number of regex "hits"; else keep None')
    add_common_cli(arg_parser)
    parsed = arg_parser.parse_args()
    return vars(parsed)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def add_common_cli(parser: argparse.ArgumentParser):
    """Attach the options shared by all konsepy command lines to *parser*."""
    add_outdir_and_infiles(parser)
    # (flag, argparse keyword arguments) pairs, added in declaration order
    shared_options = [
        ('--require-regex',
         dict(default=None,
              help='Output text containing this regex but in which no regexes were found.')),
        ('--start-after',
         dict(default=0, type=int,
              help='Start after skipping this many records')),
        ('--stop-after',
         dict(default=None, type=int,
              help='change to number if you want to limit number of notes searched through; else None.')),
        ('--select-probability',
         dict(default=1.0, type=float,
              help='Set to less than 1.0 to increase note sample (e.g., 0.3); 1.0=don\'t skip anything')),
        ('--window-size',
         dict(default=50, type=int,
              help='Change the window for the pre/post contexts')),
    ]
    for flag, options in shared_options:
        parser.add_argument(flag, **options)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def add_outdir_and_infiles(parser: argparse.ArgumentParser = None):
    """Add output-directory, input-file, and column-label options.

    Creates a new ArgumentParser when none is supplied, and returns the parser
    so callers can write ``add_outdir_and_infiles().parse_args()``.
    """
    if not parser:
        parser = argparse.ArgumentParser(fromfile_prefix_chars='@!')
    parser.add_argument('--outdir', type=Path, default=Path('.'),
                        help='Directory to place output files.')
    parser.add_argument('--input-files', nargs='+', type=str, default=list(),
                        help='Input CSV or SAS file(s) to read.')
    parser.add_argument('--id-label', default=ID_LABEL,
                        help='Column label for individual id')
    # Fix: the next three help strings were copy-pasted as
    # 'Column label for individual id'; describe the actual columns.
    parser.add_argument('--noteid-label', default=NOTEID_LABEL,
                        help='Column label for note id')
    parser.add_argument('--notedate-label', default=NOTEDATE_LABEL,
                        help='Column label for note date')
    parser.add_argument('--notetext-label', default=NOTETEXT_LABEL,
                        help='Column label for note text')
    return parser
|
konsepy/get_text_snippets.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import datetime
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
from loguru import logger
|
|
6
|
+
|
|
7
|
+
from konsepy.cli import snippet_cli
|
|
8
|
+
from konsepy.constants import NOTEDATE_LABEL, ID_LABEL, NOTEID_LABEL, NOTETEXT_LABEL
|
|
9
|
+
from konsepy.importer import get_all_concepts
|
|
10
|
+
from konsepy.textio import iterate_csv_file
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def get_text_snippets(input_files, outdir, regexes, *, start_after=0, stop_after=None, window_size=50,
                      id_label=ID_LABEL, noteid_label=NOTEID_LABEL,
                      notedate_label=NOTEDATE_LABEL, notetext_label=NOTETEXT_LABEL,
                      select_probability=1.0, label='snippets', stop_after_regex_count=None):
    """Write a CSV of regex matches with surrounding context windows.

    :param regexes: iterable of (name, regex) pairs; regex may be a pattern
        string (compiled here, case-insensitive) or a compiled pattern.
    :param window_size: characters of pre/post context around each match.
    :param stop_after_regex_count: stop once this many matches have been written.
    """
    dt = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    logger.warning('Snippets will have spaces normalized:'
                   ' multiple spaces/newlines/tabs will be converted'
                   ' to a single space in the output.')
    # Fix: compile string patterns once up front instead of once per note
    # inside the read loop.
    regexes = [(name, re.compile(regex, re.I) if isinstance(regex, str) else regex)
               for name, regex in regexes]
    rx_count = 0
    outdir.mkdir(exist_ok=True)
    with open(outdir / f'{label}_{dt}.csv', 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(['id', 'studyid', 'note_id', 'date', 'regex_name', 'precontext', 'term', 'postcontext'])
        for _, studyid, note_id, note_date, text in iterate_csv_file(
                input_files, start_after=start_after, stop_after=stop_after,
                id_label=id_label, noteid_label=noteid_label,
                notetext_label=notetext_label, notedate_label=notedate_label,
                select_probability=select_probability
        ):
            text = ' '.join(text.split())  # remove newlines, etc. (bad for snippets in Excel)
            for name, regex in regexes:
                for m in regex.finditer(text):
                    precontext = text[max(m.start() - window_size, 0):m.start()]
                    postcontext = text[m.end():m.end() + window_size]
                    writer.writerow([
                        rx_count,  # id
                        studyid,
                        note_id,
                        note_date,
                        name,
                        precontext,
                        m.group(),  # term
                        postcontext,
                    ])
                    rx_count += 1
                    if stop_after_regex_count and rx_count >= stop_after_regex_count:
                        return
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def get_text_snippets_for_concept_algorithm(package, input_files, outdir, *, concept_name=None, start_after=0,
                                            stop_after=None,
                                            window_size=50, regexes=None,
                                            id_label=ID_LABEL, noteid_label=NOTEID_LABEL,
                                            notedate_label=NOTEDATE_LABEL, notetext_label=NOTETEXT_LABEL,
                                            select_probability=1.0, label='snippets', stop_after_regex_count=None,
                                            **kwargs):
    """Collect a concept's regexes and run `get_text_snippets` over the input files.

    Fix: `concept.regexes` stores (regex, category) pairs, but `get_text_snippets`
    unpacks each pair as (name, regex) and calls `.finditer` on the second item —
    the original (regex, category) order crashed on the category. The pair is
    therefore swapped here so the category labels the `regex_name` column.
    """
    regexes = [(category, regex)
               for concept in get_all_concepts(package, concept_name)
               for regex, category in concept.regexes]

    get_text_snippets(input_files, outdir, regexes,
                      start_after=start_after,
                      stop_after=stop_after,
                      window_size=window_size,
                      select_probability=select_probability,
                      id_label=id_label,
                      noteid_label=noteid_label,
                      notedate_label=notedate_label,
                      notetext_label=notetext_label,
                      label=label,
                      stop_after_regex_count=stop_after_regex_count,
                      )
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
if __name__ == '__main__':
    kwargs = snippet_cli()
    # Reuse the concept name as the output-file label ('concept' if absent).
    kwargs['label'] = kwargs.get('concept_name', 'concept')

    if kwargs.get('concept_name', None):
        # NOTE(review): get_text_snippets_for_concept_algorithm requires a
        # `package` argument that snippet_cli() does not provide — confirm how
        # this entry point is expected to be invoked.
        get_text_snippets_for_concept_algorithm(**kwargs)
    else:
        get_text_snippets(**kwargs)
|
konsepy/importer.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import importlib
|
|
2
|
+
import pkgutil
|
|
3
|
+
from enum import EnumType
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from loguru import logger
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ConceptImport:
    """Wrapper around a single concept module located under ``<package>.concepts``.

    The concept module must expose ``RUN_REGEXES_FUNC`` (callable taking text)
    and ``REGEXES``, and define exactly one Enum of categories.
    """

    def __init__(self, module_info, package_name):
        self.name = module_info.name
        self.imp = importlib.import_module(f'{package_name}.concepts.{self.name}')
        self.category_enum = self._get_category()
        self.run_func = self.imp.RUN_REGEXES_FUNC
        self.regexes = self.imp.REGEXES

    def run(self, sentence):
        # Fix: the result of run_func was previously discarded; return it so
        # callers can consume the matches.
        return self.run_func(sentence)

    @property
    def domain(self):
        # The concept's module name doubles as its domain label.
        return self.name

    @property
    def categories(self):
        return [category.name for category in self.category_enum]

    def _get_category(self):
        # The first Enum class defined in the module is taken as the category set.
        for name, value in self.imp.__dict__.items():
            if isinstance(value, EnumType):
                return value
        raise ValueError(f'Unable to identify category enum for concept "{self.name}".')

    def __str__(self):
        return f'ConceptImport<{self.name}>'
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def get_all_concepts(package_name: str, *concepts):
    """Yield a ConceptImport for each module under ``<package_name>.concepts``.

    When any *concepts* names are supplied, only those modules are loaded.
    Modules that fail to load (no category enum) are logged and skipped.
    """
    concepts_pkg = importlib.import_module(f'{package_name}.concepts')
    pkg_dir = Path(concepts_pkg.__file__).parent
    for module_info in pkgutil.iter_modules([pkg_dir]):
        if concepts and module_info.name not in concepts:
            continue  # look for only requested concepts if any supplied
        try:
            yield ConceptImport(module_info, package_name)
        except ValueError as err:
            logger.warning(f'Failed to load concept: {package_name}.concepts.{module_info.name}')
            logger.exception(err)
|
konsepy/regex.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import re
|
|
3
|
+
from collections import Counter, defaultdict
|
|
4
|
+
|
|
5
|
+
from konsepy.constants import NOTEDATE_LABEL, ID_LABEL, NOTEID_LABEL, NOTETEXT_LABEL
|
|
6
|
+
from konsepy.textio import iterate_csv_file, output_results
|
|
7
|
+
from loguru import logger
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def run_regex_on_files(input_files, regex_func, *, start_after=0, stop_after=None,
                       require_regex=None, window_size=50,
                       id_label=ID_LABEL, noteid_label=NOTEID_LABEL,
                       notedate_label=NOTEDATE_LABEL, notetext_label=NOTETEXT_LABEL,
                       select_probability=1.0):
    """Run `regex_func` against every note and tally match categories.

    Returns a 5-tuple:
    (Counter of category match counts across all notes,
     category -> set of MRNs with at least one match,
     Counter of normalized texts/snippets in which no category matched,
     mrn -> Counter of categories,
     (mrn, note_id) -> Counter of categories).
    """
    count = 0  # default value; received from forloop below
    cat_counter_notes = Counter()  # category -> total match count
    cat_counter_mrns = defaultdict(set)  # category -> MRNs with a match
    noteid_to_cat = defaultdict(Counter)  # (mrn, note_id) -> category counts
    mrn_to_cat = defaultdict(Counter)  # mrn -> category counts
    unique_mrns = set()  # MRNs matching any category (progress logging only)
    not_found_text = Counter()  # texts/snippets where nothing matched
    if require_regex:
        # pattern string compiled once, case-insensitive
        require_regex = re.compile(require_regex, re.I)
    for count, mrn, note_id, note_date, text in iterate_csv_file(
            input_files, start_after=start_after, stop_after=stop_after,
            id_label=id_label, noteid_label=noteid_label,
            notedate_label=notedate_label, notetext_label=notetext_label,
            select_probability=select_probability
    ):
        if count % 10000 == 0:  # periodic progress logging
            logger.info(
                f'Completed {count} records: {len(unique_mrns)} MRNs contain any category ({datetime.datetime.now()})')
        extract_categories(
            mrn, note_id, text, regex_func,
            cat_counter_mrns=cat_counter_mrns, cat_counter_notes=cat_counter_notes,
            mrn_to_cat=mrn_to_cat, require_regex=require_regex,
            not_found_text=not_found_text, noteid_to_cat=noteid_to_cat,
            unique_mrns=unique_mrns, window_size=window_size
        )
    logger.info(f'Finished. Total records: {count} ({datetime.datetime.now()})')
    return cat_counter_notes, cat_counter_mrns, not_found_text, mrn_to_cat, noteid_to_cat
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def extract_categories(mrn, note_id, text, regex_func, *,
                       cat_counter_mrns=None, cat_counter_notes=None, mrn_to_cat=None,
                       not_found_text=None, noteid_to_cat=None,
                       require_regex=None, unique_mrns=None, window_size=50):
    """Run `regex_func` over one note's text and fold matches into the accumulators.

    When nothing matches and `not_found_text` is supplied, the (normalized)
    text — or, with `require_regex`, windows around each required-regex hit —
    is counted for later review.
    """
    categories = list(regex_func(text))
    for category in categories:
        mrn_to_cat[mrn][category] += 1
        noteid_to_cat[(mrn, note_id)][category] += 1
        cat_counter_notes[category] += 1
        cat_counter_mrns[category].add(mrn)
    if categories:
        unique_mrns.add(mrn)
    # Fix: the original tested `and not_found_text`, but a freshly-created
    # (empty) Counter is falsy, so missed-text snippets were never recorded.
    if not categories and not_found_text is not None:
        if require_regex:
            for m in require_regex.finditer(text):
                start_snippet = max(0, m.start() - window_size)
                end_snippet = m.end() + window_size
                snippet = text[start_snippet:end_snippet + 1]
                not_found_text[' '.join(snippet.split())] += 1
        else:
            not_found_text[' '.join(text.split())] += 1
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def run_regex_and_output(name, input_files, outdir, regex_func, *category_enums,
                         start_after=0, stop_after=None, require_regex=None, window_size=50,
                         id_label=ID_LABEL, noteid_label=NOTEID_LABEL,
                         notedate_label=NOTEDATE_LABEL, notetext_label=NOTETEXT_LABEL,
                         select_probability=1.0):
    """Run `regex_func` over the input files and write summary CSVs.

    Creates a timestamped subdirectory of `outdir` named ``<name>_<timestamp>``,
    adds a log file there, then delegates counting to `run_regex_on_files` and
    file output to `output_results`.
    """
    dt = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    curr_outdir = outdir / f'{name}_{dt}'  # unique per-run output directory
    curr_outdir.mkdir(parents=True)
    logger.add(curr_outdir / f'{name}_{dt}.log')  # mirror log output into the run directory
    note_counter, cat_counter_mrns, not_found_text, mrn_to_cat, note_to_cat = run_regex_on_files(
        input_files, regex_func, start_after=start_after, stop_after=stop_after, require_regex=require_regex,
        window_size=window_size,
        id_label=id_label, noteid_label=noteid_label,
        notedate_label=notedate_label, notetext_label=notetext_label,
        select_probability=select_probability
    )
    output_results(curr_outdir, not_found_text=not_found_text, note_counter=note_counter,
                   cat_counter_mrns=cat_counter_mrns, category_enums=category_enums,
                   note_to_cat=note_to_cat, mrn_to_cat=mrn_to_cat)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def search_first_regex(regexes):
    """For each regex, only return first instance (use search)"""

    def _search_first_regex(text):
        yield from (category for regex, category in regexes if regex.search(text))

    return _search_first_regex
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def search_all_regex(regexes):
    """For each regex, return all (use finditer)"""

    def _search_all_regex(text):
        yield from (category
                    for regex, category in regexes
                    for _match in regex.finditer(text))

    return _search_all_regex
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def get_all_regex_by_index(regexes):
    """For each regex, return all results, including indices"""

    def _get_all_regex_by_index(text):
        for regex, category in regexes:
            for match in regex.finditer(text):
                start, end = match.span()
                yield category.name, match.group(), start, end

    return _get_all_regex_by_index
|
konsepy/run_all.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import pathlib
|
|
3
|
+
from collections import Counter, defaultdict
|
|
4
|
+
|
|
5
|
+
from loguru import logger
|
|
6
|
+
|
|
7
|
+
from konsepy.cli import add_outdir_and_infiles
|
|
8
|
+
from konsepy.constants import NOTEDATE_LABEL, ID_LABEL, NOTEID_LABEL, NOTETEXT_LABEL
|
|
9
|
+
from konsepy.importer import get_all_concepts
|
|
10
|
+
from konsepy.regex import extract_categories
|
|
11
|
+
from konsepy.textio import iterate_csv_file, output_results
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def run_all(input_files, outdir: pathlib.Path, package_name: str, *,
            id_label=ID_LABEL, noteid_label=NOTEID_LABEL,
            notedate_label=NOTEDATE_LABEL, notetext_label=NOTETEXT_LABEL,
            ):
    """Run every concept discovered in `package_name` over all notes.

    Creates a timestamped ``run_all_<timestamp>`` subdirectory of `outdir`,
    accumulates per-category/per-MRN/per-note counts across all concepts, and
    writes summary CSVs via `output_results`.
    """
    dt = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    curr_outdir = outdir / f'run_all_{dt}'  # unique per-run output directory
    curr_outdir.mkdir(parents=True)
    logger.add(curr_outdir / f'run_all_{dt}.log')  # mirror log output into the run directory
    count = 0  # default value; received from forloop below
    cat_counter_notes = Counter()  # category -> total match count
    cat_counter_mrns = defaultdict(set)  # category -> MRNs with a match
    noteid_to_cat = defaultdict(Counter)  # (mrn, note_id) -> category counts
    mrn_to_cat = defaultdict(Counter)  # mrn -> category counts
    unique_mrns = set()  # MRNs matching any category (progress logging only)
    concepts = list(get_all_concepts(package_name))
    logger.info(f'Loaded {len(concepts)} concepts for processing.')
    for count, studyid, note_id, note_date, text in iterate_csv_file(
            input_files,
            id_label=id_label, noteid_label=noteid_label,
            notedate_label=notedate_label, notetext_label=notetext_label,
    ):
        if count % 10000 == 0:  # periodic progress logging
            logger.info(f'Completed {count} records for {len(unique_mrns)} MRNs ({datetime.datetime.now()})')

        for concept in concepts:
            extract_categories(
                studyid, note_id, text, concept.run_func,
                cat_counter_mrns=cat_counter_mrns, cat_counter_notes=cat_counter_notes,
                mrn_to_cat=mrn_to_cat, noteid_to_cat=noteid_to_cat,
                unique_mrns=unique_mrns
            )
    logger.info(f'Finished. Total records: {count} ({datetime.datetime.now()})')
    output_results(curr_outdir, note_counter=cat_counter_notes,
                   cat_counter_mrns=cat_counter_mrns,
                   category_enums=[c.category_enum for c in concepts],
                   note_to_cat=noteid_to_cat, mrn_to_cat=mrn_to_cat)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
if __name__ == '__main__':
    import argparse

    # CLI entry point for running all concepts over the input files.
    parser = argparse.ArgumentParser(fromfile_prefix_chars='@!')
    add_outdir_and_infiles(parser)
    # NOTE(review): run_all requires a positional `package_name`, but the
    # parser does not define one — confirm how this script is invoked.
    run_all(**vars(parser.parse_args()))
|
konsepy/rxutils.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
class RxType(type):
    """Metaclass turning attribute access into regex fragments.

    ``Cls.w`` -> ``r'\\w'``; a numeric suffix adds a quantifier:
    ``Cls.w3`` -> ``r'\\w{0,3}'`` and ``Cls.w2_5`` -> ``r'\\w{2,5}'``.
    """
    MAPPING = {
        'p': '.',
        'w': r'\w',
        'W': r'\W',
        'S': r'\S',
        's': r'\s',
        'o': r'(?:\w+\W*)',  # word
    }

    def __getattr__(cls, item):
        # Only reached when normal attribute lookup fails.
        return cls._parse_item(item)

    def _parse_item(cls, element):
        token, quantifier = element[0], element[1:]
        fragment = cls.MAPPING[token]
        if quantifier:
            fragment += cls._parse_numbers(*quantifier.split('_'))
        return fragment

    def _parse_numbers(cls, *nums):
        # A single number means an upper bound: {0,n}; two mean {lo,hi}.
        if len(nums) == 1:
            lower, upper = 0, nums[0]
        else:
            lower, upper = nums
        return rf'{{{lower},{upper}}}'
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class Rx(metaclass=RxType):
    # Convenience entry point: attribute access (e.g. ``Rx.w2_5``) is resolved
    # by RxType.__getattr__ into a regex fragment such as r'\w{2,5}'.
    pass
|
konsepy/textio.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Simplify reading input files by creating an iterating wrapper.
|
|
3
|
+
"""
|
|
4
|
+
import csv
|
|
5
|
+
import random
|
|
6
|
+
|
|
7
|
+
from loguru import logger
|
|
8
|
+
from sas7bdat import SAS7BDAT
|
|
9
|
+
|
|
10
|
+
from konsepy.constants import NOTEDATE_LABEL, ID_LABEL, NOTEID_LABEL, NOTETEXT_LABEL
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def iterate_csv_file(input_files, *, start_after=0, stop_after=None,
                     id_label=ID_LABEL, noteid_label=NOTEID_LABEL,
                     notedate_label=NOTEDATE_LABEL, notetext_label=NOTETEXT_LABEL,
                     select_probability=1.0, encoding='latin1'):
    """Yield ``(count, mrn, note_id, date, text)`` for each row of each input file.

    Files ending in ``sas7bdat`` are read as SAS datasets; everything else as CSV.
    `select_probability` < 1.0 randomly samples rows; `start_after` skips that
    many (sampled) rows; `stop_after` limits the number of rows yielded.
    """
    count = 0  # rows yielded
    total_count = 0  # rows seen after sampling (drives start_after)
    for input_file in input_files:
        func = _extract_sas_file if input_file.endswith('sas7bdat') else _extract_csv_file
        for mrn, text, note_id, date in func(input_file, encoding, id_label, noteid_label, notedate_label,
                                             notetext_label):
            if random.random() > select_probability:
                continue  # randomly skipped to sample the corpus
            total_count += 1
            if start_after >= total_count:
                continue
            count += 1
            yield count, mrn, note_id, date, text
            # Fix: `count > stop_after` yielded one row too many (stop_after + 1).
            if stop_after and count >= stop_after:
                return
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _extract_sas_file(input_file, encoding, id_label, noteid_label, notedate_label, notetext_label):
    """Yield ``(mrn, text, note_id, date)`` for each data row of a SAS dataset.

    The first row is the header; subsequent rows are indexed by the column
    labels supplied by the caller.
    """
    with SAS7BDAT(input_file, skip_header=False, encoding=encoding) as fh:
        header = []
        for row in fh:
            if not header:
                header = row  # first row is the column header
                continue
            mrn = row[header.index(id_label)]
            date = row[header.index(notedate_label)] if notedate_label else ''
            # Fix: `text` was read from the noteid column and `noteid` from the
            # notetext column (swapped relative to _extract_csv_file).
            text = row[header.index(notetext_label)]
            noteid = row[header.index(noteid_label)]
            yield mrn, text, noteid, date
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _extract_csv_file(input_file, encoding, id_label, noteid_label, notedate_label, notetext_label):
|
|
50
|
+
with open(input_file, encoding=encoding) as fh:
|
|
51
|
+
for row in csv.DictReader(fh):
|
|
52
|
+
text = row[notetext_label]
|
|
53
|
+
mrn = row[id_label]
|
|
54
|
+
date = row.get(notedate_label, '')
|
|
55
|
+
note_id = row[noteid_label]
|
|
56
|
+
yield mrn, text, note_id, date
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def output_results(outdir, *, not_found_text=None,
                   note_counter=None, cat_counter_mrns=None,
                   category_enums=None, note_to_cat=None, mrn_to_cat=None):
    """Write the summary CSVs for a regex run into *outdir*.

    Produces snippets.csv (when `not_found_text` is given), category_counts.csv,
    mrn_category_counts.csv, and notes_category_counts.csv.
    """
    # flatten all enum members across the supplied category enums
    categories = [member for category_enum in category_enums for member in category_enum]

    if not_found_text is not None:
        with open(outdir / 'snippets.csv', 'w', newline='') as out:
            snippet_writer = csv.writer(out)
            snippet_writer.writerow(['count', 'snippet'])
            for snippet, count in not_found_text.most_common():
                snippet_writer.writerow([count, ' '.join(snippet.split())])

    with open(outdir / 'category_counts.csv', 'w', newline='') as out:
        count_writer = csv.writer(out)
        count_writer.writerow(['category', 'note_count', 'mrn_count'])
        for cat in categories:
            count_writer.writerow([cat, note_counter[cat], len(cat_counter_mrns[cat])])

    with open(outdir / 'mrn_category_counts.csv', 'w', newline='') as out:
        mrn_writer = csv.DictWriter(out, ['mrn'] + categories)
        mrn_writer.writeheader()
        for mrn, counts in mrn_to_cat.items():
            mrn_writer.writerow({'mrn': mrn} | dict(counts))

    with open(outdir / 'notes_category_counts.csv', 'w', newline='') as out:
        note_writer = csv.DictWriter(out, ['mrn', 'note_id'] + categories)
        note_writer.writeheader()
        for (mrn, note), counts in note_to_cat.items():
            note_writer.writerow({'mrn': mrn, 'note_id': note} | dict(counts))
    logger.info(f'Unique MRNs: {len(mrn_to_cat)}')
|
konsepy-0.0.1.dist-info/METADATA
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: konsepy
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Framework for building NLP information extraction systems using regular expressions.
|
|
5
|
+
Keywords: nlp
|
|
6
|
+
Author-email: dcronkite <dcronkite+pypi@gmail.com>
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Intended Audience :: Healthcare Industry
|
|
17
|
+
Requires-Dist: loguru
|
|
18
|
+
Requires-Dist: pytest
|
|
19
|
+
Requires-Dist: sas7bdat
|
|
20
|
+
Requires-Dist: spacy ; extra == "ssplit"
|
|
21
|
+
Project-URL: Home, https://github.com/kpwhri/konsepy
|
|
22
|
+
Provides-Extra: ssplit
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# konsepy
|
|
26
|
+
|
|
27
|
+
Framework for building NLP information extraction systems using regular expressions.
|
|
28
|
+
|
|
29
|
+
## Usage
|
|
30
|
+
|
|
31
|
+
For now, find documentation for this library (and a template to download) from https://github.com/kpwhri/konsepy.
|
|
32
|
+
|
|
33
|
+
* Download the template
|
|
34
|
+
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
konsepy/__init__.py,sha256=VfLugWKDQ_0IMhBVncNB6hmBljR4-v3bwKh0N5PSfKw,112
|
|
2
|
+
konsepy/bio_tag.py,sha256=sv1aIgO26Sw0bShUDVVDgWkCRF9nP485ChTCs5qeROY,4059
|
|
3
|
+
konsepy/cli.py,sha256=3avT_aMFapWFWu-33nNv7SPqulWEOk7SrsNogRQd9SY,2817
|
|
4
|
+
konsepy/constants.py,sha256=1zPrlq1fHHOLDtE1cQNS0C9BEiWdqJbZDI_Bb3n0wrw,108
|
|
5
|
+
konsepy/get_text_snippets.py,sha256=aAO-gdsm4X6I0jv1eALd3DViy_q9XqOBcK7j72t0_Qc,4111
|
|
6
|
+
konsepy/importer.py,sha256=B3gIaSsIprrM9mWsbqPHFQC_vtg6yQUKJow2cEiMhKI,1613
|
|
7
|
+
konsepy/regex.py,sha256=Ik78Gf3pV_hNH_XmhggyzMGwIx1pwfWkaqgonWn1Smo,5204
|
|
8
|
+
konsepy/run_all.py,sha256=jOemBSQPiHNF2o0zbUYciX_Y6EwiTikyKA5Z9pe0YA4,2447
|
|
9
|
+
konsepy/rxutils.py,sha256=NLOHL_VPcPc57Ve5X-RMCcNSgPtsGXPbSzYRMFVhg6M,684
|
|
10
|
+
konsepy/textio.py,sha256=G5DyWLlgkDbyaXvG_uu0ahuX_SZee7giosmX1vChT2M,3825
|
|
11
|
+
konsepy/types.py,sha256=xquwcA0PbW2biyL_5i8W7747Bl1pv9Kh7hLzIuxRjNc,211
|
|
12
|
+
konsepy-0.0.1.dist-info/WHEEL,sha256=rSgq_JpHF9fHR1lx53qwg_1-2LypZE_qmcuXbVUq948,81
|
|
13
|
+
konsepy-0.0.1.dist-info/METADATA,sha256=cqowIiSjninoCe3ZcK2HUCfPm52lDF-cYharqKhKbgU,1121
|
|
14
|
+
konsepy-0.0.1.dist-info/RECORD,,
|