konsepy 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
konsepy/__init__.py ADDED
@@ -0,0 +1,2 @@
"""Framework for building NLP information extraction systems using regular expressions."""
__version__ = '0.0.1'
konsepy/bio_tag.py ADDED
@@ -0,0 +1,106 @@
1
+ """
2
+ Usage: python bio_tag.py --outdir /path/to/output --infiles infile.csv infile.sas7bdat
3
+
4
+ Post-processing:
5
+ * Identify which `count`/index values should be removed based on noise rules
6
+ * Remove these from the output jsonl file
7
+
8
+ """
9
+
10
+ import csv
11
+ import json
12
+ from pathlib import Path
13
+
14
+ import spacy
15
+
16
+ from konsepy.cli import add_outdir_and_infiles
17
+ from konsepy.constants import NOTETEXT_LABEL, NOTEDATE_LABEL, NOTEID_LABEL, ID_LABEL
18
+ from konsepy.importer import get_all_concepts
19
+ from konsepy.textio import iterate_csv_file
20
+ from konsepy.types import RegexDict
21
+
22
+
23
def format_for_spacy(row_iter):
    """Adapt (count, studyid, note_id, note_date, text) rows to spaCy's `as_tuples` shape.

    Yields (text, context) pairs where context carries the record metadata.
    """
    for record in row_iter:
        count, studyid, note_id, note_date, text = record
        yield text, (count, studyid, note_id, note_date)
26
+
27
+
28
def get_pipeline(sentence_model, spacy_model='en_core_web_sm'):
    """Load a spaCy pipeline configured only for sentence splitting.

    'senter' keeps just the fast statistical sentence recognizer; 'parser'
    keeps the dependency parser (slower, used for sentence boundaries).
    Raises ValueError for any other `sentence_model` value.
    """
    if sentence_model == 'senter':
        disabled = ('tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer', 'ner', 'parser')
        nlp = spacy.load(spacy_model, disable=disabled)
        nlp.enable_pipe('senter')
        return nlp
    if sentence_model == 'parser':
        return spacy.load(spacy_model, disable=('tagger', 'attribute_ruler', 'lemmatizer', 'ner'))
    raise ValueError(f'Unrecognized `sentence_model`: {sentence_model}.')
38
+
39
+
40
def build_regex_dict(package_name):
    """Map each concept name in *package_name* to its regex-runner function."""
    if not package_name:
        raise ValueError('Specify name of package.')
    regex_dict = {}
    for concept in get_all_concepts(package_name):
        regex_dict[concept.name] = concept.run_func
    return regex_dict
44
+
45
+
46
def get_bio_tags(input_files, outdir: Path, *, package_name: str = None, regexes: RegexDict = None,
                 sentence_model='senter',
                 id_label=ID_LABEL, noteid_label=NOTEID_LABEL,
                 notedate_label=NOTEDATE_LABEL, notetext_label=NOTETEXT_LABEL):
    """Split notes into sentences and record regex hits per sentence.

    Writes two files into *outdir*:
      * bio_tag_data.csv — one row per regex match
      * bio_tag_data.jsonl — one JSON line per sentence with its matches

    :param input_files: CSV/SAS paths passed to `iterate_csv_file`
    :param package_name: package whose concepts supply regexes (used only when
        *regexes* is not supplied)
    :param regexes: mapping of domain name -> regex runner; each runner is
        called with a sentence and yields (category, capture, start, end)
    :param sentence_model: 'senter' or 'parser' (see `get_pipeline`)
    """
    outdir.mkdir(exist_ok=True)

    # prepare sentence splitter
    nlp = get_pipeline(sentence_model)

    # fall back to discovering regexes from the named package
    regexes = regexes or build_regex_dict(package_name)

    with (
        open(outdir / 'bio_tag_data.csv', 'w', newline='') as out,
        open(outdir / 'bio_tag_data.jsonl', 'w') as jsonl,
    ):
        writer = csv.DictWriter(
            out,
            fieldnames=['index', 'studyid', 'note_id', 'sentence_id', 'domain', 'category', 'capture', 'start', 'end']
        )
        writer.writeheader()
        i = 0  # global match index across all notes/sentences
        for doc, (count, studyid, note_id, note_date) in nlp.pipe(format_for_spacy(
            iterate_csv_file(
                input_files, id_label=id_label, noteid_label=noteid_label, notedate_label=notedate_label,
                notetext_label=notetext_label,
            )
        ), as_tuples=True):
            # metadata repeated on every row/line for this note
            constant_meta = {
                'studyid': studyid,
                'note_id': note_id,
            }
            for sent_id, sentence in enumerate(doc.sents):
                sentence = str(sentence)
                curr_note = {  # records for this text note
                    'results': [],
                    'text': sentence,
                    'sentence_id': sent_id,
                }
                for domain, regex_func in regexes.items():
                    for category, capture, start, end in regex_func(sentence):
                        data = {
                            'index': i,
                            'domain': domain,
                            'category': category,
                            'capture': capture,
                            'start': start,
                            'end': end,
                        }
                        writer.writerow(data | constant_meta | {'sentence_id': sent_id})
                        curr_note['results'].append(data)
                        i += 1
                # write 1 line per input sentence
                curr_note['results'] = sorted(curr_note['results'], key=lambda x: x['start'])
                jsonl.write(json.dumps(constant_meta | curr_note) + '\n')
103
+
104
+
105
if __name__ == '__main__':
    # NOTE(review): add_outdir_and_infiles() defines no `package_name` option,
    # so build_regex_dict(None) will raise ValueError when invoked this way
    # without an explicit `regexes` — confirm intended CLI usage.
    get_bio_tags(**vars(add_outdir_and_infiles().parse_args()))
konsepy/cli.py ADDED
@@ -0,0 +1,54 @@
1
+ import argparse
2
+ from pathlib import Path
3
+
4
+ from konsepy.constants import NOTETEXT_LABEL, NOTEDATE_LABEL, NOTEID_LABEL, ID_LABEL
5
+
6
+
7
def concept_cli(func):
    """Parse the shared concept CLI arguments and invoke *func* with them."""
    arg_parser = argparse.ArgumentParser(fromfile_prefix_chars='@!')
    add_common_cli(arg_parser)
    namespace = arg_parser.parse_args()
    func(**vars(namespace))
11
+
12
+
13
def snippet_cli():
    """Build and parse the snippet-extraction CLI; return the arguments as a dict."""
    arg_parser = argparse.ArgumentParser(fromfile_prefix_chars='@!')
    arg_parser.add_argument('--concept-name', dest='concept_name', required=True,
                            help='Name of concept to run regexes for.')
    arg_parser.add_argument('--regexes', nargs='+',
                            help=r'REGEX_NAME==(?:re(?:gex)\sto\s(?:search|look)\sfor')
    arg_parser.add_argument('--stop-after-regex-count', dest='stop_after_regex_count', default=None,
                            help='change to number if you want to limit number of regex "hits"; else keep None')
    add_common_cli(arg_parser)
    return vars(arg_parser.parse_args())
23
+
24
+
25
def add_common_cli(parser: argparse.ArgumentParser):
    """Attach the options shared by every concept-running entry point."""
    add_outdir_and_infiles(parser)
    common = (
        ('--require-regex', dict(default=None,
                                 help='Output text containing this regex but in which no regexes were found.')),
        ('--start-after', dict(default=0, type=int,
                               help='Start after skipping this many records')),
        ('--stop-after', dict(default=None, type=int,
                              help='change to number if you want to limit number of notes searched through; else None.')),
        ('--select-probability', dict(default=1.0, type=float,
                                      help='Set to less than 1.0 to increase note sample (e.g., 0.3); 1.0=don\'t skip anything')),
        ('--window-size', dict(default=50, type=int,
                               help='Change the window for the pre/post contexts')),
    )
    for flag, options in common:
        parser.add_argument(flag, **options)
37
+
38
+
39
def add_outdir_and_infiles(parser: argparse.ArgumentParser = None):
    """Add the output-directory/input-file/column-label arguments shared by all entry points.

    :param parser: existing parser to extend; a new one is created when None
    :return: the parser (for chaining, e.g. `add_outdir_and_infiles().parse_args()`)
    """
    if not parser:
        parser = argparse.ArgumentParser(fromfile_prefix_chars='@!')
    parser.add_argument('--outdir', type=Path, default=Path('.'),
                        help='Directory to place output files.')
    parser.add_argument('--input-files', nargs='+', type=str, default=list(),
                        help='Input CSV or SAS file(s) to read.')
    parser.add_argument('--id-label', default=ID_LABEL,
                        help='Column label for individual id')
    # Bug fix: the three help strings below were copy-pasted from --id-label
    # and all read 'Column label for individual id'.
    parser.add_argument('--noteid-label', default=NOTEID_LABEL,
                        help='Column label for note id')
    parser.add_argument('--notedate-label', default=NOTEDATE_LABEL,
                        help='Column label for note date')
    parser.add_argument('--notetext-label', default=NOTETEXT_LABEL,
                        help='Column label for note text')
    return parser
konsepy/constants.py ADDED
@@ -0,0 +1,4 @@
1
# Default column labels expected in the input CSV/SAS note files.
ID_LABEL = 'studyid'  # individual (patient/study) identifier column
NOTEID_LABEL = 'note_id'  # note identifier column
NOTEDATE_LABEL = 'note_date'  # note date column
NOTETEXT_LABEL = 'note_text'  # note text column
@@ -0,0 +1,86 @@
1
+ import csv
2
+ import datetime
3
+ import re
4
+
5
+ from loguru import logger
6
+
7
+ from konsepy.cli import snippet_cli
8
+ from konsepy.constants import NOTEDATE_LABEL, ID_LABEL, NOTEID_LABEL, NOTETEXT_LABEL
9
+ from konsepy.importer import get_all_concepts
10
+ from konsepy.textio import iterate_csv_file
11
+
12
+
13
def get_text_snippets(input_files, outdir, regexes, *, start_after=0, stop_after=None, window_size=50,
                      id_label=ID_LABEL, noteid_label=NOTEID_LABEL,
                      notedate_label=NOTEDATE_LABEL, notetext_label=NOTETEXT_LABEL,
                      select_probability=1.0, label='snippets', stop_after_regex_count=None):
    """Write a CSV of regex matches with their surrounding text context.

    Output file is `{outdir}/{label}_{timestamp}.csv` with one row per match.

    :param input_files: CSV/SAS paths passed to `iterate_csv_file`
    :param regexes: iterable of (name, regex) pairs; a regex supplied as a str
        is compiled case-insensitively
    :param window_size: characters of pre/post context captured around a match
    :param stop_after_regex_count: stop after writing this many matches
        (None = unlimited)
    """
    dt = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    logger.warning('Snippets will have spaces normalized:'
                   ' multiple spaces/newlines/tabs will be converted'
                   ' to a single space in the output.')
    rx_count = 0  # running match id across all files
    outdir.mkdir(exist_ok=True)
    with open(outdir / f'{label}_{dt}.csv', 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(['id', 'studyid', 'note_id', 'date', 'regex_name', 'precontext', 'term', 'postcontext'])
        for _, studyid, note_id, note_date, text in iterate_csv_file(
                input_files, start_after=start_after, stop_after=stop_after,
                id_label=id_label, noteid_label=noteid_label,
                notetext_label=notetext_label, notedate_label=notedate_label,
                select_probability=select_probability
        ):
            text = ' '.join(text.split())  # remove newlines, etc. (bad for snippets in Excel)
            for name, regex in regexes:
                if isinstance(regex, str):
                    regex = re.compile(regex, re.I)
                for m in regex.finditer(text):
                    precontext = text[max(m.start() - window_size, 0):m.start()]
                    postcontext = text[m.end():m.end() + window_size]
                    writer.writerow([
                        rx_count,  # id
                        studyid,
                        note_id,
                        note_date,
                        name,
                        precontext,
                        m.group(),  # term
                        postcontext,
                    ])
                    rx_count += 1
                    if stop_after_regex_count and rx_count >= stop_after_regex_count:
                        return
52
+
53
+
54
def get_text_snippets_for_concept_algorithm(package, input_files, outdir, *, concept_name=None, start_after=0,
                                            stop_after=None,
                                            window_size=50, regexes=None,
                                            id_label=ID_LABEL, noteid_label=NOTEID_LABEL,
                                            notedate_label=NOTEDATE_LABEL, notetext_label=NOTETEXT_LABEL,
                                            select_probability=1.0, label='snippets', stop_after_regex_count=None,
                                            **kwargs):
    """Collect context snippets for the regexes of the requested concept(s).

    :param package: package containing the `concepts` subpackage
    :param concept_name: restrict to one concept (None = all concepts)
    :param regexes: optional pre-built (name, regex) pairs; when omitted, pairs
        are derived from the concept modules
    Remaining keyword arguments are forwarded to `get_text_snippets`.
    """
    if not regexes:  # previously this parameter was silently ignored
        # Bug fix: get_text_snippets expects (name, regex) pairs, but the
        # original built (regex, category) pairs — putting the compiled
        # pattern in the name column and calling finditer on the category.
        regexes = [(category, regex)
                   for concept in get_all_concepts(package, concept_name)
                   for regex, category in concept.regexes]

    get_text_snippets(input_files, outdir, regexes,
                      start_after=start_after,
                      stop_after=stop_after,
                      window_size=window_size,
                      select_probability=select_probability,
                      id_label=id_label,
                      noteid_label=noteid_label,
                      notedate_label=notedate_label,
                      notetext_label=notetext_label,
                      label=label,
                      stop_after_regex_count=stop_after_regex_count,
                      )
77
+
78
+
79
if __name__ == '__main__':
    kwargs = snippet_cli()
    # Label the output snippet file after the concept, when one was named.
    kwargs['label'] = kwargs.get('concept_name', 'concept')

    if kwargs.get('concept_name', None):
        # NOTE(review): get_text_snippets_for_concept_algorithm requires a
        # positional `package` argument that snippet_cli() does not supply —
        # confirm how this entry point is expected to be invoked.
        get_text_snippets_for_concept_algorithm(**kwargs)
    else:
        get_text_snippets(**kwargs)
konsepy/importer.py ADDED
@@ -0,0 +1,49 @@
1
+ import importlib
2
+ import pkgutil
3
+ from enum import EnumType
4
+ from pathlib import Path
5
+
6
+ from loguru import logger
7
+
8
+
9
+ class ConceptImport:
10
+
11
+ def __init__(self, module_info, package_name):
12
+ self.name = module_info.name
13
+ self.imp = importlib.import_module(f'{package_name}.concepts.{self.name}')
14
+ self.category_enum = self._get_category()
15
+ self.run_func = self.imp.RUN_REGEXES_FUNC
16
+ self.regexes = self.imp.REGEXES
17
+
18
+ def run(self, sentence):
19
+ self.run_func(sentence)
20
+
21
+ @property
22
+ def domain(self):
23
+ return self.name
24
+
25
+ @property
26
+ def categories(self):
27
+ return [category.name for category in self.category_enum]
28
+
29
+ def _get_category(self):
30
+ for name, value in self.imp.__dict__.items():
31
+ if isinstance(value, EnumType):
32
+ return value
33
+ raise ValueError(f'Unable to identify category enum for concept "{self.name}".')
34
+
35
+ def __str__(self):
36
+ return f'ConceptImport<{self.name}>'
37
+
38
+
39
def get_all_concepts(package_name: str, *concepts):
    """Yield a ConceptImport for every module in `{package_name}.concepts`.

    When *concepts* names are supplied, only those modules are loaded.
    Modules whose category enum cannot be identified are logged and skipped.
    """
    concepts_pkg = importlib.import_module(f'{package_name}.concepts')
    pkg_dir = Path(concepts_pkg.__file__).parent
    for module_info in pkgutil.iter_modules([pkg_dir]):
        # only load requested concepts, if any were named
        if concepts and module_info.name not in concepts:
            continue
        try:
            yield ConceptImport(module_info, package_name)
        except ValueError as ve:
            logger.warning(f'Failed to load concept: {package_name}.concepts.{module_info.name}')
            logger.exception(ve)
konsepy/regex.py ADDED
@@ -0,0 +1,118 @@
1
+ import datetime
2
+ import re
3
+ from collections import Counter, defaultdict
4
+
5
+ from konsepy.constants import NOTEDATE_LABEL, ID_LABEL, NOTEID_LABEL, NOTETEXT_LABEL
6
+ from konsepy.textio import iterate_csv_file, output_results
7
+ from loguru import logger
8
+
9
+
10
def run_regex_on_files(input_files, regex_func, *, start_after=0, stop_after=None,
                       require_regex=None, window_size=50,
                       id_label=ID_LABEL, noteid_label=NOTEID_LABEL,
                       notedate_label=NOTEDATE_LABEL, notetext_label=NOTETEXT_LABEL,
                       select_probability=1.0):
    """Run *regex_func* over every note and accumulate category counts.

    :param regex_func: callable yielding category labels for a note's text
    :param require_regex: pattern string; for notes with no category, only
        snippets around this pattern are collected into `not_found_text`
    :return: (cat_counter_notes, cat_counter_mrns, not_found_text,
              mrn_to_cat, noteid_to_cat)
    """
    count = 0  # default value; received from forloop below
    cat_counter_notes = Counter()  # category -> count of occurrences across notes
    cat_counter_mrns = defaultdict(set)  # category -> MRNs with at least one occurrence
    noteid_to_cat = defaultdict(Counter)  # (mrn, note_id) -> category counts
    mrn_to_cat = defaultdict(Counter)  # mrn -> category counts
    unique_mrns = set()  # MRNs with any category found (progress reporting)
    not_found_text = Counter()  # normalized snippet/text -> count where nothing matched
    if require_regex:
        require_regex = re.compile(require_regex, re.I)
    for count, mrn, note_id, note_date, text in iterate_csv_file(
            input_files, start_after=start_after, stop_after=stop_after,
            id_label=id_label, noteid_label=noteid_label,
            notedate_label=notedate_label, notetext_label=notetext_label,
            select_probability=select_probability
    ):
        # periodic progress logging
        if count % 10000 == 0:
            logger.info(
                f'Completed {count} records: {len(unique_mrns)} MRNs contain any category ({datetime.datetime.now()})')
        extract_categories(
            mrn, note_id, text, regex_func,
            cat_counter_mrns=cat_counter_mrns, cat_counter_notes=cat_counter_notes,
            mrn_to_cat=mrn_to_cat, require_regex=require_regex,
            not_found_text=not_found_text, noteid_to_cat=noteid_to_cat,
            unique_mrns=unique_mrns, window_size=window_size
        )
    logger.info(f'Finished. Total records: {count} ({datetime.datetime.now()})')
    return cat_counter_notes, cat_counter_mrns, not_found_text, mrn_to_cat, noteid_to_cat
42
+
43
+
44
def extract_categories(mrn, note_id, text, regex_func, *,
                       cat_counter_mrns=None, cat_counter_notes=None, mrn_to_cat=None,
                       not_found_text=None, noteid_to_cat=None,
                       require_regex=None, unique_mrns=None, window_size=50):
    """Apply *regex_func* to *text*, recording found categories in the accumulators.

    :param regex_func: callable yielding category labels for the text
    :param cat_counter_mrns: category -> set of MRNs (mutated)
    :param cat_counter_notes: Counter of category occurrences (mutated)
    :param mrn_to_cat: mrn -> Counter of categories (mutated)
    :param noteid_to_cat: (mrn, note_id) -> Counter of categories (mutated)
    :param not_found_text: Counter of no-match snippets (mutated); pass None to
        skip collecting them
    :param require_regex: compiled pattern; when text has no categories, only
        snippets around its matches are recorded instead of the whole text
    :param unique_mrns: set of MRNs with any category (mutated)
    :param window_size: characters of context retained around a required-regex match
    """
    categories = list(regex_func(text))
    for category in categories:
        mrn_to_cat[mrn][category] += 1
        noteid_to_cat[(mrn, note_id)][category] += 1
        cat_counter_notes[category] += 1
        cat_counter_mrns[category].add(mrn)
    if categories:
        unique_mrns.add(mrn)
    elif not_found_text is not None:
        # Bug fix: the original tested `not_found_text` for truthiness, so an
        # empty Counter (its normal starting state) was skipped and no
        # "not found" snippets were ever recorded.
        if require_regex:
            for m in require_regex.finditer(text):
                start_snippet = max(0, m.start() - window_size)
                end_snippet = m.end() + window_size
                # NOTE(review): the +1 widens the right context by one char
                # beyond window_size — confirm intended.
                snippet = text[start_snippet:end_snippet + 1]
                not_found_text[' '.join(snippet.split())] += 1
        else:
            not_found_text[' '.join(text.split())] += 1
65
+
66
+
67
def run_regex_and_output(name, input_files, outdir, regex_func, *category_enums,
                         start_after=0, stop_after=None, require_regex=None, window_size=50,
                         id_label=ID_LABEL, noteid_label=NOTEID_LABEL,
                         notedate_label=NOTEDATE_LABEL, notetext_label=NOTETEXT_LABEL,
                         select_probability=1.0):
    """Run *regex_func* over the input files and write the summary CSVs.

    Creates `outdir / {name}_{timestamp}` (fails if it already exists), adds a
    log file there, then delegates to `run_regex_on_files` and `output_results`.

    :param name: label used for the output directory and log file
    :param category_enums: category enums whose members define output columns
    """
    dt = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    curr_outdir = outdir / f'{name}_{dt}'
    curr_outdir.mkdir(parents=True)
    logger.add(curr_outdir / f'{name}_{dt}.log')
    note_counter, cat_counter_mrns, not_found_text, mrn_to_cat, note_to_cat = run_regex_on_files(
        input_files, regex_func, start_after=start_after, stop_after=stop_after, require_regex=require_regex,
        window_size=window_size,
        id_label=id_label, noteid_label=noteid_label,
        notedate_label=notedate_label, notetext_label=notetext_label,
        select_probability=select_probability
    )
    output_results(curr_outdir, not_found_text=not_found_text, note_counter=note_counter,
                   cat_counter_mrns=cat_counter_mrns, category_enums=category_enums,
                   note_to_cat=note_to_cat, mrn_to_cat=mrn_to_cat)
86
+
87
+
88
def search_first_regex(regexes):
    """For each regex, only return first instance (use search)"""

    def _search(text):
        for pattern, category in regexes:
            if pattern.search(text) is not None:
                yield category

    return _search
97
+
98
+
99
def search_all_regex(regexes):
    """For each regex, return all (use finditer)"""

    def _search(text):
        for pattern, category in regexes:
            # one category emission per non-overlapping match
            yield from (category for _ in pattern.finditer(text))

    return _search
108
+
109
+
110
def get_all_regex_by_index(regexes):
    """For each regex, return all results, including indices"""

    def _search(text):
        for pattern, category in regexes:
            for match in pattern.finditer(text):
                yield category.name, match.group(), match.start(), match.end()

    return _search
konsepy/run_all.py ADDED
@@ -0,0 +1,57 @@
1
+ import datetime
2
+ import pathlib
3
+ from collections import Counter, defaultdict
4
+
5
+ from loguru import logger
6
+
7
+ from konsepy.cli import add_outdir_and_infiles
8
+ from konsepy.constants import NOTEDATE_LABEL, ID_LABEL, NOTEID_LABEL, NOTETEXT_LABEL
9
+ from konsepy.importer import get_all_concepts
10
+ from konsepy.regex import extract_categories
11
+ from konsepy.textio import iterate_csv_file, output_results
12
+
13
+
14
def run_all(input_files, outdir: pathlib.Path, package_name: str, *,
            id_label=ID_LABEL, noteid_label=NOTEID_LABEL,
            notedate_label=NOTEDATE_LABEL, notetext_label=NOTETEXT_LABEL,
            ):
    """Run every concept in *package_name* across the input files and write summary CSVs.

    Creates `outdir / run_all_{timestamp}` (fails if it already exists) and a
    log file there; results are written by `output_results`.
    """
    dt = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    curr_outdir = outdir / f'run_all_{dt}'
    curr_outdir.mkdir(parents=True)
    logger.add(curr_outdir / f'run_all_{dt}.log')
    count = 0  # default value; received from forloop below
    cat_counter_notes = Counter()  # category -> occurrence count
    cat_counter_mrns = defaultdict(set)  # category -> MRNs with a match
    noteid_to_cat = defaultdict(Counter)  # (mrn, note_id) -> category counts
    mrn_to_cat = defaultdict(Counter)  # mrn -> category counts
    unique_mrns = set()  # MRNs with any category (progress reporting)
    concepts = list(get_all_concepts(package_name))
    logger.info(f'Loaded {len(concepts)} concepts for processing.')
    for count, studyid, note_id, note_date, text in iterate_csv_file(
            input_files,
            id_label=id_label, noteid_label=noteid_label,
            notedate_label=notedate_label, notetext_label=notetext_label,
    ):
        if count % 10000 == 0:
            logger.info(f'Completed {count} records for {len(unique_mrns)} MRNs ({datetime.datetime.now()})')

        # every concept's regexes are applied to every note
        for concept in concepts:
            extract_categories(
                studyid, note_id, text, concept.run_func,
                cat_counter_mrns=cat_counter_mrns, cat_counter_notes=cat_counter_notes,
                mrn_to_cat=mrn_to_cat, noteid_to_cat=noteid_to_cat,
                unique_mrns=unique_mrns
            )
    logger.info(f'Finished. Total records: {count} ({datetime.datetime.now()})')
    output_results(curr_outdir, note_counter=cat_counter_notes,
                   cat_counter_mrns=cat_counter_mrns,
                   category_enums=[c.category_enum for c in concepts],
                   note_to_cat=noteid_to_cat, mrn_to_cat=mrn_to_cat)
50
+
51
+
52
if __name__ == '__main__':
    import argparse

    # NOTE(review): run_all requires a positional `package_name`, but
    # add_outdir_and_infiles does not define it — invoking this script as-is
    # would raise TypeError; confirm intended usage.
    parser = argparse.ArgumentParser(fromfile_prefix_chars='@!')
    add_outdir_and_infiles(parser)
    run_all(**vars(parser.parse_args()))
konsepy/rxutils.py ADDED
@@ -0,0 +1,30 @@
1
class RxType(type):
    """Metaclass that turns attribute access into regex fragments.

    An attribute name is a single mapped character optionally followed by a
    repetition spec: ``w`` -> ``\\w``, ``w3`` -> ``\\w{0,3}``,
    ``w2_5`` -> ``\\w{2,5}``.
    """

    MAPPING = {
        'p': '.',
        'w': r'\w',
        'W': r'\W',
        'S': r'\S',
        's': r'\s',
        'o': r'(?:\w+\W*)',  # word
    }

    def __getattr__(cls, item):
        return cls._parse_item(item)

    def _parse_item(cls, element):
        # first char selects the fragment; the rest is a repetition spec
        fragment = cls.MAPPING[element[0]]
        suffix = element[1:]
        if not suffix:
            return fragment
        return fragment + cls._parse_numbers(*suffix.split('_'))

    def _parse_numbers(cls, *nums):
        # a single number means "up to n"; two numbers give an explicit range
        if len(nums) == 1:
            lower, upper = 0, nums[0]
        else:
            lower, upper = nums
        return rf'{{{lower},{upper}}}'
27
+
28
+
29
class Rx(metaclass=RxType):
    r"""Attribute-access regex builder, e.g. ``Rx.w3`` -> ``\w{0,3}`` (see RxType)."""
    pass
konsepy/textio.py ADDED
@@ -0,0 +1,87 @@
1
+ """
2
+ Simplify reading input files by creating an iterating wrapper.
3
+ """
4
+ import csv
5
+ import random
6
+
7
+ from loguru import logger
8
+ from sas7bdat import SAS7BDAT
9
+
10
+ from konsepy.constants import NOTEDATE_LABEL, ID_LABEL, NOTEID_LABEL, NOTETEXT_LABEL
11
+
12
+
13
def iterate_csv_file(input_files, *, start_after=0, stop_after=None,
                     id_label=ID_LABEL, noteid_label=NOTEID_LABEL,
                     notedate_label=NOTEDATE_LABEL, notetext_label=NOTETEXT_LABEL,
                     select_probability=1.0, encoding='latin1'):
    """Yield (count, mrn, note_id, date, text) for each row across the input files.

    :param input_files: paths; names ending in 'sas7bdat' are read as SAS,
        everything else as CSV
    :param start_after: skip this many (sampled) records before yielding
    :param stop_after: yield at most this many records (None/0 = unlimited)
    :param select_probability: keep each record with this probability
        (random subsampling; 1.0 keeps everything)
    :param encoding: file encoding for both CSV and SAS readers
    """
    count = 0  # records yielded so far (1-based in the yielded tuple)
    total_count = 0  # records seen after sampling (drives start_after)
    for input_file in input_files:
        func = _extract_sas_file if input_file.endswith('sas7bdat') else _extract_csv_file
        for mrn, text, note_id, date in func(input_file, encoding, id_label, noteid_label, notedate_label,
                                             notetext_label):
            if random.random() > select_probability:
                continue
            total_count += 1
            if start_after >= total_count:
                continue
            count += 1
            yield count, mrn, note_id, date, text
            # Bug fix: use >= so exactly `stop_after` records are yielded;
            # the original `count > stop_after` emitted one extra record.
            if stop_after and count >= stop_after:
                return
33
+
34
+
35
def _extract_sas_file(input_file, encoding, id_label, noteid_label, notedate_label, notetext_label):
    """Yield (mrn, text, note_id, date) tuples from a SAS7BDAT file.

    The first row is treated as the header; subsequent rows are indexed by the
    supplied column labels.
    """
    with SAS7BDAT(input_file, skip_header=False, encoding=encoding) as fh:
        header = []
        for row in fh:
            if not header:
                header = row  # first row carries the column names
                continue
            mrn = row[header.index(id_label)]
            date = row[header.index(notedate_label)] if notedate_label else ''
            # Bug fix: the original read `text` from the noteid column and
            # `noteid` from the notetext column (labels were swapped).
            text = row[header.index(notetext_label)]
            noteid = row[header.index(noteid_label)]
            yield mrn, text, noteid, date
47
+
48
+
49
+ def _extract_csv_file(input_file, encoding, id_label, noteid_label, notedate_label, notetext_label):
50
+ with open(input_file, encoding=encoding) as fh:
51
+ for row in csv.DictReader(fh):
52
+ text = row[notetext_label]
53
+ mrn = row[id_label]
54
+ date = row.get(notedate_label, '')
55
+ note_id = row[noteid_label]
56
+ yield mrn, text, note_id, date
57
+
58
+
59
def output_results(outdir, *, not_found_text=None,
                   note_counter=None, cat_counter_mrns=None,
                   category_enums=None, note_to_cat=None, mrn_to_cat=None):
    """Write the summary CSVs (snippets, category counts, per-MRN and per-note counts).

    :param outdir: existing directory to write into
    :param not_found_text: Counter of snippet -> count; section skipped when None
    :param note_counter: Counter of category -> note-level count
    :param cat_counter_mrns: category -> set of MRNs
    :param category_enums: iterable of category enums; their members define the
        columns of the per-MRN/per-note outputs
    :param note_to_cat: (mrn, note_id) -> Counter of categories
    :param mrn_to_cat: mrn -> Counter of categories
    """
    # flatten all enum members into a single ordered category list
    categories = [e for category_enum in category_enums for e in category_enum]
    if not_found_text is not None:
        with open(outdir / 'snippets.csv', 'w', newline='') as out:
            writer = csv.writer(out)
            writer.writerow(['count', 'snippet'])
            for snippet, count in not_found_text.most_common():
                writer.writerow([count, ' '.join(snippet.split())])

    with open(outdir / 'category_counts.csv', 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(['category', 'note_count', 'mrn_count'])
        for cat in categories:
            writer.writerow([cat, note_counter[cat], len(cat_counter_mrns[cat])])

    with open(outdir / 'mrn_category_counts.csv', 'w', newline='') as out:
        writer = csv.DictWriter(out, ['mrn'] + categories)
        writer.writeheader()
        # NOTE(review): this loop rebinds `note_counter`, shadowing the
        # parameter of the same name; harmless because the parameter is not
        # used afterwards, but worth renaming.
        for mrn, note_counter in mrn_to_cat.items():
            writer.writerow({'mrn': mrn} | dict(note_counter))

    with open(outdir / 'notes_category_counts.csv', 'w', newline='') as out:
        writer = csv.DictWriter(out, ['mrn', 'note_id'] + categories)
        writer.writeheader()
        for (mrn, note), note_counter in note_to_cat.items():
            writer.writerow({'mrn': mrn, 'note_id': note} | dict(note_counter))
    logger.info(f'Unique MRNs: {len(mrn_to_cat)}')
konsepy/types.py ADDED
@@ -0,0 +1,9 @@
1
"""
Types to simplify type hinting descriptions
"""
from enum import Enum
from typing import Pattern

# A compiled regex paired with the category (enum member) it detects.
RegexPattern = tuple[Pattern, Enum]
# All (pattern, category) pairs belonging to one concept.
RegexList = list[RegexPattern]
# Concept/domain name -> that concept's regex list.
RegexDict = dict[str, RegexList]
@@ -0,0 +1,34 @@
1
+ Metadata-Version: 2.1
2
+ Name: konsepy
3
+ Version: 0.0.1
4
+ Summary: Framework for building NLP information extraction systems using regular expressions.
5
+ Keywords: nlp
6
+ Author-email: dcronkite <dcronkite+pypi@gmail.com>
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ Classifier: Development Status :: 5 - Production/Stable
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: Programming Language :: Python :: 3 :: Only
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Topic :: Text Processing :: Linguistic
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Intended Audience :: Healthcare Industry
17
+ Requires-Dist: loguru
18
+ Requires-Dist: pytest
19
+ Requires-Dist: sas7bdat
20
+ Requires-Dist: spacy ; extra == "ssplit"
21
+ Project-URL: Home, https://github.com/kpwhri/konsepy
22
+ Provides-Extra: ssplit
23
+
24
+
25
+ # konsepy
26
+
27
+ Framework for building NLP information extraction systems using regular expressions.
28
+
29
+ ## Usage
30
+
31
+ For now, find documentation for this library (and a template to download) from https://github.com/kpwhri/konsepy.
32
+
33
+ * Download the template
34
+
@@ -0,0 +1,14 @@
1
+ konsepy/__init__.py,sha256=VfLugWKDQ_0IMhBVncNB6hmBljR4-v3bwKh0N5PSfKw,112
2
+ konsepy/bio_tag.py,sha256=sv1aIgO26Sw0bShUDVVDgWkCRF9nP485ChTCs5qeROY,4059
3
+ konsepy/cli.py,sha256=3avT_aMFapWFWu-33nNv7SPqulWEOk7SrsNogRQd9SY,2817
4
+ konsepy/constants.py,sha256=1zPrlq1fHHOLDtE1cQNS0C9BEiWdqJbZDI_Bb3n0wrw,108
5
+ konsepy/get_text_snippets.py,sha256=aAO-gdsm4X6I0jv1eALd3DViy_q9XqOBcK7j72t0_Qc,4111
6
+ konsepy/importer.py,sha256=B3gIaSsIprrM9mWsbqPHFQC_vtg6yQUKJow2cEiMhKI,1613
7
+ konsepy/regex.py,sha256=Ik78Gf3pV_hNH_XmhggyzMGwIx1pwfWkaqgonWn1Smo,5204
8
+ konsepy/run_all.py,sha256=jOemBSQPiHNF2o0zbUYciX_Y6EwiTikyKA5Z9pe0YA4,2447
9
+ konsepy/rxutils.py,sha256=NLOHL_VPcPc57Ve5X-RMCcNSgPtsGXPbSzYRMFVhg6M,684
10
+ konsepy/textio.py,sha256=G5DyWLlgkDbyaXvG_uu0ahuX_SZee7giosmX1vChT2M,3825
11
+ konsepy/types.py,sha256=xquwcA0PbW2biyL_5i8W7747Bl1pv9Kh7hLzIuxRjNc,211
12
+ konsepy-0.0.1.dist-info/WHEEL,sha256=rSgq_JpHF9fHR1lx53qwg_1-2LypZE_qmcuXbVUq948,81
13
+ konsepy-0.0.1.dist-info/METADATA,sha256=cqowIiSjninoCe3ZcK2HUCfPm52lDF-cYharqKhKbgU,1121
14
+ konsepy-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: flit 3.8.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any