pat2vec 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pat2vec-0.1.1/MANIFEST.in +3 -0
- pat2vec-0.1.1/PKG-INFO +391 -0
- pat2vec-0.1.1/README.md +323 -0
- pat2vec-0.1.1/pat2vec/__init__.py +463 -0
- pat2vec-0.1.1/pat2vec/main_pat2vec.py +720 -0
- pat2vec-0.1.1/pat2vec/pat2vec_get_methods/__init__.py +4 -0
- pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_appointments.py +220 -0
- pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_bed.py +177 -0
- pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_bloods.py +388 -0
- pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_bmi.py +281 -0
- pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_core02.py +250 -0
- pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_core_resus.py +239 -0
- pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_current_pat_annotations_mrc_cs.py +126 -0
- pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_demo.py +196 -0
- pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_demographics.py +389 -0
- pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_diagnostics.py +354 -0
- pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_drugs.py +330 -0
- pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_hosp_site.py +169 -0
- pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_news.py +152 -0
- pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_pat_annotations.py +128 -0
- pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_report_annotations.py +116 -0
- pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_smoking.py +224 -0
- pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_textual_obs_annotations.py +141 -0
- pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_vte_status.py +229 -0
- pat2vec-0.1.1/pat2vec/pat2vec_main_methods/__init__.py +0 -0
- pat2vec-0.1.1/pat2vec/pat2vec_main_methods/main_batch.py +341 -0
- pat2vec-0.1.1/pat2vec/pat2vec_pat_list/__init__.py +0 -0
- pat2vec-0.1.1/pat2vec/pat2vec_pat_list/get_patient_treatment_list.py +437 -0
- pat2vec-0.1.1/pat2vec/pat2vec_search/cogstack_search_methods.py +1356 -0
- pat2vec-0.1.1/pat2vec/pat2vec_search/data_helper_functions.py +105 -0
- pat2vec-0.1.1/pat2vec/pat2vec_search/matcher.py +94 -0
- pat2vec-0.1.1/pat2vec/pat2vec_search/nearest.py +50 -0
- pat2vec-0.1.1/pat2vec/pat2vec_search/search_helper_functions.py +160 -0
- pat2vec-0.1.1/pat2vec/pat2vec_search/search_multiprocess.py +100 -0
- pat2vec-0.1.1/pat2vec/patvec_get_batch_methods/__init__.py +0 -0
- pat2vec-0.1.1/pat2vec/patvec_get_batch_methods/get_merged_batches.py +1456 -0
- pat2vec-0.1.1/pat2vec/patvec_get_batch_methods/get_prefetch_batches.py +225 -0
- pat2vec-0.1.1/pat2vec/patvec_get_batch_methods/main.py +1370 -0
- pat2vec-0.1.1/pat2vec/tests/config_pat2vec.py +0 -0
- pat2vec-0.1.1/pat2vec/tests/test_calculate_interval.py +146 -0
- pat2vec-0.1.1/pat2vec/tests/test_config_class.py +221 -0
- pat2vec-0.1.1/pat2vec/tests/test_filter_dataframe_by_timestamp_extended.py +164 -0
- pat2vec-0.1.1/pat2vec/tests/test_generate_date_list.py +223 -0
- pat2vec-0.1.1/pat2vec/tests/test_get_dummy_data_cohort_searcher_get_date.py +95 -0
- pat2vec-0.1.1/pat2vec/tests/test_get_method_bloods.py +0 -0
- pat2vec-0.1.1/pat2vec/tests/test_get_start_end_year_month.py +173 -0
- pat2vec-0.1.1/pat2vec/tests/test_global_date_validation.py +126 -0
- pat2vec-0.1.1/pat2vec/tests/test_individual_patient_window.py +297 -0
- pat2vec-0.1.1/pat2vec/tests/test_methods_annotation_filter_annot_dataframe.py +139 -0
- pat2vec-0.1.1/pat2vec/tests/test_methods_annotation_multi_annots_to_df.py +259 -0
- pat2vec-0.1.1/pat2vec/tests/test_methods_get.py +229 -0
- pat2vec-0.1.1/pat2vec/tests/test_parse_date.py +139 -0
- pat2vec-0.1.1/pat2vec/tests/test_post_processing_build_ipw_dataframe.py +318 -0
- pat2vec-0.1.1/pat2vec/tests/test_post_processing_get_pat_ipw_record.py +484 -0
- pat2vec-0.1.1/pat2vec/tests/test_post_processing_process_csv_files.py +169 -0
- pat2vec-0.1.1/pat2vec/util/__init__.py +0 -0
- pat2vec-0.1.1/pat2vec/util/anonymisation_data_methods.py +243 -0
- pat2vec-0.1.1/pat2vec/util/anonymisation_deid_documents.py +616 -0
- pat2vec-0.1.1/pat2vec/util/calculate_interval.py +50 -0
- pat2vec-0.1.1/pat2vec/util/clinical_note_splitter.py +327 -0
- pat2vec-0.1.1/pat2vec/util/compile_requirements.py +97 -0
- pat2vec-0.1.1/pat2vec/util/config_pat2vec.py +1216 -0
- pat2vec-0.1.1/pat2vec/util/credentials.py +34 -0
- pat2vec-0.1.1/pat2vec/util/current_pat_batch_path_methods.py +84 -0
- pat2vec-0.1.1/pat2vec/util/dummy_data_files/__init__.py +0 -0
- pat2vec-0.1.1/pat2vec/util/dummy_data_files/dummy_lists.py +1899 -0
- pat2vec-0.1.1/pat2vec/util/elasticsearch_methods.py +284 -0
- pat2vec-0.1.1/pat2vec/util/ethnicity_abstractor.py +730 -0
- pat2vec-0.1.1/pat2vec/util/evaluation_methods.py +232 -0
- pat2vec-0.1.1/pat2vec/util/evaluation_methods_ploting.py +116 -0
- pat2vec-0.1.1/pat2vec/util/filter_dataframe_by_timestamp.py +86 -0
- pat2vec-0.1.1/pat2vec/util/filter_methods.py +213 -0
- pat2vec-0.1.1/pat2vec/util/generate_date_list.py +145 -0
- pat2vec-0.1.1/pat2vec/util/get_best_gpu.py +31 -0
- pat2vec-0.1.1/pat2vec/util/get_dummy_data_cohort_searcher.py +1864 -0
- pat2vec-0.1.1/pat2vec/util/get_dummy_data_medcat_annotation.py +104 -0
- pat2vec-0.1.1/pat2vec/util/get_start_end_year_month.py +51 -0
- pat2vec-0.1.1/pat2vec/util/helper_functions.py +70 -0
- pat2vec-0.1.1/pat2vec/util/impute_data_for_pipe.py +148 -0
- pat2vec-0.1.1/pat2vec/util/logger_setup.py +122 -0
- pat2vec-0.1.1/pat2vec/util/medcat_misc_methods.py +668 -0
- pat2vec-0.1.1/pat2vec/util/methods_annotation.py +527 -0
- pat2vec-0.1.1/pat2vec/util/methods_annotation_filter_annot_dataframe.py +71 -0
- pat2vec-0.1.1/pat2vec/util/methods_annotation_get_pat_document_annotation_batch.py +243 -0
- pat2vec-0.1.1/pat2vec/util/methods_annotation_json_to_dataframe.py +240 -0
- pat2vec-0.1.1/pat2vec/util/methods_annotation_multi_annots_to_df.py +164 -0
- pat2vec-0.1.1/pat2vec/util/methods_annotation_regex.py +39 -0
- pat2vec-0.1.1/pat2vec/util/methods_get.py +995 -0
- pat2vec-0.1.1/pat2vec/util/methods_get_medcat.py +121 -0
- pat2vec-0.1.1/pat2vec/util/methods_post_get.py +190 -0
- pat2vec-0.1.1/pat2vec/util/parse_date.py +74 -0
- pat2vec-0.1.1/pat2vec/util/post_processing.py +1485 -0
- pat2vec-0.1.1/pat2vec/util/post_processing_build_ipw_dataframe.py +80 -0
- pat2vec-0.1.1/pat2vec/util/post_processing_build_methods.py +683 -0
- pat2vec-0.1.1/pat2vec/util/post_processing_get_pat_ipw_record.py +379 -0
- pat2vec-0.1.1/pat2vec/util/post_processing_medcat.py +230 -0
- pat2vec-0.1.1/pat2vec/util/post_processing_process_csv_files.py +293 -0
- pat2vec-0.1.1/pat2vec/util/pre_get_drug_treatment_docs.py +465 -0
- pat2vec-0.1.1/pat2vec/util/pre_processing.py +457 -0
- pat2vec-0.1.1/pat2vec/util/presentation_methods.py +106 -0
- pat2vec-0.1.1/pat2vec/util/testing_helpers.py +29 -0
- pat2vec-0.1.1/pat2vec.egg-info/PKG-INFO +391 -0
- pat2vec-0.1.1/pat2vec.egg-info/SOURCES.txt +108 -0
- pat2vec-0.1.1/pat2vec.egg-info/dependency_links.txt +1 -0
- pat2vec-0.1.1/pat2vec.egg-info/requires.txt +51 -0
- pat2vec-0.1.1/pat2vec.egg-info/top_level.txt +2 -0
- pat2vec-0.1.1/pat2vec_env/bin/vba_extract.py +79 -0
- pat2vec-0.1.1/pyproject.toml +87 -0
- pat2vec-0.1.1/setup.cfg +4 -0
- pat2vec-0.1.1/setup.py +24 -0
pat2vec-0.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,391 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pat2vec
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: A comprehensive Python package for healthcare data engineering, designed to extract, transform, and feature engineer patient data from CogStack-based electronic health record (EHR) datalakes. It provides tools for cohort building, batch data processing, clinical note analysis, and creating machine learning-ready datasets.
|
|
5
|
+
Home-page: https://github.com/SamoraHunter/pat2vec.git
|
|
6
|
+
Author: Samora Hunter
|
|
7
|
+
Author-email: Samora Hunter <samorahunter@gmail.com>
|
|
8
|
+
Project-URL: Homepage, https://github.com/SamoraHunter/pat2vec
|
|
9
|
+
Project-URL: Documentation, https://samorahunter.github.io/pat2vec/
|
|
10
|
+
Project-URL: Repository, https://github.com/SamoraHunter/pat2vec
|
|
11
|
+
Project-URL: Changelog, https://github.com/SamoraHunter/pat2vec/releases
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Requires-Python: >=3.10
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
Requires-Dist: medcat==1.16.0
|
|
18
|
+
Requires-Dist: paramiko
|
|
19
|
+
Requires-Dist: colorama
|
|
20
|
+
Requires-Dist: elasticsearch==8.17.0
|
|
21
|
+
Requires-Dist: eland==8.17.0
|
|
22
|
+
Requires-Dist: faker
|
|
23
|
+
Requires-Dist: fuzzywuzzy
|
|
24
|
+
Requires-Dist: seaborn
|
|
25
|
+
Requires-Dist: rapidfuzz
|
|
26
|
+
Requires-Dist: python-pptx
|
|
27
|
+
Requires-Dist: ipykernel
|
|
28
|
+
Requires-Dist: transformers==4.48.1
|
|
29
|
+
Requires-Dist: accelerate==1.3.0
|
|
30
|
+
Requires-Dist: peft==0.8.2
|
|
31
|
+
Requires-Dist: huggingface-hub==0.27.1
|
|
32
|
+
Requires-Dist: polars
|
|
33
|
+
Requires-Dist: pandas>=1.5.3
|
|
34
|
+
Requires-Dist: numpy>=1.25.2
|
|
35
|
+
Provides-Extra: all
|
|
36
|
+
Requires-Dist: torch>=2.5.1; extra == "all"
|
|
37
|
+
Requires-Dist: scikit-learn>=1.6.1; extra == "all"
|
|
38
|
+
Requires-Dist: jupyterlab; extra == "all"
|
|
39
|
+
Requires-Dist: lifelines==0.28.0; extra == "all"
|
|
40
|
+
Requires-Dist: bokeh==3.6.2; extra == "all"
|
|
41
|
+
Requires-Dist: dask; extra == "all"
|
|
42
|
+
Requires-Dist: datasets; extra == "all"
|
|
43
|
+
Requires-Dist: numba; extra == "all"
|
|
44
|
+
Requires-Dist: statsmodels; extra == "all"
|
|
45
|
+
Requires-Dist: wordcloud; extra == "all"
|
|
46
|
+
Requires-Dist: matplotlib-venn; extra == "all"
|
|
47
|
+
Requires-Dist: nltk; extra == "all"
|
|
48
|
+
Requires-Dist: sqlalchemy; extra == "all"
|
|
49
|
+
Requires-Dist: openpyxl; extra == "all"
|
|
50
|
+
Requires-Dist: pydot; extra == "all"
|
|
51
|
+
Requires-Dist: pyodbc; extra == "all"
|
|
52
|
+
Requires-Dist: python-tds; extra == "all"
|
|
53
|
+
Requires-Dist: umls-api; extra == "all"
|
|
54
|
+
Provides-Extra: dev
|
|
55
|
+
Requires-Dist: pytest; extra == "dev"
|
|
56
|
+
Requires-Dist: nbformat; extra == "dev"
|
|
57
|
+
Requires-Dist: nbconvert; extra == "dev"
|
|
58
|
+
Requires-Dist: nbstripout; extra == "dev"
|
|
59
|
+
Requires-Dist: nbmake; extra == "dev"
|
|
60
|
+
Requires-Dist: pre-commit; extra == "dev"
|
|
61
|
+
Requires-Dist: sphinx==8.1.3; extra == "dev"
|
|
62
|
+
Requires-Dist: sphinx-rtd-theme==3.0.2; extra == "dev"
|
|
63
|
+
Requires-Dist: myst-parser==4.0.1; extra == "dev"
|
|
64
|
+
Requires-Dist: sphinx-autodoc-typehints==3.0.1; extra == "dev"
|
|
65
|
+
Requires-Dist: sphinxcontrib-mermaid; extra == "dev"
|
|
66
|
+
Dynamic: author
|
|
67
|
+
Dynamic: home-page
|
|
68
|
+
|
|
69
|
+
[Documentation](https://samorahunter.github.io/pat2vec/)
|
|
70
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
71
|
+

|
|
72
|
+
|
|
73
|
+
## Table of Contents
|
|
74
|
+
- [Overview](#overview)
|
|
75
|
+
- [Documentation](#documentation)
|
|
76
|
+
- [Example Use Cases](#example-use-cases)
|
|
77
|
+
- [1. Patient-Level Aggregation](#1-patient-level-aggregation)
|
|
78
|
+
- [2. Longitudinal Time Series Construction](#2-longitudinal-time-series-construction)
|
|
79
|
+
- [Requirements](#requirements)
|
|
80
|
+
- [Features](#features)
|
|
81
|
+
- [📊 Diagrams](#-diagrams)
|
|
82
|
+
- [System Architecture & Configuration](#system-architecture--configuration)
|
|
83
|
+
- [Data Pipelines](#data-pipelines)
|
|
84
|
+
- [Methods & Post-Processing](#methods--post-processing)
|
|
85
|
+
- [Feature Extraction](#feature-extraction)
|
|
86
|
+
- [Installation](#installation)
|
|
87
|
+
- [Windows](#windows)
|
|
88
|
+
- [Unix/Linux](#unixlinux)
|
|
89
|
+
- [Usage](#usage)
|
|
90
|
+
- [FAQ](#faq)
|
|
91
|
+
- [Citation](#citation)
|
|
92
|
+
- [Contributing](#contributing)
|
|
93
|
+
- [Code of Conduct](#code-of-conduct)
|
|
94
|
+
- [License](#license)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# Overview
|
|
98
|
+
|
|
99
|
+
This tool converts individual patient records into structured time-interval feature vectors, making them suitable for filtering, aggregation, and assembly into a data matrix **D** for binary classification machine learning tasks.
|
|
100
|
+
|
|
101
|
+
## Documentation
|
|
102
|
+
|
|
103
|
+
The full API documentation for `pat2vec` is automatically generated and hosted on GitHub Pages.
|
|
104
|
+
|
|
105
|
+
**[View the Live Documentation](https://samorahunter.github.io/pat2vec/)**
|
|
106
|
+
|
|
107
|
+
## Example Use Cases
|
|
108
|
+
|
|
109
|
+
### 1. Patient-Level Aggregation
|
|
110
|
+
Compute summary statistics (e.g., the mean of *n* variables) for each unique patient, resulting in one row per patient. This is ideal for models requiring a single representation per individual.
|
|
111
|
+
|
|
112
|
+
### 2. Longitudinal Time Series Construction
|
|
113
|
+
Generate a monthly time series for each patient that includes:
|
|
114
|
+
|
|
115
|
+
- Biochemistry results
|
|
116
|
+
- Demographic attributes
|
|
117
|
+
- MedCAT-derived clinical text annotations
|
|
118
|
+
|
|
119
|
+
The time series spans up to 25 years retrospectively, aligned to each patient's diagnosis date, enabling a consistent retrospective view across varying start times.
|
|
120
|
+
|
|
121
|
+
## Requirements
|
|
122
|
+
|
|
123
|
+
**Core Services:**
|
|
124
|
+
- **CogStack**: An operational instance for data retrieval. The required client libraries are now bundled with this project.
|
|
125
|
+
- **Elasticsearch**: The backend for CogStack.
|
|
126
|
+
- **MedCAT**: For medical concept annotation.
|
|
127
|
+
|
|
128
|
+
**Local Setup:**
|
|
129
|
+
- **Python**: Version 3.10 or higher.
|
|
130
|
+
- **Virtual Environment**: Requires the `python3-venv` package (or equivalent for your OS).
|
|
131
|
+
- For all other Python packages, see `requirements.txt`.
|
|
132
|
+
|
|
133
|
+
## Features
|
|
134
|
+
|
|
135
|
+
`pat2vec` offers a flexible suite of tools for processing and analyzing patient data.
|
|
136
|
+
|
|
137
|
+
**Patient Processing**
|
|
138
|
+
- **Single & Batch Processing**: Process individual patients for detailed analysis or run large batches for cohort-level studies.
|
|
139
|
+
|
|
140
|
+
**Cohort Management**
|
|
141
|
+
- **Cohort Search & Creation**: Define and build patient cohorts using flexible search criteria.
|
|
142
|
+
- **Automated Control Matching**: Automatically generate random control groups for case-control studies.
|
|
143
|
+
|
|
144
|
+
**Flexible Feature Engineering**
|
|
145
|
+
- **Modular Feature Selection**: Choose from a wide range of feature extractors to build a custom feature space tailored to your research question.
|
|
146
|
+
- **Temporal Windowing**: Define precise time windows for data extraction relative to a key event (e.g., diagnosis date), including look-back and look-forward periods.
|
|
147
|
+
|
|
148
|
+
## 📊 Diagrams
|
|
149
|
+
|
|
150
|
+
<details>
|
|
151
|
+
<summary>Click to view project diagrams</summary>
|
|
152
|
+
|
|
153
|
+
This project includes a collection of diagrams illustrating the system architecture, data pipelines, and feature extraction workflows. You can view the Mermaid definitions or the rendered diagrams below.
|
|
154
|
+
|
|
155
|
+
#### 📂 System Architecture & Configuration
|
|
156
|
+
| Diagram | Mermaid | Image |
|
|
157
|
+
|---|---|---|
|
|
158
|
+
| **System Architecture** | [assets/system_architecture.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/system_architecture.mmd) |  |
|
|
159
|
+
| **Configuration** | [assets/config.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/config.mmd) |  |
|
|
160
|
+
|
|
161
|
+
#### 🛠️ Data Pipelines
|
|
162
|
+
| Diagram | Mermaid | Image |
|
|
163
|
+
|---|---|---|
|
|
164
|
+
| **Data Pipeline** | [assets/data_pipeline.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/data_pipeline.mmd) |  |
|
|
165
|
+
| **Main Batch Processing** | [assets/main_batch.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/main_batch.mmd) |  |
|
|
166
|
+
| **Example Ingestion** | [assets/example_ingestion.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/example_ingestion.mmd) | <img src="https://github.com/SamoraHunter/pat2vec/blob/main/assets/example_ingestion.png?raw=true" alt="Example Ingestion" height="400"/> |
|
|
167
|
+
|
|
168
|
+
#### 🧩 Methods & Post-Processing
|
|
169
|
+
| Diagram | Mermaid | Image |
|
|
170
|
+
|---|---|---|
|
|
171
|
+
| **Methods Annotation** | [assets/methods_annotation.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/methods_annotation.mmd) |  |
|
|
172
|
+
| **Post-Processing Build Methods** | [assets/post_processing_build_methods.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/post_processing_build_methods.mmd) |  |
|
|
173
|
+
| **Post-Processing Anonymisation** | [assets/post_processing_anonymisation_high_level.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/post_processing_anonymisation_high_level.mmd) |  |
|
|
174
|
+
|
|
175
|
+
#### 🔍 Feature Extraction
|
|
176
|
+
| Diagram | Mermaid | Image |
|
|
177
|
+
|---|---|---|
|
|
178
|
+
| **Ethnicity Abstractor** | [assets/ethnicity_abstractor.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/ethnicity_abstractor.mmd) |  |
|
|
179
|
+
| **Get BMI** | [assets/get_bmi.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_bmi.mmd) |  |
|
|
180
|
+
| **Get Demographics** | [assets/get_demographics.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_demographics.mmd) |  |
|
|
181
|
+
| **Get Diagnostics** | [assets/get_diagnostics.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_diagnostics.mmd) |  |
|
|
182
|
+
| **Get Drugs** | [assets/get_drugs.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_drugs.mmd) |  |
|
|
183
|
+
| **Get Smoking** | [assets/get_smoking.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_smoking.mmd) |  |
|
|
184
|
+
| **Get News** | [assets/get_news.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_news.mmd) |  |
|
|
185
|
+
| **Get Dummy Data Cohort Searcher** | [assets/get_dummy_data_cohort_searcher.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_dummy_data_cohort_searcher.mmd) |  |
|
|
186
|
+
| **Get Method Bloods** | [assets/get_method_bloods.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_method_bloods.mmd) |  |
|
|
187
|
+
| **Get Method Patient Annotations** | [assets/get_method_pat_annotations.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_method_pat_annotations.mmd) |  |
|
|
188
|
+
| **Get Treatment Docs (No Terms Fuzzy)** | [assets/get_treatment_docs_by_iterative_multi_term_cohort_searcher_no_terms_fuzzy.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_treatment_docs_by_iterative_multi_term_cohort_searcher_no_terms_fuzzy.mmd) |  |
|
|
189
|
+
|
|
190
|
+
</details>
|
|
191
|
+
|
|
192
|
+
## Installation
|
|
193
|
+
|
|
194
|
+
### From PyPI (Recommended for Users)
|
|
195
|
+
|
|
196
|
+
`pat2vec` is published on PyPI. Once it is installed, you can use it as a library in your Python projects.
|
|
197
|
+
|
|
198
|
+
1. **Install the package:**
|
|
199
|
+
```shell
|
|
200
|
+
pip install pat2vec
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
2. **Install all optional dependencies (for full functionality):**
|
|
204
|
+
```shell
|
|
205
|
+
pip install pat2vec[all]
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
### From Source (For Developers/Contributors)
|
|
209
|
+
The following instructions are for setting up a development environment from the source code.
|
|
210
|
+
|
|
211
|
+
#### Windows
|
|
212
|
+
|
|
213
|
+
1. **Clone the repository:**
|
|
214
|
+
Navigate to the directory where you want to store your projects. It's recommended to have a parent directory to hold `pat2vec` and its related assets.
|
|
215
|
+
|
|
216
|
+
```shell
|
|
217
|
+
git clone https://github.com/SamoraHunter/pat2vec.git
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
2. **Run the installation script:**
|
|
221
|
+
Navigate into the cloned repository and run the batch script. This will create a Python virtual environment, install dependencies from `requirements.txt`, and set up a Jupyter kernel.
|
|
222
|
+
|
|
223
|
+
```shell
|
|
224
|
+
cd pat2vec
|
|
225
|
+
install.bat
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
3. **Activate the environment:**
|
|
229
|
+
To use the installed packages, activate the virtual environment:
|
|
230
|
+
```shell
|
|
231
|
+
pat2vec_env\Scripts\activate
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
4. **Set up for your IDE/Notebook:**
|
|
235
|
+
If you are using an IDE like VS Code or a Jupyter Notebook, make sure to select the `pat2vec_env` kernel to run your code.
|
|
236
|
+
|
|
237
|
+
5. **Post-Installation Setup:**
|
|
238
|
+
The script sets up the Python environment, but you must manually arrange other project assets. In the parent directory of your `pat2vec` clone, you will need to:
|
|
239
|
+
- **Clone the helper repository**:
|
|
240
|
+
```shell
|
|
241
|
+
git clone https://github.com/SamoraHunter/snomed_methods.git
|
|
242
|
+
```
|
|
243
|
+
- **Add MedCAT model**: Create a `medcat_models` directory and copy your MedCAT model pack (`.zip`) into it.
|
|
244
|
+
- **Add credentials**: Create a `credentials.py` file. You can use `pat2vec/pat2vec/config/credentials_template.py` as a starting point.
|
|
245
|
+
|
|
246
|
+
Your final directory structure should look like the one described in the Usage section.
|
|
247
|
+
|
|
248
|
+
#### Unix/Linux
|
|
249
|
+
|
|
250
|
+
The `install_pat2vec.sh` script is the recommended way to set up a development environment on Unix-like systems. It automates the full setup, including:
|
|
251
|
+
- Creating a Python virtual environment (`pat2vec_env`).
|
|
252
|
+
- Installing Python dependencies (including development and testing tools).
|
|
253
|
+
- Cloning the `snomed_methods` helper repository.
|
|
254
|
+
- Creating required directories and template files (e.g., for MedCAT models and credentials).
|
|
255
|
+
|
|
256
|
+
To install, first clone the repository and navigate into it.
|
|
257
|
+
Grant execution permissions and run the script. It must be run from within the `pat2vec` directory.
|
|
258
|
+
|
|
259
|
+
```shell
|
|
260
|
+
chmod +x install_pat2vec.sh
|
|
261
|
+
./install_pat2vec.sh
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
The script supports several options:
|
|
265
|
+
- `--proxy`: Use if you are behind a corporate proxy that mirrors Python packages.
|
|
266
|
+
- `--dev`: Installs development dependencies (e.g., `pytest`, `nbmake`) for running tests.
|
|
267
|
+
- `--all`: Installs all optional feature dependencies.
|
|
268
|
+
- `--force`: Removes any existing virtual environment and performs a clean installation.
|
|
269
|
+
- `--no-clone`: Skips cloning the `snomed_methods` repository if you already have it.
|
|
270
|
+
|
|
271
|
+
For example, to install for development behind a proxy:
|
|
272
|
+
```shell
|
|
273
|
+
./install_pat2vec.sh --proxy --dev
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
After running the script, you must perform two manual steps:
|
|
277
|
+
Both take place in the directory structure that the script creates in the parent folder of `pat2vec`:
|
|
278
|
+
- **Place MedCAT model:** Copy your model pack into the `medcat_models` directory created by the script.
|
|
279
|
+
- **Populate credentials:** Edit the `credentials.py` file created by the script and fill in your details.
|
|
280
|
+
|
|
281
|
+
Finally, activate the environment to begin working:
|
|
282
|
+
```shell
|
|
283
|
+
source pat2vec_env/bin/activate
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
## Usage
|
|
287
|
+
|
|
288
|
+
This guide outlines the steps to run a `pat2vec` analysis after completing the installation.
|
|
289
|
+
|
|
290
|
+
### 1. Finalize Project Setup
|
|
291
|
+
|
|
292
|
+
Before running an analysis, ensure your project directory is set up correctly. If you used the `install_pat2vec.sh` script, much of this is done for you.
|
|
293
|
+
|
|
294
|
+
1. **Populate `credentials.py`**: In the parent directory of your `pat2vec` clone, edit `credentials.py` with your Elasticsearch credentials.
|
|
295
|
+
2. **Add MedCAT Model**: Copy your MedCAT model pack (`.zip`) into the `medcat_models` directory.
|
|
296
|
+
|
|
297
|
+
Your final directory structure should look like this:
|
|
298
|
+
|
|
299
|
+
```
|
|
300
|
+
your_project_folder/
|
|
301
|
+
├── credentials.py # <-- Populated with your credentials
|
|
302
|
+
├── medcat_models/
|
|
303
|
+
│ └── your_model.zip # <-- Your MedCAT model pack
|
|
304
|
+
├── snomed_methods/ # <-- Cloned helper repository
|
|
305
|
+
└── pat2vec/ # <-- This repository
|
|
306
|
+
├── notebooks/
|
|
307
|
+
│ └── example_usage.ipynb
|
|
308
|
+
└── ...
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
### 2. Prepare Input Data
|
|
312
|
+
|
|
313
|
+
Create a CSV file containing your patient cohort. This file must include:
|
|
314
|
+
- A column named `client_idcode` with unique patient identifiers.
|
|
315
|
+
- Any other relevant columns, such as a diagnosis date for aligning time series data.
|
|
316
|
+
|
|
317
|
+
Place this file in an accessible location, such as a new `data` folder inside `pat2vec/notebooks/`.
|
|
318
|
+
|
|
319
|
+
### 3. Configure and Run
|
|
320
|
+
|
|
321
|
+
The `example_usage.ipynb` notebook provides a template for running the pipeline.
|
|
322
|
+
|
|
323
|
+
1. **Open the Notebook**: Navigate to `pat2vec/notebooks/` and open `example_usage.ipynb`.
|
|
324
|
+
2. **Select the Kernel**: Ensure the `pat2vec_env` Jupyter kernel is active.
|
|
325
|
+
3. **Configure the Analysis**: In the notebook, locate the `config_class`. This object controls all parameters for your run. You will need to set:
|
|
326
|
+
- Paths to your input cohort CSV and output directories.
|
|
327
|
+
- The list of features to extract.
|
|
328
|
+
- Time windows for data extraction (look-back/look-forward periods).
|
|
329
|
+
4. **Run the Pipeline**: Execute the cells in the notebook to process your data.
|
|
330
|
+
|
|
331
|
+
> **Note:** When working with real patient data, ensure the `testing` flag in the `config_class` is set to `False`.
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
## Building the Documentation
|
|
335
|
+
|
|
336
|
+
This project uses Sphinx to generate documentation from the source code's docstrings.
|
|
337
|
+
|
|
338
|
+
1. **Install development dependencies:**
|
|
339
|
+
If you haven't already, run the installation script with the `--dev` flag to install Sphinx and its extensions.
|
|
340
|
+
```shell
|
|
341
|
+
./install_pat2vec.sh --dev
|
|
342
|
+
```
|
|
343
|
+
|
|
344
|
+
2. **Activate the virtual environment:**
|
|
345
|
+
```shell
|
|
346
|
+
source pat2vec_env/bin/activate
|
|
347
|
+
```
|
|
348
|
+
|
|
349
|
+
3. **Build the HTML documentation:**
|
|
350
|
+
Navigate to the `docs/` directory and use the provided `Makefile`.
|
|
351
|
+
```shell
|
|
352
|
+
cd docs
|
|
353
|
+
make html
|
|
354
|
+
```
|
|
355
|
+
|
|
356
|
+
4. **View the documentation:**
|
|
357
|
+
The generated files will be in `docs/build/html/`. You can open the main page in your browser:
|
|
358
|
+
```shell
|
|
359
|
+
open docs/build/html/index.html
|
|
360
|
+
```
|
|
361
|
+
|
|
362
|
+
## FAQ
|
|
363
|
+
|
|
364
|
+
For answers to common questions, troubleshooting tips, and more detailed explanations of project concepts, please see our Frequently Asked Questions page.
|
|
365
|
+
- [Frequently Asked Questions](https://github.com/SamoraHunter/pat2vec/blob/main/docs/source/Frequently-Asked-Questions.md)
|
|
366
|
+
|
|
367
|
+
## Citation
|
|
368
|
+
|
|
369
|
+
If you use `pat2vec` in your research, please cite it. This helps to credit the work and allows others to find the tool.
|
|
370
|
+
|
|
371
|
+
```bibtex
|
|
372
|
+
@software{hunter_pat2vec_2024,
|
|
373
|
+
author = {Hunter, Samora},
|
|
374
|
+
title = {pat2vec: A tool for transforming EHR data into feature vectors for machine learning},
|
|
375
|
+
year = {2024},
|
|
376
|
+
publisher = {GitHub},
|
|
377
|
+
journal = {GitHub repository},
|
|
378
|
+
howpublished = {\url{https://github.com/SamoraHunter/pat2vec}}
|
|
379
|
+
}
|
|
380
|
+
```
|
|
381
|
+
|
|
382
|
+
## Contributing
|
|
383
|
+
|
|
384
|
+
Contributions are welcome! Please see the contributing guidelines for more information.
|
|
385
|
+
|
|
386
|
+
## Code of Conduct
|
|
387
|
+
|
|
388
|
+
This project and everyone participating in it is governed by a Code of Conduct. By participating, you are expected to uphold this code. Please report any unacceptable behavior.
|
|
389
|
+
|
|
390
|
+
## License
|
|
391
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|