pat2vec 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. pat2vec-0.1.1/MANIFEST.in +3 -0
  2. pat2vec-0.1.1/PKG-INFO +391 -0
  3. pat2vec-0.1.1/README.md +323 -0
  4. pat2vec-0.1.1/pat2vec/__init__.py +463 -0
  5. pat2vec-0.1.1/pat2vec/main_pat2vec.py +720 -0
  6. pat2vec-0.1.1/pat2vec/pat2vec_get_methods/__init__.py +4 -0
  7. pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_appointments.py +220 -0
  8. pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_bed.py +177 -0
  9. pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_bloods.py +388 -0
  10. pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_bmi.py +281 -0
  11. pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_core02.py +250 -0
  12. pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_core_resus.py +239 -0
  13. pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_current_pat_annotations_mrc_cs.py +126 -0
  14. pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_demo.py +196 -0
  15. pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_demographics.py +389 -0
  16. pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_diagnostics.py +354 -0
  17. pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_drugs.py +330 -0
  18. pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_hosp_site.py +169 -0
  19. pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_news.py +152 -0
  20. pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_pat_annotations.py +128 -0
  21. pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_report_annotations.py +116 -0
  22. pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_smoking.py +224 -0
  23. pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_textual_obs_annotations.py +141 -0
  24. pat2vec-0.1.1/pat2vec/pat2vec_get_methods/get_method_vte_status.py +229 -0
  25. pat2vec-0.1.1/pat2vec/pat2vec_main_methods/__init__.py +0 -0
  26. pat2vec-0.1.1/pat2vec/pat2vec_main_methods/main_batch.py +341 -0
  27. pat2vec-0.1.1/pat2vec/pat2vec_pat_list/__init__.py +0 -0
  28. pat2vec-0.1.1/pat2vec/pat2vec_pat_list/get_patient_treatment_list.py +437 -0
  29. pat2vec-0.1.1/pat2vec/pat2vec_search/cogstack_search_methods.py +1356 -0
  30. pat2vec-0.1.1/pat2vec/pat2vec_search/data_helper_functions.py +105 -0
  31. pat2vec-0.1.1/pat2vec/pat2vec_search/matcher.py +94 -0
  32. pat2vec-0.1.1/pat2vec/pat2vec_search/nearest.py +50 -0
  33. pat2vec-0.1.1/pat2vec/pat2vec_search/search_helper_functions.py +160 -0
  34. pat2vec-0.1.1/pat2vec/pat2vec_search/search_multiprocess.py +100 -0
  35. pat2vec-0.1.1/pat2vec/patvec_get_batch_methods/__init__.py +0 -0
  36. pat2vec-0.1.1/pat2vec/patvec_get_batch_methods/get_merged_batches.py +1456 -0
  37. pat2vec-0.1.1/pat2vec/patvec_get_batch_methods/get_prefetch_batches.py +225 -0
  38. pat2vec-0.1.1/pat2vec/patvec_get_batch_methods/main.py +1370 -0
  39. pat2vec-0.1.1/pat2vec/tests/config_pat2vec.py +0 -0
  40. pat2vec-0.1.1/pat2vec/tests/test_calculate_interval.py +146 -0
  41. pat2vec-0.1.1/pat2vec/tests/test_config_class.py +221 -0
  42. pat2vec-0.1.1/pat2vec/tests/test_filter_dataframe_by_timestamp_extended.py +164 -0
  43. pat2vec-0.1.1/pat2vec/tests/test_generate_date_list.py +223 -0
  44. pat2vec-0.1.1/pat2vec/tests/test_get_dummy_data_cohort_searcher_get_date.py +95 -0
  45. pat2vec-0.1.1/pat2vec/tests/test_get_method_bloods.py +0 -0
  46. pat2vec-0.1.1/pat2vec/tests/test_get_start_end_year_month.py +173 -0
  47. pat2vec-0.1.1/pat2vec/tests/test_global_date_validation.py +126 -0
  48. pat2vec-0.1.1/pat2vec/tests/test_individual_patient_window.py +297 -0
  49. pat2vec-0.1.1/pat2vec/tests/test_methods_annotation_filter_annot_dataframe.py +139 -0
  50. pat2vec-0.1.1/pat2vec/tests/test_methods_annotation_multi_annots_to_df.py +259 -0
  51. pat2vec-0.1.1/pat2vec/tests/test_methods_get.py +229 -0
  52. pat2vec-0.1.1/pat2vec/tests/test_parse_date.py +139 -0
  53. pat2vec-0.1.1/pat2vec/tests/test_post_processing_build_ipw_dataframe.py +318 -0
  54. pat2vec-0.1.1/pat2vec/tests/test_post_processing_get_pat_ipw_record.py +484 -0
  55. pat2vec-0.1.1/pat2vec/tests/test_post_processing_process_csv_files.py +169 -0
  56. pat2vec-0.1.1/pat2vec/util/__init__.py +0 -0
  57. pat2vec-0.1.1/pat2vec/util/anonymisation_data_methods.py +243 -0
  58. pat2vec-0.1.1/pat2vec/util/anonymisation_deid_documents.py +616 -0
  59. pat2vec-0.1.1/pat2vec/util/calculate_interval.py +50 -0
  60. pat2vec-0.1.1/pat2vec/util/clinical_note_splitter.py +327 -0
  61. pat2vec-0.1.1/pat2vec/util/compile_requirements.py +97 -0
  62. pat2vec-0.1.1/pat2vec/util/config_pat2vec.py +1216 -0
  63. pat2vec-0.1.1/pat2vec/util/credentials.py +34 -0
  64. pat2vec-0.1.1/pat2vec/util/current_pat_batch_path_methods.py +84 -0
  65. pat2vec-0.1.1/pat2vec/util/dummy_data_files/__init__.py +0 -0
  66. pat2vec-0.1.1/pat2vec/util/dummy_data_files/dummy_lists.py +1899 -0
  67. pat2vec-0.1.1/pat2vec/util/elasticsearch_methods.py +284 -0
  68. pat2vec-0.1.1/pat2vec/util/ethnicity_abstractor.py +730 -0
  69. pat2vec-0.1.1/pat2vec/util/evaluation_methods.py +232 -0
  70. pat2vec-0.1.1/pat2vec/util/evaluation_methods_ploting.py +116 -0
  71. pat2vec-0.1.1/pat2vec/util/filter_dataframe_by_timestamp.py +86 -0
  72. pat2vec-0.1.1/pat2vec/util/filter_methods.py +213 -0
  73. pat2vec-0.1.1/pat2vec/util/generate_date_list.py +145 -0
  74. pat2vec-0.1.1/pat2vec/util/get_best_gpu.py +31 -0
  75. pat2vec-0.1.1/pat2vec/util/get_dummy_data_cohort_searcher.py +1864 -0
  76. pat2vec-0.1.1/pat2vec/util/get_dummy_data_medcat_annotation.py +104 -0
  77. pat2vec-0.1.1/pat2vec/util/get_start_end_year_month.py +51 -0
  78. pat2vec-0.1.1/pat2vec/util/helper_functions.py +70 -0
  79. pat2vec-0.1.1/pat2vec/util/impute_data_for_pipe.py +148 -0
  80. pat2vec-0.1.1/pat2vec/util/logger_setup.py +122 -0
  81. pat2vec-0.1.1/pat2vec/util/medcat_misc_methods.py +668 -0
  82. pat2vec-0.1.1/pat2vec/util/methods_annotation.py +527 -0
  83. pat2vec-0.1.1/pat2vec/util/methods_annotation_filter_annot_dataframe.py +71 -0
  84. pat2vec-0.1.1/pat2vec/util/methods_annotation_get_pat_document_annotation_batch.py +243 -0
  85. pat2vec-0.1.1/pat2vec/util/methods_annotation_json_to_dataframe.py +240 -0
  86. pat2vec-0.1.1/pat2vec/util/methods_annotation_multi_annots_to_df.py +164 -0
  87. pat2vec-0.1.1/pat2vec/util/methods_annotation_regex.py +39 -0
  88. pat2vec-0.1.1/pat2vec/util/methods_get.py +995 -0
  89. pat2vec-0.1.1/pat2vec/util/methods_get_medcat.py +121 -0
  90. pat2vec-0.1.1/pat2vec/util/methods_post_get.py +190 -0
  91. pat2vec-0.1.1/pat2vec/util/parse_date.py +74 -0
  92. pat2vec-0.1.1/pat2vec/util/post_processing.py +1485 -0
  93. pat2vec-0.1.1/pat2vec/util/post_processing_build_ipw_dataframe.py +80 -0
  94. pat2vec-0.1.1/pat2vec/util/post_processing_build_methods.py +683 -0
  95. pat2vec-0.1.1/pat2vec/util/post_processing_get_pat_ipw_record.py +379 -0
  96. pat2vec-0.1.1/pat2vec/util/post_processing_medcat.py +230 -0
  97. pat2vec-0.1.1/pat2vec/util/post_processing_process_csv_files.py +293 -0
  98. pat2vec-0.1.1/pat2vec/util/pre_get_drug_treatment_docs.py +465 -0
  99. pat2vec-0.1.1/pat2vec/util/pre_processing.py +457 -0
  100. pat2vec-0.1.1/pat2vec/util/presentation_methods.py +106 -0
  101. pat2vec-0.1.1/pat2vec/util/testing_helpers.py +29 -0
  102. pat2vec-0.1.1/pat2vec.egg-info/PKG-INFO +391 -0
  103. pat2vec-0.1.1/pat2vec.egg-info/SOURCES.txt +108 -0
  104. pat2vec-0.1.1/pat2vec.egg-info/dependency_links.txt +1 -0
  105. pat2vec-0.1.1/pat2vec.egg-info/requires.txt +51 -0
  106. pat2vec-0.1.1/pat2vec.egg-info/top_level.txt +2 -0
  107. pat2vec-0.1.1/pat2vec_env/bin/vba_extract.py +79 -0
  108. pat2vec-0.1.1/pyproject.toml +87 -0
  109. pat2vec-0.1.1/setup.cfg +4 -0
  110. pat2vec-0.1.1/setup.py +24 -0
@@ -0,0 +1,3 @@
1
+ include README.md
2
+ include LICENSE
3
+ recursive-include pat2vec py.typed
pat2vec-0.1.1/PKG-INFO ADDED
@@ -0,0 +1,391 @@
1
+ Metadata-Version: 2.4
2
+ Name: pat2vec
3
+ Version: 0.1.1
4
+ Summary: A comprehensive Python package for healthcare data engineering, designed to extract, transform, and feature engineer patient data from CogStack-based electronic health record (EHR) datalakes. It provides tools for cohort building, batch data processing, clinical note analysis, and creating machine learning-ready datasets.
5
+ Home-page: https://github.com/SamoraHunter/pat2vec.git
6
+ Author: Samora Hunter
7
+ Author-email: Samora Hunter <samorahunter@gmail.com>
8
+ Project-URL: Homepage, https://github.com/SamoraHunter/pat2vec
9
+ Project-URL: Documentation, https://samorahunter.github.io/pat2vec/
10
+ Project-URL: Repository, https://github.com/SamoraHunter/pat2vec
11
+ Project-URL: Changelog, https://github.com/SamoraHunter/pat2vec/releases
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: OS Independent
15
+ Requires-Python: >=3.10
16
+ Description-Content-Type: text/markdown
17
+ Requires-Dist: medcat==1.16.0
18
+ Requires-Dist: paramiko
19
+ Requires-Dist: colorama
20
+ Requires-Dist: elasticsearch==8.17.0
21
+ Requires-Dist: eland==8.17.0
22
+ Requires-Dist: faker
23
+ Requires-Dist: fuzzywuzzy
24
+ Requires-Dist: seaborn
25
+ Requires-Dist: rapidfuzz
26
+ Requires-Dist: python-pptx
27
+ Requires-Dist: ipykernel
28
+ Requires-Dist: transformers==4.48.1
29
+ Requires-Dist: accelerate==1.3.0
30
+ Requires-Dist: peft==0.8.2
31
+ Requires-Dist: huggingface-hub==0.27.1
32
+ Requires-Dist: polars
33
+ Requires-Dist: pandas>=1.5.3
34
+ Requires-Dist: numpy>=1.25.2
35
+ Provides-Extra: all
36
+ Requires-Dist: torch>=2.5.1; extra == "all"
37
+ Requires-Dist: scikit-learn>=1.6.1; extra == "all"
38
+ Requires-Dist: jupyterlab; extra == "all"
39
+ Requires-Dist: lifelines==0.28.0; extra == "all"
40
+ Requires-Dist: bokeh==3.6.2; extra == "all"
41
+ Requires-Dist: dask; extra == "all"
42
+ Requires-Dist: datasets; extra == "all"
43
+ Requires-Dist: numba; extra == "all"
44
+ Requires-Dist: statsmodels; extra == "all"
45
+ Requires-Dist: wordcloud; extra == "all"
46
+ Requires-Dist: matplotlib-venn; extra == "all"
47
+ Requires-Dist: nltk; extra == "all"
48
+ Requires-Dist: sqlalchemy; extra == "all"
49
+ Requires-Dist: openpyxl; extra == "all"
50
+ Requires-Dist: pydot; extra == "all"
51
+ Requires-Dist: pyodbc; extra == "all"
52
+ Requires-Dist: python-tds; extra == "all"
53
+ Requires-Dist: umls-api; extra == "all"
54
+ Provides-Extra: dev
55
+ Requires-Dist: pytest; extra == "dev"
56
+ Requires-Dist: nbformat; extra == "dev"
57
+ Requires-Dist: nbconvert; extra == "dev"
58
+ Requires-Dist: nbstripout; extra == "dev"
59
+ Requires-Dist: nbmake; extra == "dev"
60
+ Requires-Dist: pre-commit; extra == "dev"
61
+ Requires-Dist: sphinx==8.1.3; extra == "dev"
62
+ Requires-Dist: sphinx-rtd-theme==3.0.2; extra == "dev"
63
+ Requires-Dist: myst-parser==4.0.1; extra == "dev"
64
+ Requires-Dist: sphinx-autodoc-typehints==3.0.1; extra == "dev"
65
+ Requires-Dist: sphinxcontrib-mermaid; extra == "dev"
66
+ Dynamic: author
67
+ Dynamic: home-page
68
+
69
+ [![Documentation Status](https://github.com/SamoraHunter/pat2vec/actions/workflows/docs.yml/badge.svg)](https://samorahunter.github.io/pat2vec/)
70
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
71
+ ![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)
72
+
73
+ ## Table of Contents
74
+ - [Overview](#overview)
75
+ - [Documentation](#documentation)
76
+ - [Example Use Cases](#example-use-cases)
77
+ - [1. Patient-Level Aggregation](#1-patient-level-aggregation)
78
+ - [2. Longitudinal Time Series Construction](#2-longitudinal-time-series-construction)
79
+ - [Requirements](#requirements)
80
+ - [Features](#features)
81
+ - [📊 Diagrams](#-diagrams)
82
+ - [System Architecture & Configuration](#-system-architecture--configuration)
83
+ - [Data Pipelines](#data-pipelines)
84
+ - [Methods & Post-Processing](#-methods--post-processing)
85
+ - [Feature Extraction](#-feature-extraction)
86
+ - [Installation](#installation)
87
+ - [Windows](#windows)
88
+ - [Unix/Linux](#unixlinux)
89
+ - [Usage](#usage)
90
+ - [FAQ](#faq)
91
+ - [Citation](#citation)
92
+ - [Contributing](#contributing)
93
+ - [Code of Conduct](#code-of-conduct)
94
+ - [License](#license)
95
+
96
+
97
+ # Overview
98
+
99
+ This tool converts individual patient records into structured time-interval feature vectors, making them suitable for filtering, aggregation, and assembly into a data matrix **D** for binary classification machine learning tasks.
100
+
101
+ ## Documentation
102
+
103
+ The full API documentation for `pat2vec` is automatically generated and hosted on GitHub Pages.
104
+
105
+ **[View the Live Documentation](https://samorahunter.github.io/pat2vec/)**
106
+
107
+ ## Example Use Cases
108
+
109
+ ### 1. Patient-Level Aggregation
110
+ Compute summary statistics (e.g., the mean of *n* variables) for each unique patient, resulting in one row per patient. This is ideal for models requiring a single representation per individual.
111
+
112
+ ### 2. Longitudinal Time Series Construction
113
+ Generate a monthly time series for each patient that includes:
114
+
115
+ - Biochemistry results
116
+ - Demographic attributes
117
+ - MedCat-derived clinical text annotations
118
+
119
+ The time series spans up to 25 years retrospectively, aligned to each patient's diagnosis date, enabling a consistent retrospective view across varying start times.
120
+
121
+ ## Requirements
122
+
123
+ **Core Services:**
124
+ - **CogStack**: An operational instance for data retrieval. The required client libraries are now bundled with this project.
125
+ - **Elasticsearch**: The backend for CogStack.
126
+ - **MedCAT**: For medical concept annotation.
127
+
128
+ **Local Setup:**
129
+ - **Python**: Version 3.10 or higher.
130
+ - **Virtual Environment**: Requires the `python3-venv` package (or equivalent for your OS).
131
+ - For all other Python packages, see `requirements.txt`.
132
+
133
+ ## Features
134
+
135
+ `pat2vec` offers a flexible suite of tools for processing and analyzing patient data.
136
+
137
+ **Patient Processing**
138
+ - **Single & Batch Processing**: Process individual patients for detailed analysis or run large batches for cohort-level studies.
139
+
140
+ **Cohort Management**
141
+ - **Cohort Search & Creation**: Define and build patient cohorts using flexible search criteria.
142
+ - **Automated Control Matching**: Automatically generate random control groups for case-control studies.
143
+
144
+ **Flexible Feature Engineering**
145
+ - **Modular Feature Selection**: Choose from a wide range of feature extractors to build a custom feature space tailored to your research question.
146
+ - **Temporal Windowing**: Define precise time windows for data extraction relative to a key event (e.g., diagnosis date), including look-back and look-forward periods.
147
+
148
+ ## 📊 Diagrams
149
+
150
+ <details>
151
+ <summary>Click to view project diagrams</summary>
152
+
153
+ This project includes a collection of diagrams illustrating the system architecture, data pipelines, and feature extraction workflows. You can view the Mermaid definitions or the rendered diagrams below.
154
+
155
+ #### 📂 System Architecture & Configuration
156
+ | Diagram | Mermaid | Image |
157
+ |---|---|---|
158
+ | **System Architecture** | [assets/system_architecture.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/system_architecture.mmd) | ![System Architecture](https://github.com/SamoraHunter/pat2vec/blob/main/assets/system_architecture.png?raw=true) |
159
+ | **Configuration** | [assets/config.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/config.mmd) | ![Configuration](https://github.com/SamoraHunter/pat2vec/blob/main/assets/config.svg) |
160
+
161
+ #### 🛠️ Data Pipelines
162
+ | Diagram | Mermaid | Image |
163
+ |---|---|---|
164
+ | **Data Pipeline** | [assets/data_pipeline.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/data_pipeline.mmd) | ![Data Pipeline](https://github.com/SamoraHunter/pat2vec/blob/main/assets/data_pipeline.png?raw=true) |
165
+ | **Main Batch Processing** | [assets/main_batch.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/main_batch.mmd) | ![Main Batch](https://github.com/SamoraHunter/pat2vec/blob/main/assets/main_batch.svg) |
166
+ | **Example Ingestion** | [assets/example_ingestion.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/example_ingestion.mmd) | <img src="https://github.com/SamoraHunter/pat2vec/blob/main/assets/example_ingestion.png?raw=true" alt="Example Ingestion" height="400"/> |
167
+
168
+ #### 🧩 Methods & Post-Processing
169
+ | Diagram | Mermaid | Image |
170
+ |---|---|---|
171
+ | **Methods Annotation** | [assets/methods_annotation.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/methods_annotation.mmd) | ![Methods Annotation](https://github.com/SamoraHunter/pat2vec/blob/main/assets/methods_annotation.png?raw=true) |
172
+ | **Post-Processing Build Methods** | [assets/post_processing_build_methods.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/post_processing_build_methods.mmd) | ![Post-Processing Build Methods](https://github.com/SamoraHunter/pat2vec/blob/main/assets/post_processing_build_methods.svg) |
173
+ | **Post-Processing Anonymisation** | [assets/post_processing_anonymisation_high_level.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/post_processing_anonymisation_high_level.mmd) | ![Post-Processing Anonymisation](https://github.com/SamoraHunter/pat2vec/blob/main/assets/post_processing_anonymisation_high_level.svg) |
174
+
175
+ #### 🔍 Feature Extraction
176
+ | Diagram | Mermaid | Image |
177
+ |---|---|---|
178
+ | **Ethnicity Abstractor** | [assets/ethnicity_abstractor.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/ethnicity_abstractor.mmd) | ![Ethnicity Abstractor](https://github.com/SamoraHunter/pat2vec/blob/main/assets/ethnicity_abstractor.svg) |
179
+ | **Get BMI** | [assets/get_bmi.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_bmi.mmd) | ![Get BMI](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_bmi.svg) |
180
+ | **Get Demographics** | [assets/get_demographics.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_demographics.mmd) | ![Get Demographics](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_demographics.svg) |
181
+ | **Get Diagnostics** | [assets/get_diagnostics.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_diagnostics.mmd) | ![Get Diagnostics](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_diagnostics.svg) |
182
+ | **Get Drugs** | [assets/get_drugs.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_drugs.mmd) | ![Get Drugs](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_drugs.svg) |
183
+ | **Get Smoking** | [assets/get_smoking.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_smoking.mmd) | ![Get Smoking](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_smoking.svg) |
184
+ | **Get News** | [assets/get_news.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_news.mmd) | ![Get News](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_news.svg) |
185
+ | **Get Dummy Data Cohort Searcher** | [assets/get_dummy_data_cohort_searcher.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_dummy_data_cohort_searcher.mmd) | ![Get Dummy Data Cohort Searcher](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_dummy_data_cohort_searcher.svg) |
186
+ | **Get Method Bloods** | [assets/get_method_bloods.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_method_bloods.mmd) | ![Get Method Bloods](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_method_bloods.svg) |
187
+ | **Get Method Patient Annotations** | [assets/get_method_pat_annotations.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_method_pat_annotations.mmd) | ![Get Method Patient Annotations](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_method_pat_annotations.svg) |
188
+ | **Get Treatment Docs (No Terms Fuzzy)** | [assets/get_treatment_docs_by_iterative_multi_term_cohort_searcher_no_terms_fuzzy.mmd](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_treatment_docs_by_iterative_multi_term_cohort_searcher_no_terms_fuzzy.mmd) | ![Get Treatment Docs (No Terms Fuzzy)](https://github.com/SamoraHunter/pat2vec/blob/main/assets/get_treatment_docs_by_iterative_multi_term_cohort_searcher_no_terms_fuzzy.svg) |
189
+
190
+ </details>
191
+
192
+ ## Installation
193
+
194
+ ### From PyPI (Recommended for Users)
195
+
196
+ Install `pat2vec` from PyPI to use it as a library in your Python projects.
197
+
198
+ 1. **Install the package:**
199
+ ```shell
200
+ pip install pat2vec
201
+ ```
202
+
203
+ 2. **Install all optional dependencies (for full functionality):**
204
+ ```shell
205
+ pip install pat2vec[all]
206
+ ```
207
+
208
+ ### From Source (For Developers/Contributors)
209
+ The following instructions are for setting up a development environment from the source code.
210
+
211
+ #### Windows
212
+
213
+ 1. **Clone the repository:**
214
+ Navigate to the directory where you want to store your projects. It's recommended to have a parent directory to hold `pat2vec` and its related assets.
215
+
216
+ ```shell
217
+ git clone https://github.com/SamoraHunter/pat2vec.git
218
+ ```
219
+
220
+ 2. **Run the installation script:**
221
+ Navigate into the cloned repository and run the batch script. This will create a Python virtual environment, install dependencies from `requirements.txt`, and set up a Jupyter kernel.
222
+
223
+ ```shell
224
+ cd pat2vec
225
+ install.bat
226
+ ```
227
+
228
+ 3. **Activate the environment:**
229
+ To use the installed packages, activate the virtual environment:
230
+ ```shell
231
+ pat2vec_env\Scripts\activate
232
+ ```
233
+
234
+ 4. **Set up for your IDE/Notebook:**
235
+ If you are using an IDE like VS Code or a Jupyter Notebook, make sure to select the `pat2vec_env` kernel to run your code.
236
+
237
+ 5. **Post-Installation Setup:**
238
+ The script sets up the Python environment, but you must manually arrange other project assets. In the parent directory of your `pat2vec` clone, you will need to:
239
+ - **Clone the helper repository**:
240
+ ```shell
241
+ git clone https://github.com/SamoraHunter/snomed_methods.git
242
+ ```
243
+ - **Add MedCAT model**: Create a `medcat_models` directory and copy your MedCAT model pack (`.zip`) into it.
244
+ - **Add credentials**: Create a `credentials.py` file. You can use `pat2vec/pat2vec/config/credentials_template.py` as a starting point.
245
+
246
+ Your final directory structure should look like the one described in the Usage section.
247
+
248
+ #### Unix/Linux
249
+
250
+ The `install_pat2vec.sh` script is the recommended way to set up a development environment on Unix-like systems. It automates the full setup, including:
251
+ - Creating a Python virtual environment (`pat2vec_env`).
252
+ - Installing Python dependencies (including development and testing tools).
253
+ - Cloning the `snomed_methods` helper repository.
254
+ - Creating required directories and template files (e.g., for MedCAT models and credentials).
255
+
256
+ To install, clone the repository and navigate into it.
257
+ Grant execution permissions and run the script. It must be run from within the `pat2vec` directory.
258
+
259
+ ```shell
260
+ chmod +x install_pat2vec.sh
261
+ ./install_pat2vec.sh
262
+ ```
263
+
264
+ The script supports several options:
265
+ - `--proxy`: Use if you are behind a corporate proxy that mirrors Python packages.
266
+ - `--dev`: Installs development dependencies (e.g., `pytest`, `nbmake`) for running tests.
267
+ - `--all`: Installs all optional feature dependencies.
268
+ - `--force`: Removes any existing virtual environment and performs a clean installation.
269
+ - `--no-clone`: Skips cloning the `snomed_methods` repository if you already have it.
270
+
271
+ For example, to install for development behind a proxy:
272
+ ```shell
273
+ ./install_pat2vec.sh --proxy --dev
274
+ ```
275
+
276
+ After running the script, you must perform two manual steps:
277
+ The script creates a directory structure in the parent folder of `pat2vec`.
278
+ - **Place MedCAT model:** Copy your model pack into the `medcat_models` directory created by the script.
279
+ - **Populate credentials:** Edit the `credentials.py` file created by the script and fill in your details.
280
+
281
+ Finally, activate the environment to begin working:
282
+ ```shell
283
+ source pat2vec_env/bin/activate
284
+ ```
285
+
286
+ ## Usage
287
+
288
+ This guide outlines the steps to run a `pat2vec` analysis after completing the installation.
289
+
290
+ ### 1. Finalize Project Setup
291
+
292
+ Before running an analysis, ensure your project directory is set up correctly. If you used the `install_pat2vec.sh` script, much of this is done for you.
293
+
294
+ 1. **Populate `credentials.py`**: In the parent directory of your `pat2vec` clone, edit `credentials.py` with your Elasticsearch credentials.
295
+ 2. **Add MedCAT Model**: Copy your MedCAT model pack (`.zip`) into the `medcat_models` directory.
296
+
297
+ Your final directory structure should look like this:
298
+
299
+ ```
300
+ your_project_folder/
301
+ ├── credentials.py # <-- Populated with your credentials
302
+ ├── medcat_models/
303
+ │ └── your_model.zip # <-- Your MedCAT model pack
304
+ ├── snomed_methods/ # <-- Cloned helper repository
305
+ └── pat2vec/ # <-- This repository
306
+ ├── notebooks/
307
+ │ └── example_usage.ipynb
308
+ └── ...
309
+ ```
310
+
311
+ ### 2. Prepare Input Data
312
+
313
+ Create a CSV file containing your patient cohort. This file must include:
314
+ - A column named `client_idcode` with unique patient identifiers.
315
+ - Any other relevant columns, such as a diagnosis date for aligning time series data.
316
+
317
+ Place this file in an accessible location, such as a new `data` folder inside `pat2vec/notebooks/`.
318
+
319
+ ### 3. Configure and Run
320
+
321
+ The `example_usage.ipynb` notebook provides a template for running the pipeline.
322
+
323
+ 1. **Open the Notebook**: Navigate to `pat2vec/notebooks/` and open `example_usage.ipynb`.
324
+ 2. **Select the Kernel**: Ensure the `pat2vec_env` Jupyter kernel is active.
325
+ 3. **Configure the Analysis**: In the notebook, locate the `config_class`. This object controls all parameters for your run. You will need to set:
326
+ - Paths to your input cohort CSV and output directories.
327
+ - The list of features to extract.
328
+ - Time windows for data extraction (look-back/look-forward periods).
329
+ 4. **Run the Pipeline**: Execute the cells in the notebook to process your data.
330
+
331
+ > **Note:** When working with real patient data, ensure the `testing` flag in the `config_class` is set to `False`.
332
+
333
+
334
+ ## Building the Documentation
335
+
336
+ This project uses Sphinx to generate documentation from the source code's docstrings.
337
+
338
+ 1. **Install development dependencies:**
339
+ If you haven't already, run the installation script with the `--dev` flag to install Sphinx and its extensions.
340
+ ```shell
341
+ ./install_pat2vec.sh --dev
342
+ ```
343
+
344
+ 2. **Activate the virtual environment:**
345
+ ```shell
346
+ source pat2vec_env/bin/activate
347
+ ```
348
+
349
+ 3. **Build the HTML documentation:**
350
+ Navigate to the `docs/` directory and use the provided `Makefile`.
351
+ ```shell
352
+ cd docs
353
+ make html
354
+ ```
355
+
356
+ 4. **View the documentation:**
357
+ The generated files will be in `docs/build/html/`. You can open the main page in your browser:
358
+ ```shell
359
+ open docs/build/html/index.html
360
+ ```
361
+
362
+ ## FAQ
363
+
364
+ For answers to common questions, troubleshooting tips, and more detailed explanations of project concepts, please see our Frequently Asked Questions page.
365
+ - [Frequently Asked Questions](./docs/source/Frequently-Asked-Questions.md)
366
+
367
+ ## Citation
368
+
369
+ If you use `pat2vec` in your research, please cite it. This helps to credit the work and allows others to find the tool.
370
+
371
+ ```bibtex
372
+ @software{hunter_pat2vec_2024,
373
+ author = {Hunter, Samora},
374
+ title = {pat2vec: A tool for transforming EHR data into feature vectors for machine learning},
375
+ year = {2024},
376
+ publisher = {GitHub},
377
+ journal = {GitHub repository},
378
+ howpublished = {\url{https://github.com/SamoraHunter/pat2vec}}
379
+ }
380
+ ```
381
+
382
+ ## Contributing
383
+
384
+ Contributions are welcome! Please see the contributing guidelines for more information.
385
+
386
+ ## Code of Conduct
387
+
388
+ This project and everyone participating in it is governed by a Code of Conduct. By participating, you are expected to uphold this code. Please report any unacceptable behavior.
389
+
390
+ ## License
391
+ This project is licensed under the MIT License - see the LICENSE file for details.