project-ryland 2.0.1__tar.gz

@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 Justin Vinh
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,93 @@
+ Metadata-Version: 2.4
+ Name: project_ryland
+ Version: 2.0.1
+ Summary: This project develops standardized tools for using LLMs in research studies aimed at improving patient care.
+ Author-email: Justin Vinh <jvinh21@gmail.com>
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ Classifier: Programming Language :: Python :: 3
+ License-File: LICENSE
+ Requires-Dist: pandas>=2.0
+ Requires-Dist: numpy>=1.26
+ Requires-Dist: matplotlib>=3.9
+ Requires-Dist: scikit-learn>=1.5
+ Requires-Dist: lifelines>=0.28
+ Requires-Dist: tqdm>=4.66
+ Requires-Dist: numexpr>=2.10.2
+ Requires-Dist: loguru>=0.7
+ Requires-Dist: orjson>=3.10
+ Requires-Dist: pyyaml>=6.0
+ Requires-Dist: environs>=9.5
+ Requires-Dist: openai>=1.43
+ Requires-Dist: azure-identity>=1.17
+ Requires-Dist: azure-core>=1.30
+ Requires-Dist: pydantic>=2.6
+ Requires-Dist: python-dateutil>=2.9
+ Requires-Dist: requests>=2.31
+
+ # project_ryland
+
+ <a target="_blank" href="https://cookiecutter-data-science.drivendata.org/">
+     <img src="https://img.shields.io/badge/CCDS-Project%20template-328F97?logo=cookiecutter" />
+ </a>
+
+ This project develops standardized tools for using LLMs in research studies aimed at improving patient care.
+
+ RYLAND stands for Research sYstem for LLM-based Analytics of Novel Data. Ryland is the protagonist of Justin's favorite book (he'll leave it to you to figure out which one).
+
+ Note: the file tree below is out of date and needs updating.
+
+ ## Project Organization
+
+ ```
+ ├── LICENSE            <- Open-source license if one is chosen
+ ├── Makefile           <- Makefile with convenience commands like `make data` or `make train`
+ ├── README.md          <- The top-level README for developers using this project.
+ ├── data
+ │   ├── external       <- Data from third party sources.
+ │   ├── interim        <- Intermediate data that has been transformed.
+ │   ├── processed      <- The final, canonical data sets for modeling.
+ │   └── raw            <- The original, immutable data dump.
+
+ ├── docs               <- A default mkdocs project; see www.mkdocs.org for details
+
+ ├── models             <- Trained and serialized models, model predictions, or model summaries
+
+ ├── notebooks          <- Jupyter notebooks. Naming convention is a number (for ordering),
+ │                         the creator's initials, and a short `-` delimited description, e.g.
+ │                         `1.0-jqp-initial-data-exploration`.
+
+ ├── pyproject.toml     <- Project configuration file with package metadata for
+ │                         project_ryland_code and configuration for tools like black
+
+ ├── references         <- Data dictionaries, manuals, and all other explanatory materials.
+
+ ├── reports            <- Generated analysis as HTML, PDF, LaTeX, etc.
+ │   └── figures        <- Generated graphics and figures to be used in reporting
+
+ ├── requirements.txt   <- The requirements file for reproducing the analysis environment, e.g.
+ │                         generated with `pip freeze > requirements.txt`
+
+ ├── setup.cfg          <- Configuration file for flake8
+
+ └── project_ryland_code   <- Source code for use in this project.
+
+     ├── __init__.py    <- Makes project_ryland_code a Python module
+
+     ├── config.py      <- Store useful variables and configuration
+
+     ├── dataset.py     <- Scripts to download or generate data
+
+     ├── features.py    <- Code to create features for modeling
+
+     ├── modeling
+     │   ├── __init__.py
+     │   ├── predict.py <- Code to run model inference with trained models
+     │   └── train.py   <- Code to train models
+
+     └── plots.py       <- Code to create visualizations
+ ```
+
+ --------
+
+
@@ -0,0 +1,65 @@
+ # project_ryland
+
+ <a target="_blank" href="https://cookiecutter-data-science.drivendata.org/">
+     <img src="https://img.shields.io/badge/CCDS-Project%20template-328F97?logo=cookiecutter" />
+ </a>
+
+ This project develops standardized tools for using LLMs in research studies aimed at improving patient care.
+
+ RYLAND stands for Research sYstem for LLM-based Analytics of Novel Data. Ryland is the protagonist of Justin's favorite book (he'll leave it to you to figure out which one).
+
+ Note: the file tree below is out of date and needs updating.
+
+ ## Project Organization
+
+ ```
+ ├── LICENSE            <- Open-source license if one is chosen
+ ├── Makefile           <- Makefile with convenience commands like `make data` or `make train`
+ ├── README.md          <- The top-level README for developers using this project.
+ ├── data
+ │   ├── external       <- Data from third party sources.
+ │   ├── interim        <- Intermediate data that has been transformed.
+ │   ├── processed      <- The final, canonical data sets for modeling.
+ │   └── raw            <- The original, immutable data dump.
+
+ ├── docs               <- A default mkdocs project; see www.mkdocs.org for details
+
+ ├── models             <- Trained and serialized models, model predictions, or model summaries
+
+ ├── notebooks          <- Jupyter notebooks. Naming convention is a number (for ordering),
+ │                         the creator's initials, and a short `-` delimited description, e.g.
+ │                         `1.0-jqp-initial-data-exploration`.
+
+ ├── pyproject.toml     <- Project configuration file with package metadata for
+ │                         project_ryland_code and configuration for tools like black
+
+ ├── references         <- Data dictionaries, manuals, and all other explanatory materials.
+
+ ├── reports            <- Generated analysis as HTML, PDF, LaTeX, etc.
+ │   └── figures        <- Generated graphics and figures to be used in reporting
+
+ ├── requirements.txt   <- The requirements file for reproducing the analysis environment, e.g.
+ │                         generated with `pip freeze > requirements.txt`
+
+ ├── setup.cfg          <- Configuration file for flake8
+
+ └── project_ryland_code   <- Source code for use in this project.
+
+     ├── __init__.py    <- Makes project_ryland_code a Python module
+
+     ├── config.py      <- Store useful variables and configuration
+
+     ├── dataset.py     <- Scripts to download or generate data
+
+     ├── features.py    <- Code to create features for modeling
+
+     ├── modeling
+     │   ├── __init__.py
+     │   ├── predict.py <- Code to run model inference with trained models
+     │   └── train.py   <- Code to train models
+
+     └── plots.py       <- Code to create visualizations
+ ```
+
+ --------
+
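For orientation, here is a minimal usage sketch of the wrapper this package ships (an editor's illustration, not part of the packaged files). The schema `NoteExtraction`, the file `notes.csv`, its `note_text` column, and the `llm_wrapper` module path are all assumptions made for the example; credentials are read from a local `.env`, as the wrapper expects:

```python
from pydantic import BaseModel

from project_ryland_code.llm_wrapper import LLM_wrapper  # module path assumed

# Hypothetical structured-output schema used only for this illustration
class NoteExtraction(BaseModel):
    documentation: str
    text: str

wrapper = LLM_wrapper(model_name='gpt-4o')  # must be a key in llm_model_meta
df = wrapper.process_text_data(
    input_file_path='notes.csv',  # assumed CSV with a 'note_text' column
    text_column='note_text',
    format_class=NoteExtraction,
    prompt_text='You are reviewing a clinical note. Fill in the requested fields.',
    sample_mode=True,             # process only the first 10 rows
)
```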
@@ -0,0 +1,3 @@
+ from importlib.metadata import version
+
+ __version__ = version("project_ryland")
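Because the distribution is published as `project_ryland`, `importlib.metadata.version` resolves the installed version at import time. A quick check (illustrative only):

```python
import project_ryland

print(project_ryland.__version__)  # e.g. '2.0.1' for this release
```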
@@ -0,0 +1,44 @@
+ """
+ ------------------------------------------------------------------------------
+ Author: Justin Vinh
+ Collaborators: Thomas Sounack
+ Parent Package: Project Ryland
+ Creation Date: 2025.10.13
+
+ Purpose:
+     Set up meta info for each model, including cost and API type
+ ------------------------------------------------------------------------------
+ """
+
+ # ============================================================================
+ # LAST UPDATED: 2025.01.28
+ # ============================================================================
+
+ # Cost metadata for each model
+ llm_model_meta = {
+     'gpt-4o-2024-05-13-api': {
+         'cost_per_1M_token_input': 5.00,
+         'cost_per_1M_token_output': 15.00,
+         'type': 'GPT4DFCI'
+     },
+     'gpt-4o-2024-08-06': {
+         'cost_per_1M_token_input': 2.50,
+         'cost_per_1M_token_output': 10.00,
+         'type': 'OpenAI'
+     },
+     'gpt-4o-mini-2024-07-18-api': {
+         'cost_per_1M_token_input': 0.15,
+         'cost_per_1M_token_output': 0.60,
+         'type': 'GPT4DFCI'
+     },
+     'gpt-4o': {
+         'cost_per_1M_token_input': 2.50,
+         'cost_per_1M_token_output': 10.00,
+         'type': 'GPT4DFCI'
+     },
+     'gpt-5': {
+         'cost_per_1M_token_input': 1.25,
+         'cost_per_1M_token_output': 10.00,
+         'type': 'GPT4DFCI'
+     }
+ }
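The per-one-million-token figures above are combined with usage counts from an API response to price a call. A small sketch of the arithmetic (editor's illustration; the import path is assumed from the project tree):

```python
from project_ryland_code.llm_config import llm_model_meta  # import path assumed

meta = llm_model_meta['gpt-4o-2024-08-06']
# Hypothetical call: 12,000 input tokens and 800 output tokens
cost = (meta['cost_per_1M_token_input'] * 12_000
        + meta['cost_per_1M_token_output'] * 800) / 1e6
print(f'${cost:.3f}')  # -> $0.038
```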
@@ -0,0 +1,652 @@
+ """
+ ------------------------------------------------------------------------------
+ Author: Justin Vinh
+ Collaborators: Thomas Sounack
+ Institution: Dana-Farber Cancer Institute
+ Working Groups: Lindvall & Rhee Labs
+ Parent Package: Project Ryland
+ Creation Date: 2025.10.06
+ Last Modified: 2025.11.24
+
+ Purpose:
+     Contain the functions necessary to pull the proper LLM prompt and
+     then connect to the OpenAI API to run the prompt on given data
+ ------------------------------------------------------------------------------
+ """
+
+ import glob
+ import json
+ import logging
+ import os
+ import re
+ import sys
+ from datetime import datetime
+ from pathlib import Path
+ from typing import List, Dict, Any
+
+ import openai
+ import pandas as pd
+ import yaml
+ from azure.identity import DefaultAzureCredential, get_bearer_token_provider
+ from environs import Env
+ from openai import AzureOpenAI, OpenAI
+ from pydantic import ValidationError
+ from tqdm import tqdm
+
+ from .llm_config import llm_model_meta
+ from project_ryland import __version__
+
+ # --- Configure logging ---
+ logger = logging.getLogger()
+ logger.setLevel(logging.INFO)
+
+ # Clear existing handlers
+ logger.handlers = []
+
+ # File handler
+ file_handler = logging.FileHandler("llm_tracking.log")
+ file_handler.setFormatter(logging.Formatter(
+     "%(asctime)s | %(message)s", "%Y-%m-%d %H:%M:%S"
+ ))
+ logger.addHandler(file_handler)
+
+ # Silence noisy libraries
+ logging.getLogger("openai").setLevel(logging.WARNING)
+ logging.getLogger("httpx").setLevel(logging.WARNING)
+ # --- Configure logging ---
+
+ def retrieve_llm_prompt(
+         prompt_text: str = None,
+         use_prompt_gallery: bool = False,
+         prompt_name: str = None,
+         prompt_gallery_path: str = None) -> Dict[str, str]:
+     """
+     Retrieve a specific LLM prompt from the centralized prompt gallery.
+     Looks up the prompt_name in the YAML registry, loads the associated .txt file,
+     and returns the full text. Optionally returns YAML metadata as well.
+     """
+
+     # Use the prompt gallery if available and specified to do so
+     if use_prompt_gallery:
+         # Define the prompt gallery root and prompt config file
+         if prompt_gallery_path is None:
+             raise ValueError(
+                 '[ERROR] Using prompt gallery but gallery path not provided.'
+             )
+         gallery_dir = prompt_gallery_path
+         prompt_config_path = f"{gallery_dir}/config_llm_prompts.yaml"
+
+         # Open the reference YAML file and handle potential errors
+         try:
+             with open(prompt_config_path, 'r') as f:
+                 prompts = yaml.safe_load(f)
+         except FileNotFoundError:
+             raise FileNotFoundError(
+                 f'[ERROR] Could not find prompt config file: {prompt_config_path}. '
+                 f'Check file or path to prompt gallery.'
+             )
+         except yaml.YAMLError as e:
+             raise ValueError(f'Error parsing prompt config file: {e}')
+
+         # Validate the prompt name before moving forward
+         if prompt_name not in prompts:
+             raise KeyError(f'[ERROR] Prompt {prompt_name} not found in {prompt_config_path}')
+
+         # Retrieve prompt metadata
+         prompt_meta = prompts[prompt_name]
+         prompt_filename = f"{gallery_dir}/{prompt_meta['filename']}"
+
+         # Based on the reference file and prompt name, load the prompt (and handle errors)
+         try:
+             with open(prompt_filename, 'r') as f:
+                 prompt_text = f.read().strip()
+         except FileNotFoundError:
+             raise FileNotFoundError(f'[ERROR] Prompt file not found: {prompt_filename}')
+
+     else:
+         # If the user supplies *only* the prompt text without using the prompt
+         # gallery feature, use the given text and create dummy metadata
+         prompt_meta = {'filename': 'Filename (Not Applicable)',
+                        'description': 'Description Unknown',
+                        'author': 'Author Unknown',
+                        'date': 'Date Unknown'}
+
+     # Return the prompt and the metadata as a dict
+     return {'prompt_text': prompt_text, 'metadata': prompt_meta}
+
+
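+ # Illustrative usage (editor's sketch, not part of the original module): assuming
+ # a gallery directory ./prompts containing config_llm_prompts.yaml with an entry
+ # named 'example_prompt' whose 'filename' key points at a .txt template, the
+ # lookup would be:
+ #
+ #     result = retrieve_llm_prompt(use_prompt_gallery=True,
+ #                                  prompt_name='example_prompt',
+ #                                  prompt_gallery_path='./prompts')
+ #     print(result['prompt_text'], result['metadata'])
+
+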
+ def retrieve_llm_prompt_with_inserted_variables(
+         prompt_name: str = None,
+         prompt_text: str = None,
+         use_prompt_gallery: bool = False,
+         prompt_gallery_path: str = None,
+         user_prompt_vars: Dict[str, str] = None) -> Dict[str, str]:
+     """
+     Retrieve a stored prompt template, check for any placeholder variables
+     (denoted {variable} in the prompt), and dynamically fill them in with
+     user-provided values
+     """
+     # Retrieve the prompt (format: {'prompt_text': <string>, 'metadata': <dict>})
+     prompt = retrieve_llm_prompt(
+         prompt_text=prompt_text,
+         use_prompt_gallery=use_prompt_gallery,
+         prompt_gallery_path=prompt_gallery_path,
+         prompt_name=prompt_name
+     )
+
+     # Find which variable(s) are in the prompt
+     text = prompt['prompt_text']
+     prompt_vars = re.findall(r'{(.*?)}', text)
+     if prompt_vars:
+         print(f'[INFO] Placeholder variables found in the prompt: {prompt_vars}')
+     else:
+         print('[INFO] No placeholder variables found in the prompt')
+
+     # If placeholders exist but no user variables were provided
+     if prompt_vars and not user_prompt_vars:
+         print('[WARNING] Prompt contains placeholder variables '
+               'but no user variables were provided.')
+         print(f'[WARNING] These placeholders still need values: {prompt_vars}')
+         return prompt
+
+     # If any placeholder in the prompt is not accounted for in the user-defined
+     # variables, raise an error rather than silently returning None
+     missing_vars = [var for var in prompt_vars if var not in user_prompt_vars]
+     if missing_vars:
+         raise KeyError(
+             f'[ERROR] Variables in the given prompt not defined by user: {missing_vars}'
+         )
+
+     # Replace prompt placeholder variables with the user-defined variables
+     if prompt_vars:
+         user_prompt_vars_clean = {
+             k: ', '.join(v) if isinstance(v, list) else v
+             for k, v in user_prompt_vars.items()
+         }
+         prompt['prompt_text'] = prompt['prompt_text'].format(**user_prompt_vars_clean)
+         print('[INFO] Prompt successfully retrieved and '
+               'placeholder variables replaced by user-defined values:')
+         for k, v in user_prompt_vars_clean.items():
+             print(f'[INFO] Placeholder:\t\t\t{k} \n[INFO] User value(s):\t\t{v}')
+         print('')
+     else:
+         print('[INFO] Prompt successfully retrieved\n')
+
+     return prompt
+
+
+ class LLMCostTracker:
+     def __init__(self, model_name):
+         """Cost tracker for LLM API usage"""
+         self.input_cost = 0
+         self.output_cost = 0
+         self.total_cost = 0
+         # Initialize the known per-one-million-token costs for this model
+         model_meta = llm_model_meta[model_name]
+         self.input_1M_token_cost = model_meta['cost_per_1M_token_input']
+         self.output_1M_token_cost = model_meta['cost_per_1M_token_output']
+
+     def update_cost(self, llm_output_meta):
+         """Track cumulative costs"""
+         # Calculate costs
+         input_tokens = llm_output_meta.usage.prompt_tokens
+         output_tokens = llm_output_meta.usage.completion_tokens
+         input_cost = self.input_1M_token_cost * input_tokens / 1e6
+         output_cost = self.output_1M_token_cost * output_tokens / 1e6
+
+         # Update costs
+         self.input_cost += input_cost
+         self.output_cost += output_cost
+         self.total_cost = self.input_cost + self.output_cost
+
+         # Add cumulative costs to a dict, handling the special case of costs
+         # below $0.01. (Double quotes keep the nested same-quote f-strings
+         # valid on Python 3.10/3.11, which predate PEP 701.)
+         tracker_output = {
+             'Input': f"${'<0.01' if self.input_cost < 0.01 else f'{self.input_cost:.2f}'}",
+             'Output': f"${'<0.01' if self.output_cost < 0.01 else f'{self.output_cost:.2f}'}",
+             'Total': f"${'<0.01' if self.total_cost < 0.01 else f'{self.total_cost:.2f}'}"
+         }
+         # logging.info(tracker_output)  # Uncomment to log cumulative costs per row
+
+         return tracker_output
+
+     def summary(self):
+         return {
+             'input_cost': self.input_cost,
+             'output_cost': self.output_cost,
+             'total_cost': self.total_cost,
+         }
+
+
+ class LLM_wrapper:
+     def __init__(
+             self,
+             model_name: str,
+             endpoint: str = None,
+             entra_scope: str = None,
+             api_test_key: str = None,
+             env_abs_path: str = None):
+         """Set up the token provider and OpenAI client"""
+         # Set up the environment depending on what was read from the .env file
+         if (endpoint is None and
+                 entra_scope is None and
+                 api_test_key is None):
+             # Set up the environment
+             env = Env()
+             try:
+                 env.read_env()
+             except OSError:
+                 # Path(...) guards against env_abs_path being a plain string,
+                 # which has no .exists() method
+                 if env_abs_path is not None and Path(env_abs_path).exists():
+                     env.read_env(env_abs_path)
+                     print("Loaded .env from", env_abs_path)
+                 elif env_abs_path is None:
+                     print('[ERROR] No .env file found. Please specify an absolute path')
+                 else:
+                     print("[ERROR] No .env file found at", env_abs_path)
+                 sys.path.append('../')
+
+             endpoint = env.str('ENDPOINT', None)
+             entra_scope = env.str('ENTRA_SCOPE', None)
+             api_test_key = env.str("API_TEST_KEY", None)
+
+         self.API_TYPE = None
+
+         # Detect whether the public OpenAI API or the GPT4DFCI gateway is
+         # being used, based on which credential values are present
+         if endpoint and entra_scope:
+             # Detected Azure (GPT4DFCI) environment
+             print('[INFO] Detected Azure OpenAI (GPT4DFCI) configuration')
+             self.API_TYPE = "AZURE"
+             token_provider = get_bearer_token_provider(
+                 DefaultAzureCredential(),
+                 entra_scope
+             )
+             self.client = OpenAI(
+                 base_url=endpoint,
+                 api_key=token_provider,
+             )
+         elif api_test_key:
+             # Detected standard OpenAI environment
+             print('[INFO] Detected standard OpenAI configuration')
+             self.API_TYPE = 'OPENAI'
+             self.client = OpenAI(api_key=api_test_key)
+         else:
+             raise EnvironmentError(
+                 "No valid API credentials found. "
+                 "Please set ENDPOINT + ENTRA_SCOPE (for Azure) or "
+                 "API_TEST_KEY (for OpenAI Cloud)."
+             )
+
+         self.model_name = model_name
+
+     # Set up utility functions
+     # -------------------------------------------------------------------------
+     @staticmethod
+     def remove_strict_field(data: List[Dict[str, Any]]) \
+             -> List[Dict[str, Any]]:
+         """Remove unsupported "strict" fields in the schema (function dict)"""
+         for item in data:
+             item['function'].pop('strict', None)
+         return data
+
+     @staticmethod
+     def extract_name_value(data: List[Dict[str, Any]]) -> str:
+         """Extract the function name from the function dict"""
+         return data[0]['function']['name']
+
+     @staticmethod
+     def load_prompt(
+             use_prompt_gallery: bool = False,
+             prompt_gallery_path: str = None,
+             prompt_name: str = None,
+             prompt_text: str = None,
+             user_prompt_vars: Dict[str, str] = None,
+             return_metadata: bool = False) -> str:
+         """
+         Load a specific prompt from the centralized prompt gallery.
+         Print the metadata if desired
+         """
+         prompt = retrieve_llm_prompt_with_inserted_variables(
+             prompt_name=prompt_name,
+             prompt_text=prompt_text,
+             use_prompt_gallery=use_prompt_gallery,
+             prompt_gallery_path=prompt_gallery_path,
+             user_prompt_vars=user_prompt_vars
+         )
+         if return_metadata:
+             print('[INFO] Prompt Info...')
+             for key, value in prompt['metadata'].items():
+                 print(f'{key}: {value}')
+             print('')
+         return prompt['prompt_text']
+
+     # Set up the API interaction
+     # -------------------------------------------------------------------------
+     def openai_chat_completion_response(
+             self,
+             prompt: str,
+             input_text: str,
+             format_class,
+             cost_tracker: LLMCostTracker):
+         """Call the OpenAI API (Azure or public) with structured response parsing"""
+
+         # Set up a parameter set for the chat completion response.
+         # Parameters are added to this set based on API type or model type
+         chat_response_params = {
+             'model': self.model_name,
+             'messages': [{"role": "system", "content": prompt},
+                          {"role": "user", "content": input_text}],
+         }
+
+         # Set the temperature to 0 if using any model other than gpt-5
+         if 'gpt-5' not in self.model_name:
+             chat_response_params['temperature'] = 0.0
+
+         try:
+             # Use the chat response pathway for the new DFCI Azure API
+             if self.API_TYPE == 'AZURE':
+                 chat_response_params['response_format'] = format_class
+                 completion = self.client.beta.chat.completions.parse(
+                     **chat_response_params
+                 )
+                 return completion.choices[0].message.parsed, completion
+
+             # Use the chat response pathway for the public OpenAI API
+             elif self.API_TYPE == 'OPENAI':
+                 schema = [openai.pydantic_function_tool(format_class)]
+                 schema_clean = self.remove_strict_field(schema)
+                 function_name = self.extract_name_value(schema_clean)
+
+                 chat_response_params['tools'] = schema
+                 chat_response_params['tool_choice'] = {
+                     'type': 'function',
+                     'function': {'name': function_name}
+                 }
+
+                 # Allow only 3 retries in calling the API
+                 for attempt in range(3):
+                     completion = self.client.chat.completions.create(
+                         **chat_response_params
+                     )
+                     if completion:
+                         response = (completion.choices[0]
+                                     .message.tool_calls[0]
+                                     .function.arguments)
+                         return [json.loads(response), completion]
+
+         # Handle various errors. The specific connection and rate-limit errors
+         # are caught before the generic APIError they subclass; catching
+         # APIError first would make the other handlers unreachable.
+         except openai.APIConnectionError as e:
+             # Handle connection error here
+             print(f"Failed to connect to OpenAI API: {e}")
+         except openai.RateLimitError as e:
+             # Handle rate limit error (exponential backoff is recommended)
+             print(f"OpenAI API request exceeded rate limit: {e}")
+         except openai.APIError as e:
+             # Handle API error here, e.g. retry or log
+             print(f"OpenAI API returned an API Error: {e}")
+         except ValidationError as ve:
+             print(f"Pydantic validation error: {ve}")
+             raise
+
+     # Set up data handling functions
+     # -------------------------------------------------------------------------
+     def load_input_file(self, input_file: str,
+                         text_column: str,
+                         sample_mode: bool = False) \
+             -> pd.DataFrame:
+         """Load the input CSV file and validate its columns"""
+         print(f'[INFO] Reading input data from \n{input_file}\n')
+         df = pd.read_csv(input_file)
+
+         if text_column not in df.columns:
+             raise ValueError(f"Missing required col {text_column} in input file")
+
+         if sample_mode:
+             return df.head(10)
+         return df
+
+     @staticmethod
+     def flatten_data_old(data: Dict[str, Any]) -> pd.Series:
+         """
+         Recursively flatten dict data. This is the old version of the function
+         and remains for legacy purposes
+         """
+         flat = {}
+         for key, value in data.items():
+             if isinstance(value, dict):
+                 flat[f'{key}_documentation_llm'] = value.get('documentation', None)
+                 flat[f'{key}_text_llm'] = value.get('text', None)
+             else:
+                 flat[key] = value
+         return pd.Series(flat)
+
+     def flatten_data(self, data: dict) -> pd.Series:
+         """
+         Recursively flatten nested dicts (or Pydantic objects converted to dicts)
+         """
+         flattened_data = {}
+
+         def _flatten(prefix, value):
+             if isinstance(value, dict):
+                 for k, v in value.items():
+                     _flatten(f"{prefix}_{k}" if prefix else k, v)
+             elif isinstance(value, list):
+                 # Flatten lists by JSON-stringifying them
+                 flattened_data[prefix] = json.dumps(value)
+             else:
+                 flattened_data[prefix] = value
+
+         _flatten("", data)
+         return pd.Series(flattened_data)
+
+     # Set up the data processing pipeline
+     # -------------------------------------------------------------------------
+     def process_text_data(
+             self,
+             input_file_path,
+             text_column,
+             format_class,
+             use_prompt_gallery: bool = False,
+             prompt_gallery_path: str = None,
+             prompt_to_get: str = None,
+             prompt_text: str = None,
+             user_prompt_vars=None,
+             sample_mode: bool = False,
+             flatten: bool = True,
+             save_every: int = 10,
+             output_dir: str = '../tmp',
+             keep_checkpoints: bool = False,
+             resume: bool = True
+     ):
+         """
+         Process text data with the LLM and auto-generate unique output filenames.
+         """
+         # Log the start of the run
+         logging.info(f'[INFO] project_ryland version {__version__}')
+         logging.info('[INFO] New LLM generation run starting...')
+         logging.info(f'[INFO] Loading data from: {input_file_path}')
+         print(f'[INFO] Project Ryland: v{__version__}')
+
+         # Ensure the output dir exists
+         os.makedirs(output_dir, exist_ok=True)
+
+         # Generate the timestamped final output and checkpoint names
+         timestamp = datetime.now().strftime('%Y%m%d_%H%M')
+         base_prefix = f'{self.model_name}_{timestamp}'
+         checkpoint_path = os.path.join(output_dir, f'checkpoint_{base_prefix}.csv')
+         final_output_path = os.path.join(output_dir, f'final_{base_prefix}.csv')
+
+         print(f'[INFO] Output directory: {output_dir}')
+         print(f'[INFO] Checkpoint file: {checkpoint_path}')
+         print(f'[INFO] Final output: {final_output_path}')
+
+         # Set up checkpointing and prompts
+         prompt = self.load_prompt(
+             use_prompt_gallery=use_prompt_gallery,
+             prompt_gallery_path=prompt_gallery_path,
+             prompt_text=prompt_text,
+             prompt_name=prompt_to_get,
+             user_prompt_vars=user_prompt_vars,
+             return_metadata=True)
+
+         logging.info(f'[INFO] Prompt loaded: {prompt_to_get}')
+
+         # Check for existing checkpoint files to resume from, else start anew.
+         # Work only on rows without a generation yet
+         df = None
+         if resume:
+             existing_checkpoints = sorted(
+                 Path(output_dir).glob(f'checkpoint_{self.model_name}*.csv'),
+                 key=os.path.getmtime,
+                 reverse=True
+             )
+             if existing_checkpoints:
+                 latest = existing_checkpoints[0]
+                 print(f'[INFO] Resuming from checkpoint: {latest.name}')
+                 df = pd.read_csv(latest)
+                 if 'generation' not in df.columns:
+                     df['generation'] = None
+         if df is None:
+             df = self.load_input_file(input_file_path, text_column, sample_mode=sample_mode)
+             df['generation'] = None
+         df['generation'] = df['generation'].astype('object')
+
+         # Print/log checkpoint stats
+         unprocessed_df = df[df['generation'].isna()]
+         logging.info(
+             f"[INFO] CHECKPOINT: "
+             f"Total: {len(df)}, "
+             f"Processed: {len(df) - len(unprocessed_df)}, "
+             f"Remaining: {len(unprocessed_df)}"
+         )
+         print(f"[INFO] CHECKPOINT → "
+               f"Total: {len(df)}, "
+               f"Processed: {len(df) - len(unprocessed_df)}, "
+               f"Remaining: {len(unprocessed_df)}\n")
+
+         # Start the cost tracker and progress bar
+         now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+         start_time = datetime.now()
+         print(f'[INFO] Starting LLM API call ({now})')
+         cost_tracker = LLMCostTracker(self.model_name)
+         # Set up the progress bar
+         bar = tqdm(unprocessed_df.iterrows(),
+                    total=len(unprocessed_df),
+                    desc='Processing data')
+
+         # Row by row, generate the LLM response to the input data
+         for i, (idx, row) in enumerate(bar):
+             try:
+                 input_text = row[text_column]
+                 response, completion = self.openai_chat_completion_response(
+                     prompt,
+                     input_text,
+                     format_class,
+                     cost_tracker)
+
+                 if hasattr(response, "model_dump"):    # Pydantic v2
+                     df.at[idx, "generation"] = json.dumps(response.model_dump())
+                 elif hasattr(response, "dict"):        # Pydantic v1
+                     df.at[idx, "generation"] = json.dumps(response.dict())
+                 else:
+                     df.at[idx, "generation"] = json.dumps(response)
+
+                 # Add the costs to the progress bar
+                 bar.set_postfix(cost_tracker.update_cost(completion))
+
+             except Exception as e:
+                 tqdm.write(f'Error with row {idx} → Error: {e}')
+                 df.at[idx, 'generation'] = None
+
+             # Save checkpoints every X rows (user-specified)
+             if (i + 1) % save_every == 0 or i == len(unprocessed_df) - 1:
+                 with open(checkpoint_path, 'w') as f:
+                     df.to_csv(f, index=False)
+                 now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+                 logging.info(f'[INFO] Saved checkpoint at row {i+1}')
+                 # Uncomment to show the saved checkpoint in the console
+                 # tqdm.write(f'[INFO] {now} Saved checkpoint at row {i+1}')
+
+         # Log the final cost of LLM generation in the log file. summary() is
+         # used here because calling update_cost() again would re-add the last
+         # completion's tokens to the running totals
+         logging.info(f'[INFO] Cost: {cost_tracker.summary()}')
+
+         # Flatten the output generation data if desired
+         if flatten:
+             if self.API_TYPE == 'OPENAI':
+                 # Flatten once at the end
+                 flattened_df = df['generation'].apply(
+                     lambda x: self.flatten_data_old(x)
+                     if isinstance(x, dict)
+                     else pd.Series()
+                 )
+                 df = pd.concat([df, flattened_df], axis=1)
+
+             elif self.API_TYPE == 'AZURE':
+                 def _safe_flatten(x):
+                     if pd.isna(x) or x in ("None", "nan"):
+                         return pd.Series()
+                     try:
+                         # Convert stringified JSON back to a dict
+                         if isinstance(x, str):
+                             x = json.loads(x)
+                         return self.flatten_data(x)
+                     except Exception as e:
+                         print(f"Flattening error: {e}")
+                         return pd.Series()
+
+                 # Flatten once at the end
+                 flattened_df = df["generation"].apply(_safe_flatten)
+                 new_cols = [c for c in flattened_df.columns if c not in df.columns]
+                 if new_cols:
+                     df = pd.concat([df, flattened_df[new_cols]], axis=1)
+
+         # Save the final LLM output
+         df.to_csv(final_output_path, index=False)
+
+         # Display the completion time and print a success message
+         now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+         end_time = datetime.now()
+         duration = end_time - start_time
+         duration_minutes = duration.total_seconds() / 60
+         print(f'\n[SUCCESS] LLM generation run completed ({now} '
+               f'| Duration: {duration_minutes:.2f} min.)')
+         print(f'[SUCCESS] Final LLM output saved: {final_output_path}')
+         logging.info(f'[SUCCESS] LLM generation run completed '
+                      f'(Duration: {duration_minutes:.2f} min.)')
+         logging.info(f'[SUCCESS] Final LLM output saved: {final_output_path}')
+
+         # Get rid of old checkpoints
+         if not keep_checkpoints:
+             for f in glob.glob(os.path.join(
+                     output_dir, f'checkpoint_{self.model_name}*.csv')
+             ):
+                 try:
+                     os.remove(f)
+                     print(f'[CLEANUP] Deleted checkpoint(s): {f}')
+                     logging.info(f'[CLEANUP] Deleted checkpoint: {f}'
+                                  f'\n---------------------------------------'
+                                  f'---------------------------------------')
+                 except Exception as e:
+                     print(f'[WARNING] Could not delete checkpoint: {f}: {e}')
+         else:
+             print(f'[INFO] Keeping all checkpoints in {output_dir}')
+             logging.info(f'[INFO] Keeping all checkpoints in {output_dir}'
+                          f'\n-------------------------------------------'
+                          f'---------------------------------------')
+
+         return df
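To make the placeholder-substitution path above concrete, a short sketch (editor's illustration; the prompt string is invented for the example). Note that list values are joined with `', '` before insertion:

```python
prompt = retrieve_llm_prompt_with_inserted_variables(
    prompt_text='Summarize the note, focusing on: {topics}.',
    user_prompt_vars={'topics': ['pain', 'mobility', 'appetite']},
)
print(prompt['prompt_text'])
# -> Summarize the note, focusing on: pain, mobility, appetite.
```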
@@ -0,0 +1,65 @@
+ [build-system]
+ requires = ["flit_core >=3.2,<4"]
+ build-backend = "flit_core.buildapi"
+
+ [project]
+ name = "project_ryland"
+ version = "2.0.1"
+ description = "This project develops standardized tools for using LLMs in research studies aimed at improving patient care."
+ authors = [
+     {name = "Justin Vinh", email = "jvinh21@gmail.com"}
+ ]
+
+ readme = "README.md"
+ classifiers = [
+     "Programming Language :: Python :: 3",
+ ]
+
+ requires-python = ">=3.10"
+
+ dependencies = [
+     # Core scientific stack
+     "pandas>=2.0",
+     "numpy>=1.26",
+     "matplotlib>=3.9",
+     "scikit-learn>=1.5",
+     "lifelines>=0.28",
+     "tqdm>=4.66",
+     "numexpr>=2.10.2",
+
+     # Logging and utilities
+     "loguru>=0.7",
+     "orjson>=3.10",
+
+     # Configuration / environment parsing
+     "pyyaml>=6.0",
+     "environs>=9.5",
+
+     # Azure + OpenAI integration
+     "openai>=1.43",
+     "azure-identity>=1.17",
+     "azure-core>=1.30",
+
+     # Data validation / models
+     "pydantic>=2.6",
+
+     # Type hints, date parsing, etc.
+     "python-dateutil>=2.9",
+
+     # Optional: useful helpers
+     "requests>=2.31",
+ ]
+
+ [tool.ruff]
+ line-length = 99
+ src = ["project_ryland"]
+ include = ["pyproject.toml", "project_ryland_code/**/*.py"]
+
+ [tool.ruff.lint]
+ extend-select = ["I"]  # Add import sorting
+
+ [tool.ruff.lint.isort]
+ known-first-party = ["project_ryland_code"]
+ force-sort-within-sections = true
+
+