dev-laiser 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dev_laiser-0.2.0.dist-info/LICENSE +30 -0
- dev_laiser-0.2.0.dist-info/METADATA +147 -0
- dev_laiser-0.2.0.dist-info/RECORD +10 -0
- dev_laiser-0.2.0.dist-info/WHEEL +5 -0
- dev_laiser-0.2.0.dist-info/top_level.txt +1 -0
- laiser/__init__.py +0 -0
- laiser/llm_methods.py +382 -0
- laiser/params.py +60 -0
- laiser/skill_extractor.py +367 -0
- laiser/utils.py +122 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
Copyright (c) 2025, LAiSER.
|
|
2
|
+
All rights reserved.
|
|
3
|
+
|
|
4
|
+
Redistribution and use in source and binary forms, with or without
|
|
5
|
+
modification, are permitted provided that the following conditions are
|
|
6
|
+
met:
|
|
7
|
+
|
|
8
|
+
* Redistributions of source code must retain the above copyright
|
|
9
|
+
notice, this list of conditions and the following disclaimer.
|
|
10
|
+
|
|
11
|
+
* Redistributions in binary form must reproduce the above
|
|
12
|
+
copyright notice, this list of conditions and the following
|
|
13
|
+
disclaimer in the documentation and/or other materials provided
|
|
14
|
+
with the distribution.
|
|
15
|
+
|
|
16
|
+
* Neither the name of the LAiSER nor the names of any
|
|
17
|
+
contributors may be used to endorse or promote products derived
|
|
18
|
+
from this software without specific prior written permission.
|
|
19
|
+
|
|
20
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
21
|
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
22
|
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
23
|
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
24
|
+
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
25
|
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
26
|
+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
27
|
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
28
|
+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
29
|
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
30
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: dev-laiser
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: LAiSER (Leveraging Artificial Intelligence for Skill Extraction & Research) is a tool designed to help learners, educators, and employers extract and share trusted information about skills. It uses a fine-tuned language model to extract raw skill keywords from text, then aligns them with a predefined taxonomy. You can find more technical details in the project’s paper.md and an overview in the README.md.
|
|
5
|
+
Home-page: https://github.com/LAiSER-Software/extract-module
|
|
6
|
+
Author: Satya Phanindra Kumar Kalaga, Bharat Khandelwal, Prudhvi Chekuri
|
|
7
|
+
Author-email: phanindra.connect@gmail.com
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: BSD License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.6
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Requires-Dist: numpy
|
|
15
|
+
Requires-Dist: pandas
|
|
16
|
+
Requires-Dist: psutil
|
|
17
|
+
Requires-Dist: scikit_learn
|
|
18
|
+
Requires-Dist: skillNer
|
|
19
|
+
Requires-Dist: spacy
|
|
20
|
+
Requires-Dist: transformers
|
|
21
|
+
Requires-Dist: accelerate
|
|
22
|
+
Requires-Dist: bitsandbytes
|
|
23
|
+
Requires-Dist: datasets
|
|
24
|
+
Requires-Dist: huggingface_hub
|
|
25
|
+
Requires-Dist: peft
|
|
26
|
+
Requires-Dist: torch
|
|
27
|
+
Requires-Dist: trl
|
|
28
|
+
Requires-Dist: ipython
|
|
29
|
+
Requires-Dist: python-dotenv
|
|
30
|
+
Requires-Dist: vllm
|
|
31
|
+
Requires-Dist: tqdm
|
|
32
|
+
Dynamic: author
|
|
33
|
+
Dynamic: author-email
|
|
34
|
+
Dynamic: classifier
|
|
35
|
+
Dynamic: description
|
|
36
|
+
Dynamic: description-content-type
|
|
37
|
+
Dynamic: home-page
|
|
38
|
+
Dynamic: requires-dist
|
|
39
|
+
Dynamic: requires-python
|
|
40
|
+
Dynamic: summary
|
|
41
|
+
|
|
42
|
+
<div align="center">
|
|
43
|
+
<img src="https://i.imgur.com/XznvjNi.png" width="70%"/>
|
|
44
|
+
<h2>Leveraging ​Artificial ​Intelligence for ​Skill ​Extraction &​ Research (LAiSER)</h2>
|
|
45
|
+
</div>
|
|
46
|
+
|
|
47
|
+
### Contents
|
|
48
|
+
LAiSER is a tool that helps learners, educators and employers share trusted and mutually intelligible information about skills​.
|
|
49
|
+
|
|
50
|
+
- [About](#about)
|
|
51
|
+
- [Requirements](#requirements)
|
|
52
|
+
- [Setup and Installation](#setup-and-installation)
|
|
53
|
+
- [i. Download the repository](#i-download-the-repository)
|
|
54
|
+
- [ii. Install the dependencies](#ii-install-the-dependencies)
|
|
55
|
+
- [Usage](#usage)
|
|
56
|
+
- [Google Colab Setup(preferred)](#google-colab-setuppreferred)
|
|
57
|
+
- [Command Line Setup](#command-line-setup)
|
|
58
|
+
- [Funding](#funding)
|
|
59
|
+
- [Authors](#authors)
|
|
60
|
+
- [Partners](#partners)
|
|
61
|
+
<!-- - [Examples](#examples) -->
|
|
62
|
+
- [Funding](#funding)
|
|
63
|
+
- [Authors](#authors)
|
|
64
|
+
- [Partners](#partners)
|
|
65
|
+
|
|
66
|
+
## About
|
|
67
|
+
## Requirements
|
|
68
|
+
- Python version >= Python 3.12.
|
|
69
|
+
- A GPU with at least 15GB of video memory is essential for running this tool on large datasets.
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
## Setup and Installation
|
|
73
|
+
|
|
74
|
+
### i. Download the repository
|
|
75
|
+
Before proceeding to LAiSER, you'd want to follow the steps below to install the required dependencies:
|
|
76
|
+
- Clone the repository using
|
|
77
|
+
```shell
|
|
78
|
+
git clone https://github.com/Micah-Sanders/LAiSER.git
|
|
79
|
+
```
|
|
80
|
+
or download the [zip(link)](https://github.com/Micah-Sanders/LAiSER/archive/refs/heads/main.zip) file and extract it.
|
|
81
|
+
|
|
82
|
+
### ii. Install the dependencies
|
|
83
|
+
> [!NOTE]
|
|
84
|
+
> If you intend to use the Jupyter Notebook interface, you can skip this step as the dependencies will be installed separately in the Google Colab environment.
|
|
85
|
+
|
|
86
|
+
Install the required dependencies using the command below:
|
|
87
|
+
```shell
|
|
88
|
+
pip install -r requirements.txt
|
|
89
|
+
```
|
|
90
|
+
**NOTE**: Python 3.9 or later, *preferably 3.12*, is expected to be installed on your system. If you don't have Python installed, you can download it from [here](https://www.python.org/downloads/).
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
## Usage
|
|
94
|
+
|
|
95
|
+
As of now, LAiSER can be used as a command line tool or from a Jupyter notebook (Google Colab). The steps to set up the tool are as follows:
|
|
96
|
+
|
|
97
|
+
### Google Colab Setup(preferred)
|
|
98
|
+
LAiSER's Jupyter notebook is, currently, the fastest way to get started with the tool. You can access the notebook [here](https://github.com/LAiSER-Software/extract-module/blob/main/dev_space/Extract%20Function%20Colab%20Execution.ipynb).
|
|
99
|
+
|
|
100
|
+
- Once the notebook is imported in google colaboratory, connect to a GPU-accelerated runtime(T4 GPU) and run the cells in the notebook.
|
|
101
|
+
|
|
102
|
+
### Command Line Setup
|
|
103
|
+
To use LAiSER as a command line tool, follow the steps below:
|
|
104
|
+
|
|
105
|
+
- Navigate to the root directory of the repository and run the command below:
|
|
106
|
+
```shell
|
|
107
|
+
pip install dev-laiser
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
- Once the installation is complete, you can run the tool using the command below:
|
|
111
|
+
<!-- TODO: add an example of importing and initiating the skillExtractor class -->
|
|
112
|
+
```shell
|
|
113
|
+
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
<!-- > [!CAUTION]
|
|
117
|
+
> - If you encounter any `*.dll` file missing errors, make sure you downgrade the pytorch version to `2.2.2`.
|
|
118
|
+
```shell
|
|
119
|
+
pip install torch==2.2.2
|
|
120
|
+
``` -->
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
<!-- ## Examples -->
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
## Funding
|
|
127
|
+
<div align="center">
|
|
128
|
+
<img src="https://i.imgur.com/XtgngBz.png" width="100px"/>
|
|
129
|
+
<img src="https://i.imgur.com/a2SNYma.jpeg" width="130px"/>
|
|
130
|
+
</div>
|
|
131
|
+
|
|
132
|
+
## Authors
|
|
133
|
+
<a href="https://github.com/LAiSER-Software/extract-module/graphs/contributors">
|
|
134
|
+
<img src="https://contrib.rocks/image?repo=LAiSER-Software/extract-module" />
|
|
135
|
+
</a>
|
|
136
|
+
|
|
137
|
+
## Partners
|
|
138
|
+
<div align="center">
|
|
139
|
+
<img src="https://i.imgur.com/hMb5n6T.png" width="120px"/>
|
|
140
|
+
<img src="https://i.imgur.com/dxz2Udo.png" width="70px"/>
|
|
141
|
+
<img src="https://i.imgur.com/5O1EuFU.png" width="100px"/>
|
|
142
|
+
</div>
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
</br>
|
|
147
|
+
<!-- <p align='center'> <b> Made with Passion💖, Data Science📊, and a little magic!🪄 </b></p> -->
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
laiser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
laiser/llm_methods.py,sha256=6uMAdRwTDSxc0KIVlOIkN3sDs37EUcT-LJwaXp9qKwQ,15306
|
|
3
|
+
laiser/params.py,sha256=FuCMi_GYYFUQ3e_SL-5S-ieGtXFXrTxh2w9wBIg6s4A,2350
|
|
4
|
+
laiser/skill_extractor.py,sha256=wzonq0FQDw3bjTdxul2o8e1GD19wmYKR02dp4nr0T6g,15655
|
|
5
|
+
laiser/utils.py,sha256=AU2ClhkbG2GXygKEOc-C96iX_K-IM21cA2zyWPWWDe0,3881
|
|
6
|
+
dev_laiser-0.2.0.dist-info/LICENSE,sha256=yzvCtVuCrHO8861qLJbm1a2-XQmzh8VDXdcV-JXx1Pc,1546
|
|
7
|
+
dev_laiser-0.2.0.dist-info/METADATA,sha256=rhYE1utXzJ5uHhtfvmGAKqpzAHAokxAVeGvNDDId5X8,5431
|
|
8
|
+
dev_laiser-0.2.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
9
|
+
dev_laiser-0.2.0.dist-info/top_level.txt,sha256=uiCW59yq2qCbyBOTlTtRRhtCMLjPM91nSoELBVJ4QB0,7
|
|
10
|
+
dev_laiser-0.2.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
laiser
|
laiser/__init__.py
ADDED
|
File without changes
|
laiser/llm_methods.py
ADDED
|
@@ -0,0 +1,382 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Module Description:
|
|
3
|
+
-------------------
|
|
4
|
+
Class to extract skills from text and align them to existing taxonomy
|
|
5
|
+
|
|
6
|
+
Ownership:
|
|
7
|
+
----------
|
|
8
|
+
Project: Leveraging Artificial intelligence for Skills Extraction and Research (LAiSER)
|
|
9
|
+
Owner: George Washington University Institute of Public Policy
|
|
10
|
+
Program on Skills, Credentials and Workforce Policy
|
|
11
|
+
Media and Public Affairs Building
|
|
12
|
+
805 21st Street NW
|
|
13
|
+
Washington, DC 20052
|
|
14
|
+
PSCWP@gwu.edu
|
|
15
|
+
https://gwipp.gwu.edu/program-skills-credentials-workforce-policy-pscwp
|
|
16
|
+
|
|
17
|
+
License:
|
|
18
|
+
--------
|
|
19
|
+
Copyright 2024 George Washington University Institute of Public Policy
|
|
20
|
+
|
|
21
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
|
|
22
|
+
documentation files (the “Software”), to deal in the Software without restriction, including without limitation
|
|
23
|
+
the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
|
|
24
|
+
and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
25
|
+
|
|
26
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
|
|
27
|
+
Software.
|
|
28
|
+
|
|
29
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
|
30
|
+
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
|
31
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
|
|
32
|
+
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
Input Requirements:
|
|
36
|
+
-------------------
|
|
37
|
+
- All the libraries in the requirements.txt should be installed
|
|
38
|
+
|
|
39
|
+
Output/Return Format:
|
|
40
|
+
----------------------------
|
|
41
|
+
- List of extracted skills from text
|
|
42
|
+
|
|
43
|
+
"""
|
|
44
|
+
"""
|
|
45
|
+
Revision History:
|
|
46
|
+
-----------------
|
|
47
|
+
Rev No. Date Author Description
|
|
48
|
+
[1.0.0] 07/10/2024 Satya Phanindra K. Define all the LLM methods being used in the project
|
|
49
|
+
[1.0.1] 07/19/2024 Satya Phanindra K. Add descriptions to each method
|
|
50
|
+
[1.0.2] 11/24/2024 Prudhvi Chekuri Add support for skills extraction from syllabi data
|
|
51
|
+
[1.0.3] 11/25/2024 Satya Phanindra K. Add support for skills extraction from course outcomes data
|
|
52
|
+
|
|
53
|
+
TODO:
|
|
54
|
+
-----
|
|
55
|
+
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
import re
|
|
59
|
+
import torch
|
|
60
|
+
import numpy as np
|
|
61
|
+
from trl import SFTTrainer
|
|
62
|
+
from vllm import SamplingParams
|
|
63
|
+
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
|
64
|
+
|
|
65
|
+
# Release any cached GPU memory at import time, before the generation
# helpers in this module allocate model inputs on the device.
torch.cuda.empty_cache()
|
|
66
|
+
|
|
67
|
+
def fetch_model_output(response):
    """
    Extract skill keywords from a get_completion() model response.

    Parameters
    ----------
    response : str
        The model's response after processing the prompt. Contains special
        tags (<start_of_turn>model ... <eos>) marking the model's turn.

    Returns
    -------
    list
        Skills parsed from the response (one per line beginning with '-').
        Returns an empty list when the expected tags are not found.
    """
    # Find the content between the model start tag and the last <eos> tag.
    pattern = r'<start_of_turn>model\s*<eos>(.*?)<eos>\s*$'
    match = re.search(pattern, response, re.DOTALL)

    # Bug fix: the original fell through and implicitly returned None when
    # there was no match, which breaks callers that iterate over the result.
    if not match:
        return []

    content = match.group(1).strip()

    # Split the content by lines and filter out empty lines.
    lines = [line.strip() for line in content.split('\n') if line.strip()]

    # Skills are the lines prefixed with '-'.
    return [line[1:].strip() for line in lines if line.startswith('-')]
|
|
96
|
+
|
|
97
|
+
def get_completion_batch(queries: list, model, tokenizer, batch_size=2) -> list:
    """
    Run the model over a list of queries in fixed-size batches.

    Parameters
    ----------
    queries : list
        Texts to extract skills from.
    model : model
        The model to use for generating completions.
    tokenizer : tokenizer
        The tokenizer to use for encoding the queries.
    batch_size : int, optional
        Preferred batch size per generation call (default 2).

    Returns
    -------
    list
        One parsed skill list per query, in input order.
    """
    device = "cuda:0"
    results = []

    prompt_template = """
<start_of_turn>user
Name all the skills present in the following description in a single list. Response should be in English and have only the skills, no other information or words. Skills should be keywords, each being no more than 3 words.
Below text is the Description:

{query}
<end_of_turn>\n<start_of_turn>model
"""

    total_batches = (len(queries) - 1) // batch_size + 1
    for batch_idx, start in enumerate(range(0, len(queries), batch_size), start=1):
        chunk = queries[start:start + batch_size]
        rendered = [prompt_template.format(query=q) for q in chunk]

        encoded = tokenizer(rendered, return_tensors="pt", add_special_tokens=True, padding=True, truncation=True)
        model_inputs = encoded.to(device)

        with torch.no_grad():
            generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)

        for full_output in tokenizer.batch_decode(generated_ids, skip_special_tokens=False):
            # Keep only the text after the model-turn marker, then parse the
            # skill bullets out of it.
            # NOTE(review): fetch_model_output expects the full tag pair; it
            # presumably still matches because the model re-emits the tags —
            # confirm against real model output.
            tail = full_output.split("<start_of_turn>model<eos>")[-1].strip()
            results.append(fetch_model_output(tail))

        # Free GPU memory between batches.
        torch.cuda.empty_cache()

        print(f"Processed batch {batch_idx}/{total_batches}")

    return results
|
|
154
|
+
|
|
155
|
+
def get_completion(input_text, text_columns, input_type, model, tokenizer) -> str:
    """
    Get completion for a single query using the model.

    Parameters
    ----------
    input_text : pandas Series
        Row with text data (Job Description / Syllabus Description /
        Course Outcomes etc.) to get completions for using the model.
    text_columns : list
        Columns of input_text that contain the text data.
        (Caller convention: ['description'])
    input_type : str
        Type of input data - 'job_desc' or 'syllabus'.
    model : model
        The model to use for generating completions.
    tokenizer : tokenizer
        The tokenizer to use for encoding the queries.

    Returns
    -------
    list
        List of extracted skills from the text.

    Raises
    ------
    ValueError
        If input_type is not a supported value. (The original code fell
        through and raised NameError on the unbound 'prompt' variable.)
    """

    device = "cuda:0"

    if input_type == "job_desc":
        prompt_template = """
<start_of_turn>user
Name all the skills present in the following description in a single list. Response should be in English and have only the skills, no other information or words. Skills should be keywords, each being no more than 3 words.
Below text is the Description:

{query}
<end_of_turn>\n<start_of_turn>model
"""
        prompt = prompt_template.format(query=input_text[text_columns[0]])
    elif input_type == "syllabus":
        prompt_template = """
<start_of_turn>user
Name all the skills present in the following course details in a single list. Response should be in English and have only the skills, no other information or words. Skills should be keywords, each being no more than 3 words.

Course Description:
{description}


Learning Outcomes:
{learning_outcomes}

<end_of_turn>
<start_of_turn>model
"""
        prompt = prompt_template.format(description=input_text[text_columns[0]], learning_outcomes=input_text[text_columns[1]])
    else:
        raise ValueError(f"Unsupported input_type: {input_type!r}; expected 'job_desc' or 'syllabus'")

    encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    model_inputs = encodeds.to(device)

    # Bug fix: the original called model.generate() twice back-to-back and
    # discarded the first result, doubling GPU work for no benefit.
    with torch.no_grad():
        generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)

    decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=False)
    response = decoded.split("<start_of_turn>model<eos>")[-1].strip()
    processed_response = fetch_model_output(response)
    return (processed_response)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def parse_output_vllm(response):
    """
    Parse a vLLM response into structured skill records.

    Parameters
    ----------
    response : str
        Model output containing one or more '->'-delimited entries, each
        carrying Skill, Level, Knowledge Required and Task Abilities fields.

    Returns
    -------
    list
        One dict per entry with whichever of the keys 'Skill', 'Level',
        'Knowledge Required' and 'Task Abilities' could be parsed.
    """

    def _split_csv(raw):
        # Break a comma-separated field into trimmed, non-empty tokens.
        return [token.strip() for token in raw.split(',') if token.strip()]

    records = []

    # Entries are delimited by an optional '->' prefix; drop empty fragments
    # (e.g. the one before a leading '->').
    for chunk in response.split('->'):
        chunk = chunk.strip()
        if not chunk:
            continue

        record = {}

        skill = re.search(r"Skill:\s*([^,\n]+)", chunk)
        if skill:
            record['Skill'] = skill.group(1).strip()

        level = re.search(r"Level:\s*(\d+)", chunk)
        if level:
            record['Level'] = int(level.group(1).strip())

        # re.DOTALL lets these fields span multiple lines.
        knowledge = re.search(r"Knowledge Required:\s*(.*?)(?=\s*Task Abilities:|\s*$)", chunk, re.DOTALL)
        if knowledge:
            record['Knowledge Required'] = _split_csv(knowledge.group(1).strip())

        tasks = re.search(r"Task Abilities:\s*(.*?)(?=\s*$)", chunk, re.DOTALL)
        if tasks:
            record['Task Abilities'] = _split_csv(tasks.group(1).strip())

        records.append(record)

    return records
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def create_ksa_prompt(query, input_type, num_key_skills, num_key_kr, num_key_tas):
    """
    Create a structured prompt for the KSA (Knowledge, Skills, Abilities) extraction task.

    Parameters
    ----------
    query : dict-like
        Input row containing 'description' and, for syllabi, 'learning_outcomes'.
    input_type : str
        The type of input data - 'job_desc' or 'syllabi'.
    num_key_skills : int
        The number of key skills to extract.
    num_key_kr : str
        The number of key knowledge areas to extract (e.g., '3-5').
    num_key_tas : str
        The number of key task abilities to extract (e.g., '3-5').

    Returns
    -------
    str
        The formatted prompt for the KSA extraction task.
    """

    prompt_template = """user
**Objective:** Given a {input_desc}, complete the following tasks with structured outputs.

### Tasks:
1. **Skills Extraction:** Identify {num_key_skills} key skills mentioned in the {input_desc}.
   - Extract skill keywords or phrases of no more than three words.

2. **Skill Level Assignment:** Assign a proficiency level to each extracted skill based on the SCQF Level Descriptors (see below).

3. **Knowledge Required:** For each skill, list {num_key_kr} broad areas of understanding or expertise necessary to develop the skill.

4. **Task Abilities:** For each skill, list {num_key_tas} general tasks or capabilities enabled by the skill.

### Guidelines:
- **Skill Extraction:** Identify skills explicitly stated or implied through {input_desc}.
- **Skill Level Assignment:** Use the SCQF Level Descriptors to classify proficiency:
  - 1: Basic awareness of simple concepts.
  - 2: Limited operational understanding, guided application.
  - 3: Moderate knowledge, supervised application of techniques.
  - 4: Clear understanding, independent work in familiar contexts.
  - 5: Advanced knowledge, autonomous problem-solving.
  - 6: Specialized knowledge, critical analysis within defined areas.
  - 7: Advanced specialization, leadership in problem-solving.
  - 8: Expert knowledge, innovation in complex contexts.
  - 9: Highly specialized expertise, contributing original thought.
  - 10: Sustained mastery, influential in areas of specialization.
  - 11: Groundbreaking innovation, professional or academic mastery.
  - 12: Global expertise, leading advancements at the highest level.

- **Knowledge and Task Abilities:**
  - **Knowledge Required:** Broad areas, e.g., "data visualization techniques."
  - **Task Abilities:** General tasks or capabilities, e.g., "data analysis."
  - Each item in these two lists should be no more than three words.
  - Avoid overly specific or vague terms.

### Answer Format:
- Use this format strictly in the response:
  -> Skill: [Skill Name], Level: [1–12], Knowledge Required: [list], Task Abilities: [list].

{input_text}

**Response:** Provide only the requested structured information without additional explanations.


model
"""

    # Bug fix: the original ternary was inverted — 'syllabi' input was
    # labelled "job description" and everything else got the syllabus label.
    # Select the label and the input section together so they cannot drift
    # apart again.
    if input_type == "syllabi":
        input_desc = "course syllabus description and its learning outcomes"
        input_text = f"""### Input:\n**Course Description:** {query["description"]}\n**Learning Outcomes:** {query["learning_outcomes"]}"""
    else:
        input_desc = "job description"
        input_text = f"""### Input:\n{query["description"]}"""

    prompt = prompt_template.format(input_desc=input_desc, num_key_skills=num_key_skills, num_key_kr=num_key_kr, num_key_tas=num_key_tas, input_text=input_text)
    return prompt
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def vllm_batch_generate(llm, queries, input_type, batch_size=32, num_key_skills=5, num_key_kr='3-5', num_key_tas='3-5'):
    """
    Generate vLLM outputs for every row in *queries*, in batches.

    Parameters
    ----------
    llm : vllm.LLM
        The vLLM engine used for generation.
    queries : pandas DataFrame/Series
        Rows consumed positionally (via .iloc) by create_ksa_prompt.
    input_type : str
        'job_desc' or 'syllabi'; forwarded to create_ksa_prompt.
    batch_size : int, optional
        Number of prompts per llm.generate() call (default 32).
    num_key_skills, num_key_kr, num_key_tas : optional
        Extraction sizing hints forwarded to create_ksa_prompt.

    Returns
    -------
    list
        Raw vLLM generation outputs, one per query row, in input order.
    """
    sampling_params = SamplingParams(max_tokens=1000)
    outputs = []

    total = len(queries)
    for start in range(0, total, batch_size):
        stop = min(start + batch_size, total)
        batch_prompts = [
            create_ksa_prompt(queries.iloc[row], input_type, num_key_skills, num_key_kr, num_key_tas)
            for row in range(start, stop)
        ]
        outputs.extend(llm.generate(batch_prompts, sampling_params=sampling_params))

    return outputs
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def get_completion_vllm(input_text, text_columns, id_column, input_type, llm, batch_size=4) -> list:
    """
    Extract structured skill records for every row of *input_text* via vLLM.

    Parameters
    ----------
    input_text : pandas DataFrame
        Rows containing at least 'description' (and optionally
        'learning_outcomes') plus the identifier column.
    text_columns : list
        Accepted for interface compatibility but currently unused — the
        text columns are hard-coded to 'description'/'learning_outcomes'.
    id_column : str
        Column whose value is copied onto each parsed record.
    input_type : str
        'job_desc' or 'syllabi'; forwarded to the prompt builder.
    llm : vllm.LLM
        The vLLM engine used for generation.
    batch_size : int, optional
        Generation batch size (default 4).

    Returns
    -------
    list
        Parsed skill dicts, each annotated with its source row's id,
        description and (when present) learning outcomes.
    """
    raw_outputs = vllm_batch_generate(llm, input_text, input_type=input_type, batch_size=batch_size)

    # Hoisted out of the loop: the column set does not change per row.
    has_outcomes = 'learning_outcomes' in input_text.columns

    records = []
    for idx, generation in enumerate(raw_outputs):
        row = input_text.iloc[idx]
        parsed = parse_output_vllm(generation.outputs[0].text)
        for entry in parsed:
            entry[id_column] = row[id_column]
            entry['description'] = row['description']
            if has_outcomes:
                entry['learning_outcomes'] = row['learning_outcomes']
        records.extend(parsed)

    return records
|
laiser/params.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
|
|
2
|
+
"""
|
|
3
|
+
Module Description:
|
|
4
|
+
-------------------
|
|
5
|
+
A python file with global constants
|
|
6
|
+
|
|
7
|
+
Ownership:
|
|
8
|
+
----------
|
|
9
|
+
Project: Leveraging Artificial intelligence for Skills Extraction and Research (LAiSER)
|
|
10
|
+
Owner: George Washington University Institute of Public Policy
|
|
11
|
+
Program on Skills, Credentials and Workforce Policy
|
|
12
|
+
Media and Public Affairs Building
|
|
13
|
+
805 21st Street NW
|
|
14
|
+
Washington, DC 20052
|
|
15
|
+
PSCWP@gwu.edu
|
|
16
|
+
https://gwipp.gwu.edu/program-skills-credentials-workforce-policy-pscwp
|
|
17
|
+
|
|
18
|
+
License:
|
|
19
|
+
--------
|
|
20
|
+
Copyright 2024 George Washington University Institute of Public Policy
|
|
21
|
+
|
|
22
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
|
|
23
|
+
documentation files (the “Software”), to deal in the Software without restriction, including without limitation
|
|
24
|
+
the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
|
|
25
|
+
and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
26
|
+
|
|
27
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
|
|
28
|
+
Software.
|
|
29
|
+
|
|
30
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
|
31
|
+
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
|
32
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
|
|
33
|
+
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
34
|
+
|
|
35
|
+
"""
|
|
36
|
+
"""
|
|
37
|
+
Revision History
|
|
38
|
+
-----------------
|
|
39
|
+
Rev No. Date Author Description
|
|
40
|
+
[1.0.0] 06/01/2024 Vedant M. Initial Version
|
|
41
|
+
[1.0.1] 06/10/2024 Vedant M. added paths for input and output
|
|
42
|
+
[1.0.2] 07/01/2024 Satya Phanindra K. updated threshold for similarity and AI model ID
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
TODO:
|
|
46
|
+
-----
|
|
47
|
+
- 1:
|
|
48
|
+
"""
|
|
49
|
+
|
|
50
|
+
import os
|
|
51
|
+
from dotenv import load_dotenv
|
|
52
|
+
|
|
53
|
+
# Absolute path of the directory containing this file; the data paths
# below are resolved relative to it.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
# Default locations for input data and generated output.
INPUT_PATH = os.path.join(ROOT_DIR, 'input')
OUTPUT_PATH = os.path.join(ROOT_DIR, 'output')

# CSV file holding the combined skill taxonomy.
SKILL_DB_PATH = os.path.join(INPUT_PATH, 'combined.csv')


# Similarity cutoff for matching extracted skills to taxonomy entries
# (presumably cosine similarity — confirm in skill_extractor).
SIMILARITY_THRESHOLD = 0.85
|
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Module Description:
|
|
3
|
+
-------------------
|
|
4
|
+
Class to extract skills from text and align them to existing taxonomy
|
|
5
|
+
|
|
6
|
+
Ownership:
|
|
7
|
+
----------
|
|
8
|
+
Project: Leveraging Artificial intelligence for Skills Extraction and Research (LAiSER)
|
|
9
|
+
Owner: George Washington University Institute of Public Policy
|
|
10
|
+
Program on Skills, Credentials and Workforce Policy
|
|
11
|
+
Media and Public Affairs Building
|
|
12
|
+
805 21st Street NW
|
|
13
|
+
Washington, DC 20052
|
|
14
|
+
PSCWP@gwu.edu
|
|
15
|
+
https://gwipp.gwu.edu/program-skills-credentials-workforce-policy-pscwp
|
|
16
|
+
|
|
17
|
+
License:
|
|
18
|
+
--------
|
|
19
|
+
Copyright 2024 George Washington University Institute of Public Policy
|
|
20
|
+
|
|
21
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
|
|
22
|
+
documentation files (the “Software”), to deal in the Software without restriction, including without limitation
|
|
23
|
+
the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
|
|
24
|
+
and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
25
|
+
|
|
26
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
|
|
27
|
+
Software.
|
|
28
|
+
|
|
29
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
|
30
|
+
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
|
31
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
|
|
32
|
+
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
Input Requirements:
|
|
36
|
+
-------------------
|
|
37
|
+
- Pandas Dataframe with ID and Text Column
|
|
38
|
+
|
|
39
|
+
Output/Return Format:
|
|
40
|
+
----------------------------
|
|
41
|
+
- Pandas dataframe with below columns:
|
|
42
|
+
- "Research ID": text_id
|
|
43
|
+
- "Skill Name": Raw skill extracted,
|
|
44
|
+
- "Skill Tag": skill tag from taxonomy,
|
|
45
|
+
- "Correlation Coefficient": similarity_score
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
"""
|
|
49
|
+
"""
|
|
50
|
+
Revision History:
|
|
51
|
+
-----------------
|
|
52
|
+
Rev No. Date Author Description
|
|
53
|
+
[1.0.0] 05/30/2024 Vedant M. Initial Version
|
|
54
|
+
[1.0.1] 06/01/2024 Vedant M. Referencing utils.py and params.py
|
|
55
|
+
[1.0.2] 06/08/2024 Satya Phanindra K. Modify get_aligned_skills function to JSON output
|
|
56
|
+
[1.0.3] 06/10/2024 Vedant M. Updated functions extract_raw and align_skills for input and output
|
|
57
|
+
[1.0.4] 06/13/2024 Vedant M. Added function extractor to encapsulate both functions
|
|
58
|
+
[1.0.5] 06/15/2024 Satya Phanindra K. Replaced OpenAI API with HuggingFace API for skill extraction
|
|
59
|
+
[1.0.6] 06/20/2024 Satya Phanindra K. Added function to extract skills from text using Fine-Tuned Language Model's API
|
|
60
|
+
[1.0.7] 07/03/2024 Satya Phanindra K. Added CONDITIONAL GPU support for Fine-Tuned Language Model and error handling
|
|
61
|
+
[1.0.9] 07/08/2024 Satya Phanindra K. Added support for SkillNer model for skill extraction, if GPU not available
|
|
62
|
+
[1.0.8] 07/11/2024 Satya Phanindra K. Calculate cosine similarities in bulk for optimal performance.
|
|
63
|
+
[1.0.9] 07/15/2024 Satya Phanindra K. Error handling for empty list outputs from extract_raw function
|
|
64
|
+
[1.0.10] 11/24/2024 Prudhvi Chekuri Added support for skills extraction from syllabi data
|
|
65
|
+
[1.0.11] 03/12/2025 Satya Phanindra K. Update extractor function to handle syllabus data
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
TODO:
|
|
69
|
+
-----
|
|
70
|
+
- 1: Add references to utils and global parameter file
|
|
71
|
+
- 2: sort taxonomy inputs
|
|
72
|
+
- 3: include rsd_name instead of keywords from osn
|
|
73
|
+
- 4: Optimize the `align_skills` function.
|
|
74
|
+
"""
|
|
75
|
+
|
|
76
|
+
# native packages
|
|
77
|
+
import sys
|
|
78
|
+
import os
|
|
79
|
+
|
|
80
|
+
# installed packages
|
|
81
|
+
import spacy
|
|
82
|
+
import torch
|
|
83
|
+
import numpy as np
|
|
84
|
+
import pandas as pd
|
|
85
|
+
from vllm import LLM
|
|
86
|
+
from tqdm.auto import tqdm
|
|
87
|
+
from spacy.matcher import PhraseMatcher
|
|
88
|
+
from skillNer.general_params import SKILL_DB
|
|
89
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
90
|
+
from skillNer.skill_extractor_class import SkillExtractor
|
|
91
|
+
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
|
92
|
+
from scipy.spatial.distance import cdist
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# internal packages
|
|
96
|
+
from laiser.utils import get_embedding, cosine_similarity
|
|
97
|
+
from laiser.params import SIMILARITY_THRESHOLD, SKILL_DB_PATH
|
|
98
|
+
from laiser.llm_methods import get_completion, get_completion_vllm
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class Skill_Extractor:
    """
    Extract skills from text and align them to an existing taxonomy.

    Attributes
    ----------
    model_id : str or None
        HuggingFace model id used for the vLLM-backed extractor (GPU path).
    HF_TOKEN : str
        HuggingFace API token supplied by the caller (stored; not used
        directly inside this class).
    use_gpu : bool
        Whether the caller requested the GPU / LLM extraction path.
    nlp : spacy language pipeline
        Used for embeddings and, on the CPU path, by SkillNer.
    skill_db_df : pandas.DataFrame
        Taxonomy table loaded from SKILL_DB_PATH (expects a 'SkillLabel'
        column and, for align_skills, a 'SkillTag' column).
    skill_db_embeddings : numpy.ndarray
        Pre-computed embedding per taxonomy 'SkillLabel'.

    Methods
    -------
    extract_raw(input_text, text_columns, id_column, input_type)
        Extract raw skills from text (LLM on GPU, SkillNer on CPU).

    align_skills(raw_skills, document_id='0')
        Align a list of raw skills to the taxonomy.

    align_KSAs(extracted_df, id_column)
        Align LLM-extracted KSA rows to the taxonomy.

    extractor(data, id_column='Research ID', text_columns=["description"], input_type="job_desc")
        End-to-end extraction + alignment over a dataframe.
    """

    def __init__(self, AI_MODEL_ID, HF_TOKEN, use_gpu):
        """
        Initialize models and pre-compute taxonomy embeddings.

        Parameters
        ----------
        AI_MODEL_ID : str or None
            Model id for the vLLM backend; a default GPTQ-quantized Gemma
            model is used when falsy.
        HF_TOKEN : str
            HuggingFace token.
        use_gpu : bool
            Request the GPU/LLM path; SkillNer is used when CUDA is
            unavailable or use_gpu is False.
        """
        self.model_id = AI_MODEL_ID
        self.HF_TOKEN = HF_TOKEN
        self.use_gpu = use_gpu
        self.nlp = spacy.load("en_core_web_lg")
        self.skill_db_df = pd.read_csv(SKILL_DB_PATH)
        # Embed every taxonomy label once; reused by every alignment call.
        self.skill_db_embeddings = np.array(
            [get_embedding(self.nlp, label) for label in self.skill_db_df['SkillLabel']]
        )
        if torch.cuda.is_available() and self.use_gpu:
            print("GPU is available. Using GPU for Large Language model initialization...")
            torch.cuda.empty_cache()

            # GPTQ quantization reduces the model size and memory usage.
            if self.model_id:
                self.llm = LLM(model=self.model_id, dtype="float16", quantization='gptq')
            else:
                self.llm = LLM(model="marcsun13/gemma-2-9b-it-GPTQ", dtype="float16", quantization='gptq')
        else:
            print("GPU is not available. Using CPU for SkillNer model initialization...")
            self.ner_extractor = SkillExtractor(self.nlp, SKILL_DB, PhraseMatcher)

    def extract_raw(self, input_text, text_columns, id_column, input_type):
        """
        Extract skills from text using the Fine-Tuned Language Model's API
        (GPU path) or SkillNer annotations (CPU path).

        Parameters
        ----------
        input_text : pandas Series with text data
            Job advertisement / Job Description / Syllabus Description /
            Course Outcomes etc. (a single dataframe row on the CPU path).
        text_columns : list
            Name of the text columns in the dataset. Defaults to 'description'
        id_column : string
            Name of the id column in the dataset.
        input_type : string
            Type of input data: 'job_desc' or 'syllabus'.

        Returns
        -------
        list
            Extracted skills; empty when annotation failed or nothing matched.

        Notes
        -----
        More details on which (pre-trained) language model is fine-tuned can
        be found in llm_methods.py.
        """
        if torch.cuda.is_available() and self.use_gpu:
            # GPU is available: delegate extraction to the language model.
            extracted_skills_set = get_completion_vllm(input_text, text_columns, id_column, input_type, self.llm)
            torch.cuda.empty_cache()
        else:
            # GPU is not available: use the SkillNer model for extraction.
            extracted_skills_set = set()
            annotations = None
            try:
                if input_type == "job_desc":
                    # Single text column expected: take that column's value
                    # positionally (label-then-positional `[0]` indexing on a
                    # string-indexed Series is deprecated in pandas).
                    input_text = input_text[text_columns].iloc[0]
                elif input_type == "syllabus":
                    input_text = f"Course Description: {input_text[text_columns[0]]}\n\nLearning Outcomes: {input_text[text_columns[1]]}"

                annotations = self.ner_extractor.annotate(input_text)
            except ValueError as e:
                print(f"Skipping example, ValueError encountered: {e}")
            except Exception as e:
                print(f"Skipping example, An unexpected error occurred: {e}")

            # BUG FIX: if annotate() raised, `annotations` is still None and the
            # original code crashed with a TypeError right after printing the
            # "Skipping example" message. Skip cleanly instead.
            if annotations is not None:
                for item in annotations['results']['full_matches']:
                    extracted_skills_set.add(item['doc_node_value'])

                # ngram-scored (partial) matches as well
                for item in annotations['results']['ngram_scored']:
                    extracted_skills_set.add(item['doc_node_value'])

        return list(extracted_skills_set)

    def align_skills(self, raw_skills, document_id='0'):
        """
        Align the provided raw skills to the available taxonomy.

        Parameters
        ----------
        raw_skills : list
            List of skills extracted from Job Descriptions / Syllabus.
        document_id : string
            Identifier copied into every match record. Defaults to '0'.

        Returns
        -------
        list of dict
            One record per (raw skill, taxonomy entry) pair whose cosine
            similarity exceeds SIMILARITY_THRESHOLD:
            [
                {
                    "Research ID": document_id,
                    "Raw Skill": raw skill extracted,
                    "Skill Tag": taxonomy skill tag,
                    "Correlation Coefficient": similarity_score
                },
                ...
            ]
        """
        raw_skill_embeddings = np.array([get_embedding(self.nlp, skill) for skill in raw_skills])

        # Cosine similarities computed in bulk: 1 - cosine distance.
        similarities = 1 - cdist(raw_skill_embeddings, self.skill_db_embeddings, metric='cosine')

        matches = []
        for i, raw_skill in enumerate(raw_skills):
            skill_matches = np.where(similarities[i] > SIMILARITY_THRESHOLD)[0]
            for match in skill_matches:
                matches.append({
                    "Research ID": document_id,
                    "Raw Skill": raw_skill,
                    "Skill Tag": self.skill_db_df.iloc[match]['SkillTag'],
                    "Correlation Coefficient": similarities[i, match]
                })

        return matches

    def align_KSAs(self, extracted_df, id_column):
        """
        Align LLM-extracted KSA rows to the available taxonomy.

        Parameters
        ----------
        extracted_df : pandas dataframe
            Skills extracted from Job Descriptions / Syllabus; must contain the
            columns 'Skill', 'description', 'learning_outcomes', 'Level',
            'Knowledge Required', 'Task Abilities' and `id_column`.
        id_column : string
            Name of the id column in the dataset.

        Returns
        -------
        list of dict
            One record per (row, taxonomy entry) pair above the threshold:
            [
                {
                    "Research ID": text_id,
                    "Description": String,
                    "Learning Outcomes": List of Strings,
                    "Raw Skill": String,
                    "Level": int,
                    "Knowledge Required": String,
                    "Task Abilities": String,
                    "Skill Tag": String,
                    "Correlation Coefficient": float
                },
                ...
            ]
        """
        matches = []

        raw_skill_embeddings = np.array([get_embedding(self.nlp, skill) for skill in extracted_df['Skill']])

        # Cosine similarities computed in bulk: 1 - cosine distance.
        similarities = 1 - cdist(raw_skill_embeddings, self.skill_db_embeddings, metric='cosine')

        for i, raw_skill in tqdm(enumerate(extracted_df['Skill'])):
            skill_matches = np.where(similarities[i] > SIMILARITY_THRESHOLD)[0]
            for match in skill_matches:
                matches.append({
                    "Research ID": extracted_df.iloc[i][id_column],
                    "Description": extracted_df.iloc[i]['description'],
                    "Learning Outcomes": extracted_df.iloc[i]['learning_outcomes'],
                    "Raw Skill": raw_skill,
                    "Level": extracted_df.iloc[i]['Level'],
                    "Knowledge Required": extracted_df.iloc[i]['Knowledge Required'],
                    "Task Abilities": extracted_df.iloc[i]['Task Abilities'],
                    # NOTE(review): tag is synthesized from the taxonomy row
                    # index rather than looked up from skill_db_df — confirm
                    # this is the intended ESCO identifier scheme.
                    "Skill Tag": f"ESCO.{match}",
                    "Correlation Coefficient": similarities[i, match]
                })

        return matches

    def extractor(self, data, id_column='Research ID', text_columns=["description"], input_type="job_desc"):
        """
        Extract and align skills for every row of a text dataset.

        Parameters
        ----------
        data : pandas dataframe
            Dataset containing text id and actual text to extract skills.
        id_column : string
            Name of id column in the dataset. Defaults to 'Research ID'
        text_columns : list
            Name of the text columns in the dataset. Defaults to 'description'
        input_type : string
            Type of input data. Defaults to 'job_desc'

        Returns
        -------
        pandas.DataFrame
            One row per aligned skill with 'Research ID', 'Raw Skill',
            'Skill Tag' and 'Correlation Coefficient' (plus 'Description',
            'Level', 'Knowledge Required', 'Task Abilities' and, for syllabi,
            'Learning Outcomes' on the GPU path).
        """
        if torch.cuda.is_available() and self.use_gpu:
            # LLM path: extract KSAs for the whole dataframe in one call.
            KSAs = self.extract_raw(data, text_columns, id_column, input_type)

            extracted_df = pd.DataFrame(KSAs)
            if input_type != 'syllabus':
                # Non-syllabus inputs carry no learning outcomes.
                extracted_df["learning_outcomes"] = ''

            extracted_df = extracted_df[[id_column, "description", "learning_outcomes", "Skill", "Level", "Knowledge Required", "Task Abilities"]]
            matches = self.align_KSAs(extracted_df, id_column)

            extracted = pd.DataFrame(columns=['Research ID', 'Description', 'Learning Outcomes', 'Raw Skill', 'Level', 'Knowledge Required', 'Task Abilities', 'Skill Tag', 'Correlation Coefficient'])
            # pd.concat replaces the private DataFrame._append (public append
            # was removed in pandas 2.0).
            if matches:
                extracted = pd.concat([extracted, pd.DataFrame(matches)], ignore_index=True)

            if input_type != "syllabus":
                extracted.drop("Learning Outcomes", axis=1, inplace=True)

        else:
            # SkillNer path: extract and align row by row.
            extracted = pd.DataFrame(columns=['Research ID', 'Raw Skill', 'Skill Tag', 'Correlation Coefficient'])
            for _, row in data.iterrows():
                research_id = row[id_column]
                raw_skills = self.extract_raw(row, text_columns, id_column, input_type)
                if not raw_skills:
                    continue
                aligned_skills = self.align_skills(raw_skills, research_id)
                if aligned_skills:
                    extracted = pd.concat([extracted, pd.DataFrame(aligned_skills)], ignore_index=True)

        return extracted
|
laiser/utils.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Module Description:
|
|
3
|
+
-------------------
|
|
4
|
+
A Class with utility functions
|
|
5
|
+
|
|
6
|
+
Ownership:
|
|
7
|
+
----------
|
|
8
|
+
Project: Leveraging Artificial intelligence for Skills Extraction and Research (LAiSER)
|
|
9
|
+
Owner: George Washington University Institute of Public Policy
|
|
10
|
+
Program on Skills, Credentials and Workforce Policy
|
|
11
|
+
Media and Public Affairs Building
|
|
12
|
+
805 21st Street NW
|
|
13
|
+
Washington, DC 20052
|
|
14
|
+
PSCWP@gwu.edu
|
|
15
|
+
https://gwipp.gwu.edu/program-skills-credentials-workforce-policy-pscwp
|
|
16
|
+
|
|
17
|
+
License:
|
|
18
|
+
--------
|
|
19
|
+
Copyright 2024 George Washington University Institute of Public Policy
|
|
20
|
+
|
|
21
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
|
|
22
|
+
documentation files (the “Software”), to deal in the Software without restriction, including without limitation
|
|
23
|
+
the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
|
|
24
|
+
and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
25
|
+
|
|
26
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
|
|
27
|
+
Software.
|
|
28
|
+
|
|
29
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
|
30
|
+
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
|
31
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
|
|
32
|
+
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
"""
|
|
36
|
+
"""
|
|
37
|
+
Revision History:
|
|
38
|
+
-----------------
|
|
39
|
+
Rev No. Date Author Description
|
|
40
|
+
[1.0.0] 06/01/2024 Vedant M. Initial Version
|
|
41
|
+
[1.0.1] 06/10/2024 Vedant M. added logging function
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
TODO:
|
|
45
|
+
-----
|
|
46
|
+
- 1:
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
import numpy as np
|
|
50
|
+
import psutil
|
|
51
|
+
import logging
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def cosine_similarity(vec1, vec2):
    """
    Compute the cosine similarity between two vectors.

    Parameters
    ----------
    vec1, vec2 : numpy array of vectorized text

    Returns
    -------
    numeric value
        Cosine of the angle between the vectors; 0.0 when either vector
        has zero magnitude (avoids division by zero).
    """
    magnitude_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero-magnitude vector has no defined direction; report no similarity.
    return 0.0 if magnitude_product == 0.0 else np.dot(vec1, vec2) / magnitude_product
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def get_embedding(nlp, input_text):
    """
    Vectorize *input_text* with the given spaCy pipeline.

    Parameters
    ----------
    nlp : object of spacy nlp model
    input_text : text
        Text to be vectorized — usually a skill, extracted or referenced.

    Returns
    -------
    numpy array
        Mean of the token vectors, or a 300-dimensional zero vector when
        the text tokenizes to nothing.
    """
    tokens = nlp(input_text)
    if not len(tokens):
        # Empty document: fall back to an all-zero embedding.
        return np.zeros(300)
    token_vectors = [token.vector for token in tokens]
    return np.mean(token_vectors, axis=0)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def log_performance(function_name, start_time, end_time):
    """
    Log execution time and process resource usage for a function.

    Parameters
    ----------
    function_name : text
        Name of the function being measured.
    start_time : time
        Execution start time of the function.
    end_time : time
        Execution end time of the function.
    """
    elapsed_seconds = end_time - start_time
    current_process = psutil.Process()
    cpu_pct = current_process.cpu_percent()
    # Resident set size, converted from bytes to MB.
    memory_mb = current_process.memory_info().rss / (1024 ** 2)

    report_lines = [
        f"Function: {function_name}",
        f"Execution time: {elapsed_seconds:.2f} seconds",
        f"CPU usage: {cpu_pct:.2f}%",
        f"Memory usage: {memory_mb:.2f} MB",
        "-------------------------------",
    ]
    report = "\n".join(report_lines)
    logging.info(report)
    print(report)
|