themefinder 0.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of themefinder might be problematic. Click here for more details.

@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 i.AI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,173 @@
1
+ Metadata-Version: 2.3
2
+ Name: themefinder
3
+ Version: 0.0.3
4
+ Summary: A topic modelling Python package designed for analysing one-to-many question-answer data eg free-text survey responses.
5
+ License: MIT
6
+ Author: i.AI
7
+ Author-email: packages@cabinetoffice.gov.uk
8
+ Requires-Python: >=3.10,<3.13
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Classifier: Topic :: Text Processing :: Linguistic
18
+ Requires-Dist: boto3 (>=1.29,<2.0)
19
+ Requires-Dist: langchain
20
+ Requires-Dist: langchain-openai (==0.1.17)
21
+ Requires-Dist: langfuse (==2.29.1)
22
+ Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
23
+ Requires-Dist: pandas (>=2.2.2,<3.0.0)
24
+ Requires-Dist: pyarrow (>=15.0.0,<16.0.0)
25
+ Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
26
+ Requires-Dist: scikit-learn
27
+ Requires-Dist: toml (>=0.10.2,<0.11.0)
28
+ Project-URL: Documentation, https://i-dot-ai.github.io/themefinder/
29
+ Project-URL: Repository, https://github.com/i-dot-ai/themefinder/
30
+ Description-Content-Type: text/markdown
31
+
32
+ # ThemeFinder
33
+
34
+ ThemeFinder is a topic modelling Python package designed for analysing one-to-many question-answer data (e.g. survey responses and public consultations). See the [docs](https://i-dot-ai.github.io/themefinder/) for more info.
35
+
36
+ > [!IMPORTANT]
37
+ > Incubation project: This project is an incubation project; as such, we don't recommend using this for critical use cases yet. We are currently in a research stage, trialling the tool for case studies across the Civil Service. Find out more about our projects at https://ai.gov.uk/.
38
+
39
+
40
+ ## Quickstart
41
+
42
+ ### Install using your package manager of choice
43
+
44
+ For example `pip install themefinder` or `poetry add themefinder`.
45
+
46
+ ### Usage
47
+
48
+ ThemeFinder takes as input a [pandas DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) with two columns:
49
+ - `response_id`: A unique identifier for each response
50
+ - `response`: The free text survey response
51
+
52
+ ThemeFinder now supports a range of language models through structured outputs.
53
+
54
+ The function `find_themes` identifies common themes in responses and labels them; it also outputs results from intermediate steps in the theme-finding pipeline.
55
+
56
+ For this example, import the following Python packages into your virtual environment: `asyncio`, `pandas`, `langchain`. And import `themefinder` as described above.
57
+
58
+ If you are using environment variables (eg for API keys), you can use `python-dotenv` to read variables from a `.env` file.
59
+
60
+ If you are using an Azure OpenAI endpoint, you will need the following variables:
61
+
62
+ - `AZURE_OPENAI_API_KEY`
63
+ - `AZURE_OPENAI_ENDPOINT`
64
+ - `OPENAI_API_VERSION`
65
+ - `DEPLOYMENT_NAME`
66
+ - `AZURE_OPENAI_BASE_URL`
67
+
68
+ Otherwise you will need whichever variables [LangChain](https://www.langchain.com/) requires for your LLM of choice.
69
+
70
+ ```python
71
+ import asyncio
72
+ from dotenv import load_dotenv
73
+ import pandas as pd
74
+ from langchain_openai import AzureChatOpenAI
75
+ from themefinder import find_themes
76
+
77
+ # If needed, load LLM API settings from .env file
78
+ load_dotenv()
79
+
80
+ # Initialise your LLM of choice using langchain
81
+ llm = AzureChatOpenAI(
82
+ model="gpt-4o",
83
+ temperature=0,
84
+ )
85
+
86
+ # Set up your data
87
+ responses_df = pd.DataFrame({
88
+ "response_id": ["1", "2", "3", "4", "5"],
89
+ "response": ["I think it's awesome, I can use it for consultation analysis.",
90
+ "It's great.", "It's a good approach to topic modelling.", "I'm not sure, I need to trial it more.", "I don't like it so much."]
91
+ })
92
+
93
+ # Add your question
94
+ question = "What do you think of ThemeFinder?"
95
+
96
+ # Make the system prompt specific to your use case
97
+ system_prompt = "You are an AI evaluation tool analyzing survey responses about a Python package."
98
+
99
+ # Run the function to find themes, we use asyncio to query LLM endpoints asynchronously, so we need to await our function
100
+ async def main():
101
+     result = await find_themes(responses_df, llm, question, system_prompt=system_prompt)
102
+     print(result)
103
+
104
+ if __name__ == "__main__":
105
+     asyncio.run(main())
106
+ ```
107
+
108
+ ## ThemeFinder pipeline
109
+
110
+ ThemeFinder's pipeline consists of six stages (one of which is optional), each utilizing a specialized LLM prompt:
111
+
112
+ ### Sentiment analysis
113
+ - Analyses the emotional tone and position of each response using sentiment-focused prompts
114
+ - Provides structured sentiment categorisation based on LLM analysis
115
+
116
+ ### Theme generation
117
+ - Uses exploratory prompts to identify initial themes from response batches
118
+ - Groups related responses for better context through guided theme extraction
119
+
120
+ ### Theme condensation
121
+ - Employs comparative prompts to combine similar or overlapping themes
122
+ - Reduces redundancy in identified topics through systematic theme evaluation
123
+
124
+ ### Theme refinement
125
+ - Leverages standardisation prompts to normalise theme descriptions
126
+ - Creates clear, consistent theme definitions through structured refinement
127
+
128
+ ### Theme target alignment
129
+ - Optional step to consolidate themes down to a target number
130
+
131
+ ### Theme mapping
132
+ - Utilizes classification prompts to map individual responses to refined themes
133
+ - Supports multiple theme assignments per response through detailed analysis
134
+
135
+
136
+ The prompts used at each stage can be found in `src/themefinder/prompts/`.
137
+
138
+ The file `src/themefinder/core.py` contains the function `find_themes`, which runs the pipeline. It also contains functions for each individual stage.
139
+
140
+
141
+ **For more detail - see the docs: [https://i-dot-ai.github.io/themefinder/](https://i-dot-ai.github.io/themefinder/).**
142
+
143
+
144
+ ## Model Compatibility
145
+
146
+ ThemeFinder's structured output approach makes it compatible with a wide range of language models from various providers. This list is non-exhaustive, and other models may also work effectively:
147
+
148
+ ### OpenAI Models
149
+ - GPT-4, GPT-4o, GPT-4.1
150
+ - All Azure OpenAI deployments
151
+
152
+ ### Google Models
153
+ - Gemini series (1.5 Pro, 2.0 Pro, etc.)
154
+
155
+ ### Anthropic Models
156
+ - Claude series (Claude 3 Opus, Sonnet, Haiku, etc.)
157
+
158
+ ### Open Source Models
159
+ - Llama 2, Llama 3
160
+ - Mistral models (e.g., Mistral 7B, Mixtral)
161
+
162
+
163
+ ## License
164
+
165
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
166
+
167
+ The documentation is [© Crown copyright](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/) and available under the terms of the [Open Government 3.0 licence](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/).
168
+
169
+
170
+ ## Feedback
171
+
172
+ If you have feedback on this package, please fill in our [feedback form](https://forms.gle/85xUSMvxGzSSKQ499) or contact us with questions or feedback at packages@cabinetoffice.gov.uk.
173
+
@@ -0,0 +1,141 @@
1
+ # ThemeFinder
2
+
3
+ ThemeFinder is a topic modelling Python package designed for analysing one-to-many question-answer data (e.g. survey responses and public consultations). See the [docs](https://i-dot-ai.github.io/themefinder/) for more info.
4
+
5
+ > [!IMPORTANT]
6
+ > Incubation project: This project is an incubation project; as such, we don't recommend using this for critical use cases yet. We are currently in a research stage, trialling the tool for case studies across the Civil Service. Find out more about our projects at https://ai.gov.uk/.
7
+
8
+
9
+ ## Quickstart
10
+
11
+ ### Install using your package manager of choice
12
+
13
+ For example `pip install themefinder` or `poetry add themefinder`.
14
+
15
+ ### Usage
16
+
17
+ ThemeFinder takes as input a [pandas DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) with two columns:
18
+ - `response_id`: A unique identifier for each response
19
+ - `response`: The free text survey response
20
+
21
+ ThemeFinder now supports a range of language models through structured outputs.
22
+
23
+ The function `find_themes` identifies common themes in responses and labels them; it also outputs results from intermediate steps in the theme-finding pipeline.
24
+
25
+ For this example, import the following Python packages into your virtual environment: `asyncio`, `pandas`, `langchain`. And import `themefinder` as described above.
26
+
27
+ If you are using environment variables (eg for API keys), you can use `python-dotenv` to read variables from a `.env` file.
28
+
29
+ If you are using an Azure OpenAI endpoint, you will need the following variables:
30
+
31
+ - `AZURE_OPENAI_API_KEY`
32
+ - `AZURE_OPENAI_ENDPOINT`
33
+ - `OPENAI_API_VERSION`
34
+ - `DEPLOYMENT_NAME`
35
+ - `AZURE_OPENAI_BASE_URL`
36
+
37
+ Otherwise you will need whichever variables [LangChain](https://www.langchain.com/) requires for your LLM of choice.
38
+
39
+ ```python
40
+ import asyncio
41
+ from dotenv import load_dotenv
42
+ import pandas as pd
43
+ from langchain_openai import AzureChatOpenAI
44
+ from themefinder import find_themes
45
+
46
+ # If needed, load LLM API settings from .env file
47
+ load_dotenv()
48
+
49
+ # Initialise your LLM of choice using langchain
50
+ llm = AzureChatOpenAI(
51
+ model="gpt-4o",
52
+ temperature=0,
53
+ )
54
+
55
+ # Set up your data
56
+ responses_df = pd.DataFrame({
57
+ "response_id": ["1", "2", "3", "4", "5"],
58
+ "response": ["I think it's awesome, I can use it for consultation analysis.",
59
+ "It's great.", "It's a good approach to topic modelling.", "I'm not sure, I need to trial it more.", "I don't like it so much."]
60
+ })
61
+
62
+ # Add your question
63
+ question = "What do you think of ThemeFinder?"
64
+
65
+ # Make the system prompt specific to your use case
66
+ system_prompt = "You are an AI evaluation tool analyzing survey responses about a Python package."
67
+
68
+ # Run the function to find themes, we use asyncio to query LLM endpoints asynchronously, so we need to await our function
69
+ async def main():
70
+     result = await find_themes(responses_df, llm, question, system_prompt=system_prompt)
71
+     print(result)
72
+
73
+ if __name__ == "__main__":
74
+     asyncio.run(main())
75
+ ```
76
+
77
+ ## ThemeFinder pipeline
78
+
79
+ ThemeFinder's pipeline consists of six stages (one of which is optional), each utilizing a specialized LLM prompt:
80
+
81
+ ### Sentiment analysis
82
+ - Analyses the emotional tone and position of each response using sentiment-focused prompts
83
+ - Provides structured sentiment categorisation based on LLM analysis
84
+
85
+ ### Theme generation
86
+ - Uses exploratory prompts to identify initial themes from response batches
87
+ - Groups related responses for better context through guided theme extraction
88
+
89
+ ### Theme condensation
90
+ - Employs comparative prompts to combine similar or overlapping themes
91
+ - Reduces redundancy in identified topics through systematic theme evaluation
92
+
93
+ ### Theme refinement
94
+ - Leverages standardisation prompts to normalise theme descriptions
95
+ - Creates clear, consistent theme definitions through structured refinement
96
+
97
+ ### Theme target alignment
98
+ - Optional step to consolidate themes down to a target number
99
+
100
+ ### Theme mapping
101
+ - Utilizes classification prompts to map individual responses to refined themes
102
+ - Supports multiple theme assignments per response through detailed analysis
103
+
104
+
105
+ The prompts used at each stage can be found in `src/themefinder/prompts/`.
106
+
107
+ The file `src/themefinder/core.py` contains the function `find_themes`, which runs the pipeline. It also contains functions for each individual stage.
108
+
109
+
110
+ **For more detail - see the docs: [https://i-dot-ai.github.io/themefinder/](https://i-dot-ai.github.io/themefinder/).**
111
+
112
+
113
+ ## Model Compatibility
114
+
115
+ ThemeFinder's structured output approach makes it compatible with a wide range of language models from various providers. This list is non-exhaustive, and other models may also work effectively:
116
+
117
+ ### OpenAI Models
118
+ - GPT-4, GPT-4o, GPT-4.1
119
+ - All Azure OpenAI deployments
120
+
121
+ ### Google Models
122
+ - Gemini series (1.5 Pro, 2.0 Pro, etc.)
123
+
124
+ ### Anthropic Models
125
+ - Claude series (Claude 3 Opus, Sonnet, Haiku, etc.)
126
+
127
+ ### Open Source Models
128
+ - Llama 2, Llama 3
129
+ - Mistral models (e.g., Mistral 7B, Mixtral)
130
+
131
+
132
+ ## License
133
+
134
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
135
+
136
+ The documentation is [© Crown copyright](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/) and available under the terms of the [Open Government 3.0 licence](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/).
137
+
138
+
139
+ ## Feedback
140
+
141
+ If you have feedback on this package, please fill in our [feedback form](https://forms.gle/85xUSMvxGzSSKQ499) or contact us with questions or feedback at packages@cabinetoffice.gov.uk.
@@ -0,0 +1,49 @@
1
+ [tool.poetry]
2
+ name = "themefinder"
3
+ version = "0.0.3"
4
+ description = "A topic modelling Python package designed for analysing one-to-many question-answer data eg free-text survey responses."
5
+ authors = ["i.AI <packages@cabinetoffice.gov.uk>"]
6
+ packages = [{include = "themefinder", from = "src"}]
7
+ readme = "README.md"
8
+ license = "MIT"
9
+ repository = "https://github.com/i-dot-ai/themefinder/"
10
+ documentation = "https://i-dot-ai.github.io/themefinder/"
11
+ classifiers = [
12
+ "Intended Audience :: Developers",
13
+ "Intended Audience :: Science/Research",
14
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
15
+ "Topic :: Text Processing :: Linguistic",
16
+ ]
17
+
18
+
19
+ [tool.poetry.dependencies]
20
+ python = ">=3.10,<3.13"
21
+ langchain = "*"
22
+ langchain-openai = "0.1.17"
23
+ pandas = "^2.2.2"
24
+ python-dotenv = "^1.0.1"
25
+ langfuse = "2.29.1"
26
+ boto3 = "^1.29"
27
+ scikit-learn = "*"
28
+ openpyxl = "^3.1.5"
29
+ pyarrow = "^15.0.0"
30
+ toml = "^0.10.2"
31
+
32
+ [tool.poetry.group.dev.dependencies]
33
+ pytest = "*"
34
+ pytest-asyncio = "^0.24.0"
35
+ coverage = "^7.6.10"
36
+
37
+ [tool.poetry.group.docs.dependencies]
38
+ mkdocs = "^1.6.1"
39
+ mkdocstrings = {extras = ["python"], version = "^0.27.0"}
40
+ mkdocs-material = "^9.5.50"
41
+
42
+ [tool.pytest.ini_options]
43
+ pythonpath = "."
44
+ asyncio_mode = "auto"
45
+ asyncio_default_fixture_loop_scope = "function"
46
+
47
+ [build-system]
48
+ requires = ["poetry-core>=1.0.0"]
49
+ build-backend = "poetry.core.masonry.api"
@@ -0,0 +1,22 @@
1
+ from .core import (
2
+ find_themes,
3
+ sentiment_analysis,
4
+ theme_condensation,
5
+ theme_generation,
6
+ theme_mapping,
7
+ theme_refinement,
8
+ theme_target_alignment,
9
+ detail_detection,
10
+ )
11
+
12
+ __all__ = [
13
+ "find_themes",
14
+ "sentiment_analysis",
15
+ "theme_generation",
16
+ "theme_condensation",
17
+ "theme_refinement",
18
+ "theme_target_alignment",
19
+ "theme_mapping",
20
+ "detail_detection",
21
+ ]
22
+ __version__ = "0.1.0"