rag-citation 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rag_citation-0.0.1/LICENSE +21 -0
- rag_citation-0.0.1/PKG-INFO +205 -0
- rag_citation-0.0.1/README.md +191 -0
- rag_citation-0.0.1/rag_citation/__init__.py +4 -0
- rag_citation-0.0.1/rag_citation/base_model/__init__.py +4 -0
- rag_citation-0.0.1/rag_citation/base_model/embedding_model.py +49 -0
- rag_citation-0.0.1/rag_citation/base_model/spacy_model.py +41 -0
- rag_citation-0.0.1/rag_citation/cite_item.py +73 -0
- rag_citation-0.0.1/rag_citation/embedding/__init__.py +3 -0
- rag_citation-0.0.1/rag_citation/embedding/embedding_model.py +33 -0
- rag_citation-0.0.1/rag_citation/focus_word/__init__.py +4 -0
- rag_citation-0.0.1/rag_citation/focus_word/focus_word.py +53 -0
- rag_citation-0.0.1/rag_citation/focus_word/schema.py +19 -0
- rag_citation-0.0.1/rag_citation/inference.py +293 -0
- rag_citation-0.0.1/rag_citation/pair/__init__.py +3 -0
- rag_citation-0.0.1/rag_citation/pair/focus_word_in_cite_data.py +134 -0
- rag_citation-0.0.1/rag_citation/pair/generate_pair.py +131 -0
- rag_citation-0.0.1/rag_citation/pair/schema.py +28 -0
- rag_citation-0.0.1/rag_citation/schema.py +18 -0
- rag_citation-0.0.1/rag_citation/score/__init__.py +3 -0
- rag_citation-0.0.1/rag_citation/score/score.py +23 -0
- rag_citation-0.0.1/rag_citation.egg-info/PKG-INFO +205 -0
- rag_citation-0.0.1/rag_citation.egg-info/SOURCES.txt +27 -0
- rag_citation-0.0.1/rag_citation.egg-info/dependency_links.txt +1 -0
- rag_citation-0.0.1/rag_citation.egg-info/requires.txt +2 -0
- rag_citation-0.0.1/rag_citation.egg-info/top_level.txt +1 -0
- rag_citation-0.0.1/setup.cfg +4 -0
- rag_citation-0.0.1/setup.py +18 -0
- rag_citation-0.0.1/test/test.py +52 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Rahul Anand
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: rag_citation
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: RAG Citation is a project that combines Retrieval-Augmented Generation (RAG) with automatic citation generation. This tool is designed to enhance the credibility of AI-generated content by providing relevant citations for the information used in generating responses.
|
|
5
|
+
Home-page: https://github.com/rahulanand1103/rag-citation
|
|
6
|
+
Author: rahul anand
|
|
7
|
+
Author-email: rahulanand1103@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Requires-Python: >=3.8
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: spacy==3.7.5
|
|
13
|
+
Requires-Dist: sentence_transformers==3.0.1
|
|
14
|
+
|
|
15
|
+
# RAG Citation: Enhancing AI-Generated Content with Automatic Citations (A Non-LLM Approach)
|
|
16
|
+
|
|
17
|
+
## Project Overview
|
|
18
|
+
|
|
19
|
+
RAG Citation is a project that combines Retrieval-Augmented Generation (RAG) with automatic citation generation. This tool is designed to enhance the credibility of AI-generated content by providing relevant citations for the information used in generating responses.
|
|
20
|
+
|
|
21
|
+
## Key Features
|
|
22
|
+
|
|
23
|
+
- **Non-LLM Approach:** Utilizes efficient algorithms and NLP techniques for citation generation, making it fast and lightweight.
|
|
24
|
+
- **Semantic Search:** Identifies relevant source documents based on meaning and context rather than just keyword matching.
|
|
25
|
+
- **Named Entity Recognition:** Extracts and returns relevant named entities from LLM-generated answers, such as people, organizations, money and dates.
|
|
26
|
+
- **Flexible Integration:** Can be easily integrated into a RAG pipeline.
|
|
27
|
+
- **Hallucination (Beta):** This beta feature identifies instances where the LLM-generated answer contains entities like ["DATE", "MONEY", "CARDINAL", "ORDINAL", "QUANTITY", "TIME"], but these entities cannot be found within the context. If such a mismatch occurs, it flags the result as a potential hallucination.
|
|
28
|
+
|
|
29
|
+
## Quickstart
|
|
30
|
+
* <b>Langchain example</b>: [langchain.ipynb](https://github.com/rahulanand1103/rag-citation/blob/main/docs/examples/3.example-langchain.ipynb)
|
|
31
|
+
* <b>Embedchain example</b>: [embeddchain.ipynb](https://github.com/rahulanand1103/rag-citation/blob/main/docs/examples/2.example-embeddchain.ipynb)
|
|
32
|
+
|
|
33
|
+
To get started with `rag-citation`, install it using pip:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install rag-citation
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Here's a basic example demonstrating how to use the library:
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from rag_citation import CiteItem, Inference
|
|
43
|
+
import uuid
|
|
44
|
+
|
|
45
|
+
## Sample context from vectorDB or semantic search
|
|
46
|
+
documents = [
|
|
47
|
+
"Elon MuskCEO, Tesla$221.6B$439M (0.20%)Real Time Net Worthas of 8/6/24Reflects change since 5 pm ET of prior trading day. 1 in the world todayPhoto by Martin Schoeller for ForbesAbout Elon MuskElon Musk cofounded six companies, including electric car maker Tesla, rocket producer SpaceX and tunneling startup Boring Company.He owns about 12% of Tesla excluding options, but has pledged more than half his shares as collateral for personal loans of up to $3.5 billion.In early 2024, a Delaware judge voided Musk's 2018 deal to receive options equaling an additional 9% of Tesla.",
|
|
48
|
+
"people in the world; as of August 2024[update], Forbes estimates his net worth to be US$241 billion.[3] Musk was born in Pretoria to model Maye and businessman and engineer Errol Musk, and briefly attended the University of Pretoria before immigrating to Canada at age 18, acquiring citizenship through his Canadian-born mother. Two years later, he matriculated at Queen's University at Kingston in Canada. Musk later transferred to the University of Pennsylvania and received bachelor's degrees in economics and physics."
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
## Example answer generated by an LLM
|
|
52
|
+
answer = "Elon Musk's net worth is estimated to be US$241 billion as of August 2024."
|
|
53
|
+
|
|
54
|
+
## Helper function to generate a UUID
|
|
55
|
+
def generate_uuid():
|
|
56
|
+
return str(uuid.uuid4())
|
|
57
|
+
|
|
58
|
+
## Helper function to create context in the correct format
|
|
59
|
+
def format_document(documents):
|
|
60
|
+
context = []
|
|
61
|
+
for document in documents:
|
|
62
|
+
context.append(
|
|
63
|
+
{
|
|
64
|
+
"source_id": generate_uuid(),
|
|
65
|
+
"document": document,
|
|
66
|
+
"meta": [{"meta-data": "some-info"}],
|
|
67
|
+
}
|
|
68
|
+
)
|
|
69
|
+
return context
|
|
70
|
+
|
|
71
|
+
context = format_document(documents)
|
|
72
|
+
cite_item = CiteItem(answer=answer, context=context)
|
|
73
|
+
|
|
74
|
+
## Initialize the Inference
|
|
75
|
+
inference = Inference(spacy_model="sm", embedding_model="md")
|
|
76
|
+
|
|
77
|
+
## Get citation and other information
|
|
78
|
+
output = inference(cite_item)
|
|
79
|
+
|
|
80
|
+
print("------ Citation ------")
|
|
81
|
+
print(output.citation)
|
|
82
|
+
print("------ Hallucination ------")
|
|
83
|
+
print(output.hallucination)
|
|
84
|
+
print("------ Missing Entities ------")
|
|
85
|
+
print(output.missing)
|
|
86
|
+
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Output Explanation
|
|
90
|
+
|
|
91
|
+
### `print(output.citation)`
|
|
92
|
+
```
|
|
93
|
+
[
|
|
94
|
+
{
|
|
95
|
+
"answer_sentences": "Elon Musk's net worth is estimated to be US$241 billion as of August 2024.",
|
|
96
|
+
"cite_document": [
|
|
97
|
+
{
|
|
98
|
+
"document": "people in the world; as of August 2024[update], Forbes estimates his net worth to be US$241 billion.[3]",
|
|
99
|
+
"source_id": "23d1f1f0-2afa-4749-8639-78ec685fd837",
|
|
100
|
+
"entity": [
|
|
101
|
+
{
|
|
102
|
+
"word": "US$241 billion",
|
|
103
|
+
"entity_name": "MONEY"
|
|
104
|
+
},
|
|
105
|
+
{
|
|
106
|
+
"word": "August 2024",
|
|
107
|
+
"entity_name": "DATE"
|
|
108
|
+
}
|
|
109
|
+
],
|
|
110
|
+
"meta": [
|
|
111
|
+
{
|
|
112
|
+
"url": "https://www.forbes.com/profile/elon-musk/",
|
|
113
|
+
"chunk_id": "1eab8dd1ffa92906f7fc839862871ca5"
|
|
114
|
+
}
|
|
115
|
+
]
|
|
116
|
+
}
|
|
117
|
+
]
|
|
118
|
+
}
|
|
119
|
+
]
|
|
120
|
+
```
|
|
121
|
+
| Key | Description | Example |
|
|
122
|
+
|--------------------|-------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------|
|
|
123
|
+
| `answer_sentences` | Textual information or sentences extracted as answers or relevant information related to the citation.| `"Elon Musk's net worth is estimated to be US$241 billion as of August 2024."` |
|
|
124
|
+
| `cite_document` | List of source documents used in the citation. Each document contains: | |
|
|
125
|
+
| | - `document`: Text from the source document. | `"people in the world; as of August 2024[update], Forbes estimates his net worth to be US$241 billion.[3]"`|
|
|
126
|
+
| | - `source_id`: Unique identifier for the source document. | `"6874d990-fedc-42bd-b0be-730bcdd59d26"` |
|
|
127
|
+
| | - `entity`: List of recognized entities in the document. Each entity contains: | |
|
|
128
|
+
| | - `word`: Recognized word or phrase. | `"US$241 billion"` |
|
|
129
|
+
| | - `entity_name`: Type of the entity (e.g., `MONEY`, `DATE`). | `"MONEY"` |
|
|
130
|
+
| | - `meta`: Metadata about the document. | `[]` |
|
|
131
|
+
|
|
132
|
+
### `print(output.hallucination)`
|
|
133
|
+
`False`
|
|
134
|
+
|
|
135
|
+
| Key | Description | Example |
|
|
136
|
+
|------------------|---------------------------------------------------------------------|---------|
|
|
137
|
+
| `hallucination` | Indicates if the output contains hallucinated information. | `false` |
|
|
138
|
+
|
|
139
|
+
### `print(output.missing)`
|
|
140
|
+
` [] `
|
|
141
|
+
|
|
142
|
+
| Key | Description | Example |
|
|
143
|
+
|---------------|---------------------------------------------|---------------------|
|
|
144
|
+
| `missing` | List of entities expected but not found. | `["$100 USD"]` |
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
## Installation
|
|
148
|
+
|
|
149
|
+
**From PyPI:**
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
pip install rag-citation
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
**From Source:**
|
|
156
|
+
|
|
157
|
+
1. Clone the repository:
|
|
158
|
+
```bash
|
|
159
|
+
git clone https://github.com/rahulanand1103/rag-citation.git
|
|
160
|
+
cd rag-citation
|
|
161
|
+
```
|
|
162
|
+
2. Install the dependencies:
|
|
163
|
+
```bash
|
|
164
|
+
pip install -r requirements.txt
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## Configuration
|
|
168
|
+
|
|
169
|
+
The `Inference` class can be configured with different models and settings:
|
|
170
|
+
|
|
171
|
+
- **`spacy_model`:** The spaCy model used for named entity recognition (default: `"en_core_web_sm"`). To use different models, pass:
|
|
172
|
+
- `"sm"` for `en_core_web_sm`
|
|
173
|
+
- `"md"` for `en_core_web_md`
|
|
174
|
+
- `"lg"` for `en_core_web_lg`
|
|
175
|
+
You can download and install spaCy models [here](https://spacy.io/models).
|
|
176
|
+
|
|
177
|
+
- **`embedding_model`:** The sentence embedding model from the SentenceTransformers library used for semantic similarity (default: `"all-mpnet-base-v2"`). To use different models, pass:
|
|
178
|
+
- `"sm"` for `avsolatorio/GIST-small-Embedding-v0`
|
|
179
|
+
- `"md"` for `avsolatorio/GIST-Embedding-v0`
|
|
180
|
+
- `"lg"` for `avsolatorio/GIST-large-Embedding-v0`
|
|
181
|
+
Install SentenceTransformers with: `pip install -U sentence-transformers`
|
|
182
|
+
You can explore the models on [Hugging Face](https://huggingface.co/avsolatorio/GIST-Embedding-v0).
|
|
183
|
+
|
|
184
|
+
- **`threshold`:** The similarity threshold value for semantic matching (current default: `0.88`). You can adjust this value as needed.
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
## Contributing
|
|
188
|
+
|
|
189
|
+
We welcome contributions! Here’s how you can help:
|
|
190
|
+
|
|
191
|
+
- **Report Bugs:** Submit issues on GitHub.
|
|
192
|
+
- **Suggest Features:** Open an issue with your ideas.
|
|
193
|
+
- **Code Contributions:** Fork, make changes, and submit a pull request.
|
|
194
|
+
- **Documentation:** Update and enhance our docs.
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
## License
|
|
198
|
+
|
|
199
|
+
This project is licensed under the [MIT License](LICENSE).
|
|
200
|
+
|
|
201
|
+
## Acknowledgements
|
|
202
|
+
|
|
203
|
+
- [SpaCy](https://spacy.io/)
|
|
204
|
+
- [SentenceTransformers](https://www.sbert.net/)
|
|
205
|
+
- [Huggingface/avsolatorio](https://huggingface.co/avsolatorio)
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
# RAG Citation: Enhancing AI-Generated Content with Automatic Citations (A Non-LLM Approach)
|
|
2
|
+
|
|
3
|
+
## Project Overview
|
|
4
|
+
|
|
5
|
+
RAG Citation is a project that combines Retrieval-Augmented Generation (RAG) with automatic citation generation. This tool is designed to enhance the credibility of AI-generated content by providing relevant citations for the information used in generating responses.
|
|
6
|
+
|
|
7
|
+
## Key Features
|
|
8
|
+
|
|
9
|
+
- **Non-LLM Approach:** Utilizes efficient algorithms and NLP techniques for citation generation, making it fast and lightweight.
|
|
10
|
+
- **Semantic Search:** Identifies relevant source documents based on meaning and context rather than just keyword matching.
|
|
11
|
+
- **Named Entity Recognition:** Extracts and returns relevant named entities from LLM-generated answers, such as people, organizations, money and dates.
|
|
12
|
+
- **Flexible Integration:** Can be easily integrated into a RAG pipeline.
|
|
13
|
+
- **Hallucination (Beta):** This beta feature identifies instances where the LLM-generated answer contains entities like ["DATE", "MONEY", "CARDINAL", "ORDINAL", "QUANTITY", "TIME"], but these entities cannot be found within the context. If such a mismatch occurs, it flags the result as a potential hallucination.
|
|
14
|
+
|
|
15
|
+
## Quickstart
|
|
16
|
+
* <b>Langchain example</b>: [langchain.ipynb](https://github.com/rahulanand1103/rag-citation/blob/main/docs/examples/3.example-langchain.ipynb)
|
|
17
|
+
* <b>Embedchain example</b>: [embeddchain.ipynb](https://github.com/rahulanand1103/rag-citation/blob/main/docs/examples/2.example-embeddchain.ipynb)
|
|
18
|
+
|
|
19
|
+
To get started with `rag-citation`, install it using pip:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install rag-citation
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Here's a basic example demonstrating how to use the library:
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from rag_citation import CiteItem, Inference
|
|
29
|
+
import uuid
|
|
30
|
+
|
|
31
|
+
## Sample context from vectorDB or semantic search
|
|
32
|
+
documents = [
|
|
33
|
+
"Elon MuskCEO, Tesla$221.6B$439M (0.20%)Real Time Net Worthas of 8/6/24Reflects change since 5 pm ET of prior trading day. 1 in the world todayPhoto by Martin Schoeller for ForbesAbout Elon MuskElon Musk cofounded six companies, including electric car maker Tesla, rocket producer SpaceX and tunneling startup Boring Company.He owns about 12% of Tesla excluding options, but has pledged more than half his shares as collateral for personal loans of up to $3.5 billion.In early 2024, a Delaware judge voided Musk's 2018 deal to receive options equaling an additional 9% of Tesla.",
|
|
34
|
+
"people in the world; as of August 2024[update], Forbes estimates his net worth to be US$241 billion.[3] Musk was born in Pretoria to model Maye and businessman and engineer Errol Musk, and briefly attended the University of Pretoria before immigrating to Canada at age 18, acquiring citizenship through his Canadian-born mother. Two years later, he matriculated at Queen's University at Kingston in Canada. Musk later transferred to the University of Pennsylvania and received bachelor's degrees in economics and physics."
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
## Example answer generated by an LLM
|
|
38
|
+
answer = "Elon Musk's net worth is estimated to be US$241 billion as of August 2024."
|
|
39
|
+
|
|
40
|
+
## Helper function to generate a UUID
|
|
41
|
+
def generate_uuid():
|
|
42
|
+
return str(uuid.uuid4())
|
|
43
|
+
|
|
44
|
+
## Helper function to create context in the correct format
|
|
45
|
+
def format_document(documents):
|
|
46
|
+
context = []
|
|
47
|
+
for document in documents:
|
|
48
|
+
context.append(
|
|
49
|
+
{
|
|
50
|
+
"source_id": generate_uuid(),
|
|
51
|
+
"document": document,
|
|
52
|
+
"meta": [{"meta-data": "some-info"}],
|
|
53
|
+
}
|
|
54
|
+
)
|
|
55
|
+
return context
|
|
56
|
+
|
|
57
|
+
context = format_document(documents)
|
|
58
|
+
cite_item = CiteItem(answer=answer, context=context)
|
|
59
|
+
|
|
60
|
+
## Initialize the Inference
|
|
61
|
+
inference = Inference(spacy_model="sm", embedding_model="md")
|
|
62
|
+
|
|
63
|
+
## Get citation and other information
|
|
64
|
+
output = inference(cite_item)
|
|
65
|
+
|
|
66
|
+
print("------ Citation ------")
|
|
67
|
+
print(output.citation)
|
|
68
|
+
print("------ Hallucination ------")
|
|
69
|
+
print(output.hallucination)
|
|
70
|
+
print("------ Missing Entities ------")
|
|
71
|
+
print(output.missing)
|
|
72
|
+
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Output Explanation
|
|
76
|
+
|
|
77
|
+
### `print(output.citation)`
|
|
78
|
+
```
|
|
79
|
+
[
|
|
80
|
+
{
|
|
81
|
+
"answer_sentences": "Elon Musk's net worth is estimated to be US$241 billion as of August 2024.",
|
|
82
|
+
"cite_document": [
|
|
83
|
+
{
|
|
84
|
+
"document": "people in the world; as of August 2024[update], Forbes estimates his net worth to be US$241 billion.[3]",
|
|
85
|
+
"source_id": "23d1f1f0-2afa-4749-8639-78ec685fd837",
|
|
86
|
+
"entity": [
|
|
87
|
+
{
|
|
88
|
+
"word": "US$241 billion",
|
|
89
|
+
"entity_name": "MONEY"
|
|
90
|
+
},
|
|
91
|
+
{
|
|
92
|
+
"word": "August 2024",
|
|
93
|
+
"entity_name": "DATE"
|
|
94
|
+
}
|
|
95
|
+
],
|
|
96
|
+
"meta": [
|
|
97
|
+
{
|
|
98
|
+
"url": "https://www.forbes.com/profile/elon-musk/",
|
|
99
|
+
"chunk_id": "1eab8dd1ffa92906f7fc839862871ca5"
|
|
100
|
+
}
|
|
101
|
+
]
|
|
102
|
+
}
|
|
103
|
+
]
|
|
104
|
+
}
|
|
105
|
+
]
|
|
106
|
+
```
|
|
107
|
+
| Key | Description | Example |
|
|
108
|
+
|--------------------|-------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------|
|
|
109
|
+
| `answer_sentences` | Textual information or sentences extracted as answers or relevant information related to the citation.| `"Elon Musk's net worth is estimated to be US$241 billion as of August 2024."` |
|
|
110
|
+
| `cite_document` | List of source documents used in the citation. Each document contains: | |
|
|
111
|
+
| | - `document`: Text from the source document. | `"people in the world; as of August 2024[update], Forbes estimates his net worth to be US$241 billion.[3]"`|
|
|
112
|
+
| | - `source_id`: Unique identifier for the source document. | `"6874d990-fedc-42bd-b0be-730bcdd59d26"` |
|
|
113
|
+
| | - `entity`: List of recognized entities in the document. Each entity contains: | |
|
|
114
|
+
| | - `word`: Recognized word or phrase. | `"US$241 billion"` |
|
|
115
|
+
| | - `entity_name`: Type of the entity (e.g., `MONEY`, `DATE`). | `"MONEY"` |
|
|
116
|
+
| | - `meta`: Metadata about the document. | `[]` |
|
|
117
|
+
|
|
118
|
+
### `print(output.hallucination)`
|
|
119
|
+
`False`
|
|
120
|
+
|
|
121
|
+
| Key | Description | Example |
|
|
122
|
+
|------------------|---------------------------------------------------------------------|---------|
|
|
123
|
+
| `hallucination` | Indicates if the output contains hallucinated information. | `false` |
|
|
124
|
+
|
|
125
|
+
### `print(output.missing)`
|
|
126
|
+
` [] `
|
|
127
|
+
|
|
128
|
+
| Key | Description | Example |
|
|
129
|
+
|---------------|---------------------------------------------|---------------------|
|
|
130
|
+
| `missing` | List of entities expected but not found. | `["$100 USD"]` |
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
## Installation
|
|
134
|
+
|
|
135
|
+
**From PyPI:**
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
pip install rag-citation
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
**From Source:**
|
|
142
|
+
|
|
143
|
+
1. Clone the repository:
|
|
144
|
+
```bash
|
|
145
|
+
git clone https://github.com/rahulanand1103/rag-citation.git
|
|
146
|
+
cd rag-citation
|
|
147
|
+
```
|
|
148
|
+
2. Install the dependencies:
|
|
149
|
+
```bash
|
|
150
|
+
pip install -r requirements.txt
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## Configuration
|
|
154
|
+
|
|
155
|
+
The `Inference` class can be configured with different models and settings:
|
|
156
|
+
|
|
157
|
+
- **`spacy_model`:** The spaCy model used for named entity recognition (default: `"en_core_web_sm"`). To use different models, pass:
|
|
158
|
+
- `"sm"` for `en_core_web_sm`
|
|
159
|
+
- `"md"` for `en_core_web_md`
|
|
160
|
+
- `"lg"` for `en_core_web_lg`
|
|
161
|
+
You can download and install spaCy models [here](https://spacy.io/models).
|
|
162
|
+
|
|
163
|
+
- **`embedding_model`:** The sentence embedding model from the SentenceTransformers library used for semantic similarity (default: `"all-mpnet-base-v2"`). To use different models, pass:
|
|
164
|
+
- `"sm"` for `avsolatorio/GIST-small-Embedding-v0`
|
|
165
|
+
- `"md"` for `avsolatorio/GIST-Embedding-v0`
|
|
166
|
+
- `"lg"` for `avsolatorio/GIST-large-Embedding-v0`
|
|
167
|
+
Install SentenceTransformers with: `pip install -U sentence-transformers`
|
|
168
|
+
You can explore the models on [Hugging Face](https://huggingface.co/avsolatorio/GIST-Embedding-v0).
|
|
169
|
+
|
|
170
|
+
- **`threshold`:** The similarity threshold value for semantic matching (current default: `0.88`). You can adjust this value as needed.
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
## Contributing
|
|
174
|
+
|
|
175
|
+
We welcome contributions! Here’s how you can help:
|
|
176
|
+
|
|
177
|
+
- **Report Bugs:** Submit issues on GitHub.
|
|
178
|
+
- **Suggest Features:** Open an issue with your ideas.
|
|
179
|
+
- **Code Contributions:** Fork, make changes, and submit a pull request.
|
|
180
|
+
- **Documentation:** Update and enhance our docs.
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
## License
|
|
184
|
+
|
|
185
|
+
This project is licensed under the [MIT License](LICENSE).
|
|
186
|
+
|
|
187
|
+
## Acknowledgements
|
|
188
|
+
|
|
189
|
+
- [SpaCy](https://spacy.io/)
|
|
190
|
+
- [SentenceTransformers](https://www.sbert.net/)
|
|
191
|
+
- [Huggingface/avsolatorio](https://huggingface.co/avsolatorio)
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from sentence_transformers import SentenceTransformer
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class EmbedddingBaseModel:
    """
    Base class that loads a SentenceTransformer once and shares it.

    The first instantiation loads the checkpoint selected by
    ``embedding_model`` and stores it on the class. Every later
    instantiation reuses that stored instance; its own ``embedding_model``
    argument is ignored once a model has been loaded.

    Attributes:
        _model (SentenceTransformer): Class-level slot holding the shared
            model; ``None`` until the first instantiation.
        model (SentenceTransformer): Per-instance reference to the shared
            model.

    Args:
        embedding_model (str, optional): Size of the embedding model.
            Choose from "sm" (small), "md" (medium), or "lg" (large).
            Any other value prints a warning and falls back to "sm".
            Defaults to "sm".
    """

    _model = None

    # Size code -> Hugging Face checkpoint name, checked in this order.
    _CHECKPOINTS = (
        ("sm", "avsolatorio/GIST-small-Embedding-v0"),
        ("md", "avsolatorio/GIST-Embedding-v0"),
        ("lg", "avsolatorio/GIST-large-Embedding-v0"),
    )

    def __init__(self, embedding_model="sm"):
        if EmbedddingBaseModel._model is None:
            # Compare with ``==`` (rather than a dict lookup) so that any
            # value, hashable or not, simply falls through to the default.
            for size, checkpoint in EmbedddingBaseModel._CHECKPOINTS:
                if embedding_model == size:
                    break
            else:
                # The accepted values are the size codes, so name those in
                # the warning instead of the words "small/medium/large".
                print("Warning::please choose `sm`, `md`, or `lg`")
                print("Warning::choosing default model: small")
                checkpoint = "avsolatorio/GIST-small-Embedding-v0"

            EmbedddingBaseModel._model = SentenceTransformer(
                checkpoint, revision=None
            )

        self.model = EmbedddingBaseModel._model
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import spacy
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class SpacyBaseModel:
    """
    Base class for loading and sharing a SpaCy language model.

    The pipeline is loaded on the first instantiation only and kept on the
    class, so later instantiations reuse it instead of loading again.

    Attributes:
        _nlp (spacy.Language): Class-level slot holding the shared pipeline;
            ``None`` until the first instantiation.
        nlp (spacy.Language): Per-instance reference to the shared pipeline.
    """

    _nlp = None

    def __init__(self, spacy_model="sm"):
        """
        Load the shared SpaCy pipeline if it has not been loaded yet.

        Args:
            spacy_model (str, optional): Size of the SpaCy model to load.
                Choose from "sm" (small), "md" (medium), or "lg" (large).
                Any other value prints a warning and loads "sm".
                Defaults to "sm". Ignored once a pipeline is already loaded.
        """
        shared = SpacyBaseModel._nlp

        if shared is None:
            known_sizes = (
                ("sm", "en_core_web_sm"),
                ("md", "en_core_web_md"),
                ("lg", "en_core_web_lg"),
            )
            package = None
            for code, candidate in known_sizes:
                if spacy_model == code:
                    package = candidate
                    break

            if package is None:
                # Unrecognised size code: warn and fall back to the small model.
                print("Warning::Please provide correct input: sm, md, or lg")
                print("Warning::Running use sm(en_core_web_sm)")
                package = "en_core_web_sm"

            shared = spacy.load(package)
            SpacyBaseModel._nlp = shared

        self.nlp = shared
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class CiteItem:
    """
    Data structure to hold an answer, its corresponding context, and metadata.

    Args:
        answer (str): The answer extracted from the context.
        context (list): A list of dictionaries, each representing a source of information.
            Each dictionary must contain:
                - 'document': str, The source text.
            Each dictionary may optionally contain:
                - 'source_id': str, A unique identifier for the source.
                - 'meta': dict, Additional metadata associated with the source.

    Attributes:
        answer (str): The answer with leading list numbering stripped from each line.
        context (list): The context dictionaries with the same cleaning applied
            to every string value (nested dictionaries included).
        meta (dict): A dictionary mapping each item's original 'source_id'
            (``None`` when absent) to its original 'meta' value.

    Raises:
        ValueError: If answer is not a non-empty string, context is not a non-empty list,
            any context item is not a dictionary, or any item in the context
            list doesn't contain a 'document' key.

    Example:
        >>> context = [
        ...     {"document": "This is the first sentence.", "source_id": "doc1", "meta": {"doc_id": "doc_id"}},
        ...     {"document": "This is the second sentence.", "source_id": "doc2", "meta": {"doc_id": "doc_id"}},
        ... ]
        >>> cite_item = CiteItem(answer="some answer", context=context)
        >>> print(cite_item.meta)
    """

    # Leading list marker such as "1. " at the start of a line. The negative
    # lookahead keeps decimals intact: without it "3.5 billion" at the start
    # of a line would be rewritten to "5 billion".
    _LIST_MARKER = re.compile(r"^\d+\.(?!\d)\s*", flags=re.MULTILINE)

    _REQUIRED_KEYS = frozenset({"document"})

    def __init__(self, answer: str, context: list):
        # Validate answer input
        if not isinstance(answer, str) or not answer.strip():
            raise ValueError("Answer must be a non-empty string.")

        # Validate context input
        if not isinstance(context, list) or not context:
            raise ValueError("Context must be a non-empty list.")

        for item in context:
            if not isinstance(item, dict):
                raise ValueError("Each context item must be a dictionary.")
            missing_keys = self._REQUIRED_KEYS - set(item.keys())
            if missing_keys:
                raise ValueError(
                    f"Each context item must contain the following keys: {', '.join(missing_keys)}"
                )

        # Assign attributes
        self.answer = self._clean_text(answer)
        self.context = [self._clean_text(item) for item in context]
        # NOTE(review): meta is keyed by the *uncleaned* source_id, while the
        # source_id stored in self.context has been through _clean_text; the
        # two differ only if a source_id starts with a list marker - confirm
        # callers never rely on such ids.
        self.meta = {item.get("source_id"): item.get("meta") for item in context}

    def _clean_text(self, text):
        """
        Strip leading list numbering ("1. ", "2. ", ...) from every line.

        Strings are cleaned directly, dictionaries are rebuilt with each value
        cleaned recursively, and any other type is returned unchanged.
        """
        if isinstance(text, str):
            return self._LIST_MARKER.sub("", text)
        if isinstance(text, dict):
            return {k: self._clean_text(v) for k, v in text.items()}
        return text

    def __str__(self):
        # String representation for easy inspection
        attributes = ", ".join(f"{key}={getattr(self, key)}" for key in self.__dict__)
        return f"CiteItem({attributes})"
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from sentence_transformers import SentenceTransformer
|
|
2
|
+
from rag_citation.base_model import EmbedddingBaseModel
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class EmbeddingModel(EmbedddingBaseModel):
    """
    Sentence embedder backed by the model held by EmbedddingBaseModel.

    Construction is delegated entirely to EmbedddingBaseModel, which supplies
    ``self.model``; this class only adds the ``embedding`` convenience method.

    Args:
        embedding_model (str, optional): Size of the embedding model.
            Choose from "sm" (small), "md" (medium), or "lg" (large).
            Defaults to "sm".
    """

    def __init__(self, embedding_model="sm"):
        super().__init__(embedding_model)

    def embedding(self, sentence: str):
        """
        Encode a single sentence with the shared model.

        The sentence is wrapped in a one-element list before encoding, so the
        result is the encoder's output for a batch of one.

        Args:
            sentence (str): The sentence to embed.

        Returns:
            torch.Tensor: A tensor representing the sentence embedding.
        """
        batch = [sentence]
        return self.model.encode(batch, convert_to_tensor=True)
|