kolzchut-ragbot 1.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,67 @@
+ Metadata-Version: 2.2
+ Name: kolzchut-ragbot
+ Version: 1.0.1
+ Summary: A search engine using machine learning models and Elasticsearch for advanced document retrieval.
+ Home-page: https://github.com/shmuelrob/ragbot
+ Author: Shmuel Robinov
+ Author-email: shmuel_robinov@webiks.com
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ Requires-Dist: elasticsearch==8.14.0
+ Requires-Dist: sentence-transformers==3.0.1
+ Requires-Dist: torch==2.3.1
+ Requires-Dist: transformers==4.42.3
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # **Webiks-Hebrew-RAGbot**
+
+ ## **Overview**
+
+ This project is a search engine that uses machine learning models and Elasticsearch to provide advanced document retrieval.
+ You can use [Webiks-Hebrew-RAGbot-Demo](https://github.com/NNLP-IL/Webiks-Hebrew-RAGbot-Demo) to demonstrate the engine's document retrieval abilities.
+
+ ## **Features**
+
+ - Document representation and validation
+ - Document embedding and indexing in Elasticsearch
+ - Advanced search using machine learning models
+ - Integration with an LLM (Large Language Model) client for query answering
+
+ ## **Installation**
+
+ 1. Clone the repository:
+
+ `git clone https://github.com/NNLP-IL/Webiks-Hebrew-RAGbot.git`
+
+ `cd Webiks-Hebrew-RAGbot`
+
+ 2. Create a virtual environment and activate it:
+
+ `python -m venv venv`
+
+ `source venv/bin/activate`
+
+ On Windows, use `venv\Scripts\activate`.
+
+ 3. Install the required dependencies:
+
+ `pip install -r requirements.txt`
+
+ ## **Configuration**
+
+ Set the following environment variables:
+
+ `ES_EMBEDDING_INDEX`: The name of the Elasticsearch index for embeddings.
+
+ `TOKENIZER_LOCATION`: The location of the tokenizer model.
@@ -0,0 +1,41 @@
+ # **Webiks-Hebrew-RAGbot**
+
+ ## **Overview**
+
+ This project is a search engine that uses machine learning models and Elasticsearch to provide advanced document retrieval.
+ You can use [Webiks-Hebrew-RAGbot-Demo](https://github.com/NNLP-IL/Webiks-Hebrew-RAGbot-Demo) to demonstrate the engine's document retrieval abilities.
+
+ ## **Features**
+
+ - Document representation and validation
+ - Document embedding and indexing in Elasticsearch
+ - Advanced search using machine learning models
+ - Integration with an LLM (Large Language Model) client for query answering
+
+ ## **Installation**
+
+ 1. Clone the repository:
+
+ `git clone https://github.com/NNLP-IL/Webiks-Hebrew-RAGbot.git`
+
+ `cd Webiks-Hebrew-RAGbot`
+
+ 2. Create a virtual environment and activate it:
+
+ `python -m venv venv`
+
+ `source venv/bin/activate`
+
+ On Windows, use `venv\Scripts\activate`.
+
+ 3. Install the required dependencies:
+
+ `pip install -r requirements.txt`
+
+ ## **Configuration**
+
+ Set the following environment variables:
+
+ `ES_EMBEDDING_INDEX`: The name of the Elasticsearch index for embeddings.
+
+ `TOKENIZER_LOCATION`: The location of the tokenizer model.
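The README names the two variables but does not show them in use. As a hedged illustration only (the values are placeholders, and the exact lookup is internal to the package, which reads configuration through `os.getenv`), setting them from Python before the package initializes might look like:

```python
import os

# Placeholder values; substitute your own index name and tokenizer path.
os.environ["ES_EMBEDDING_INDEX"] = "ragbot-embeddings"
os.environ["TOKENIZER_LOCATION"] = "/models/me5-large/tokenizer"

# Set these before importing modules that connect to Elasticsearch or
# load the tokenizer, since they are read at initialization time.
```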
@@ -0,0 +1,67 @@
+ Metadata-Version: 2.2
+ Name: kolzchut-ragbot
+ Version: 1.0.1
+ Summary: A search engine using machine learning models and Elasticsearch for advanced document retrieval.
+ Home-page: https://github.com/shmuelrob/ragbot
+ Author: Shmuel Robinov
+ Author-email: shmuel_robinov@webiks.com
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ Requires-Dist: elasticsearch==8.14.0
+ Requires-Dist: sentence-transformers==3.0.1
+ Requires-Dist: torch==2.3.1
+ Requires-Dist: transformers==4.42.3
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # **Webiks-Hebrew-RAGbot**
+
+ ## **Overview**
+
+ This project is a search engine that uses machine learning models and Elasticsearch to provide advanced document retrieval.
+ You can use [Webiks-Hebrew-RAGbot-Demo](https://github.com/NNLP-IL/Webiks-Hebrew-RAGbot-Demo) to demonstrate the engine's document retrieval abilities.
+
+ ## **Features**
+
+ - Document representation and validation
+ - Document embedding and indexing in Elasticsearch
+ - Advanced search using machine learning models
+ - Integration with an LLM (Large Language Model) client for query answering
+
+ ## **Installation**
+
+ 1. Clone the repository:
+
+ `git clone https://github.com/NNLP-IL/Webiks-Hebrew-RAGbot.git`
+
+ `cd Webiks-Hebrew-RAGbot`
+
+ 2. Create a virtual environment and activate it:
+
+ `python -m venv venv`
+
+ `source venv/bin/activate`
+
+ On Windows, use `venv\Scripts\activate`.
+
+ 3. Install the required dependencies:
+
+ `pip install -r requirements.txt`
+
+ ## **Configuration**
+
+ Set the following environment variables:
+
+ `ES_EMBEDDING_INDEX`: The name of the Elasticsearch index for embeddings.
+
+ `TOKENIZER_LOCATION`: The location of the tokenizer model.
@@ -0,0 +1,12 @@
+ README.md
+ pyproject.toml
+ setup.py
+ kolzchut_ragbot.egg-info/PKG-INFO
+ kolzchut_ragbot.egg-info/SOURCES.txt
+ kolzchut_ragbot.egg-info/dependency_links.txt
+ kolzchut_ragbot.egg-info/requires.txt
+ kolzchut_ragbot.egg-info/top_level.txt
+ test/test_configs.py
+ test/test_document.py
+ test/test_engine.py
+ test/test_model.py
@@ -0,0 +1,4 @@
+ elasticsearch==8.14.0
+ sentence-transformers==3.0.1
+ torch==2.3.1
+ transformers==4.42.3
@@ -0,0 +1,33 @@
+ # pyproject.toml
+
+ [build-system]
+ requires = ["poetry-core>=1.0.0"]
+ build-backend = "poetry.core.masonry.api"
+
+ [tool.bumpver]
+ current_version = "1.0.53"
+ version_pattern = "MAJOR.MINOR.PATCH"
+ commit_message = "Bump version {old_version} -> {new_version}"
+ commit = true
+ tag = true
+ push = false
+
+ [tool.poetry]
+ name = "ragbot"
+ version = "1.0.53"
+ description = ""
+ authors = ["Your Name <your.email@example.com>"]
+
+ [tool.poetry.dependencies]
+ python = "^3.7"
+ elasticsearch = ">=8.10.1"
+ tomli = { version = "*", python = "<3.11" }
+ transformers = ">=4.41.0"
+ sentence_transformers = ">=2.2.2"
+ torch = ">=2.1.2"
+
+ [tool.bumpver.file_patterns]
+ "pyproject.toml" = [
+     'current_version = "{version}"',
+ ]
+
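One detail worth calling out in the dependency table: `tomli` is pinned only for Python < 3.11 because the standard library ships `tomllib` from 3.11 on. A minimal sketch of the conditional import that pin supports, reading back the version fields this file keeps in sync (file name and table paths as in the file above):

```python
import sys

# tomllib entered the standard library in Python 3.11; tomli is its backport,
# which is why pyproject.toml declares tomli only for python < 3.11.
if sys.version_info >= (3, 11):
    import tomllib
else:
    import tomli as tomllib

# Both parsers require the file to be opened in binary mode.
with open("pyproject.toml", "rb") as f:
    data = tomllib.load(f)

print(data["tool"]["poetry"]["version"])            # "1.0.53"
print(data["tool"]["bumpver"]["current_version"])   # rewritten by bumpver on each bump
```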
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -0,0 +1,25 @@
+ from setuptools import setup, find_packages
+
+ setup(
+     name='kolzchut-ragbot',
+     version='1.0.1',
+     author='Shmuel Robinov',
+     author_email='shmuel_robinov@webiks.com',
+     description='A search engine using machine learning models and Elasticsearch for advanced document retrieval.',
+     long_description=open('README.md', encoding='utf-8').read(),
+     long_description_content_type='text/markdown',
+     url='https://github.com/shmuelrob/ragbot',
+     packages=find_packages(),
+     install_requires=[
+         'elasticsearch==8.14.0',
+         'sentence-transformers==3.0.1',
+         'torch==2.3.1',
+         'transformers==4.42.3'
+     ],
+     classifiers=[
+         'Programming Language :: Python :: 3',
+         'License :: OSI Approved :: MIT License',
+         'Operating System :: OS Independent',
+     ],
+     python_requires='>=3.10',
+ )
@@ -0,0 +1,45 @@
+ # Invalid: identifier_field names a field that is not among saved_fields.
+ bad_conf_1 = '''
+ {
+     "identifier_field": "no_such_field",
+     "saved_fields": {
+         "title": "text",
+         "doc_id": "integer",
+         "content": "text"
+     },
+     "models": {
+         "me5_large-v10": "content"
+     }
+ }
+ '''
+
+ # Invalid: the "OMG" model maps to a field that is not among saved_fields.
+ bad_conf_2 = '''
+ {
+     "identifier_field": "doc_id",
+     "saved_fields": {
+         "title": "text",
+         "doc_id": "integer",
+         "content": "text"
+     },
+     "models": {
+         "me5_large-v10": "content",
+         "OMG": "no_such_field"
+     }
+ }
+ '''
+
+ # Valid: the identifier and every model field appear among saved_fields.
+ good_conf = '''
+ {
+     "identifier_field": "doc_id",
+     "saved_fields": {
+         "title": "text",
+         "doc_id": "integer",
+         "content": "text"
+     },
+     "models": {
+         "me5_large-v10": "content"
+     }
+ }
+ '''
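These fixtures are only meaningful against the validation they exercise. The error messages asserted in test_document.py (next file) imply a rule along these lines; the function name and structure here are hypothetical, sketched only to show why bad_conf_1 and bad_conf_2 fail while good_conf passes:

```python
import json

def validate_conf(raw: str) -> dict:
    """Hypothetical sketch of the config rule the fixtures above exercise."""
    conf = json.loads(raw)
    saved = conf["saved_fields"]
    # The identifier must itself be a saved field (bad_conf_1 violates this).
    if conf["identifier_field"] not in saved:
        raise ValueError("identifier_field must be one of the saved fields")
    # Every model must embed a saved field (bad_conf_2's "OMG" violates this).
    for model, field in conf["models"].items():
        if field not in saved:
            raise ValueError(f"model '{model}' field must be one of the saved fields")
    return conf
```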
@@ -0,0 +1,54 @@
+ import unittest
+ from unittest.mock import patch, mock_open
+ import json
+ import sys
+ import os
+
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'src')))
+
+ from ragbot.Document import initialize_definitions
+
+
+ class TestDocument(unittest.TestCase):
+
+     @classmethod
+     def setUpClass(cls):
+         pass
+
+     @patch('builtins.open', new_callable=mock_open, read_data=json.dumps({
+         "saved_fields": {"doc_id": "doc_id"},
+         "models": {"model1": "doc_id"},
+         "identifier_field": "doc_id"
+     }))
+     @patch('os.getenv', return_value='example-conf.json')
+     def test_initialize_definitions_good(self, mock_getenv, mock_file):
+         definitions = initialize_definitions()
+         self.assertEqual(definitions.saved_fields, {"doc_id": "doc_id"})
+         self.assertEqual(definitions.models, {"model1": "doc_id"})
+         self.assertEqual(definitions.identifier, "doc_id")
+
+     @patch('builtins.open', new_callable=mock_open, read_data=json.dumps({
+         "saved_fields": {"field1": {"field_name": "field1", "required": True}},
+         "models": {"model1": "field1"},
+         "identifier_field": "no_such_field"
+     }))
+     @patch('os.getenv', return_value='example-conf.json')
+     def test_initialize_definitions_bad_identifier(self, mock_getenv, mock_file):
+         with self.assertRaises(ValueError) as context:
+             initialize_definitions()
+         self.assertIn("identifier_field must be one of the saved fields", str(context.exception))
+
+     @patch('builtins.open', new_callable=mock_open, read_data=json.dumps({
+         "saved_fields": {"field1": {"field_name": "field1", "required": True}},
+         "models": "no_such_field",
+         "identifier_field": "doc_id"
+     }))
+     @patch('os.getenv', return_value='example-conf.json')
+     def test_initialize_definitions_bad_model(self, mock_getenv, mock_file):
+         with self.assertRaises(ValueError) as context:
+             initialize_definitions()
+         self.assertIn("must be one of the saved fields", str(context.exception))
+
+
+ if __name__ == '__main__':
+     unittest.main()
@@ -0,0 +1,174 @@
+ import unittest
+ import importlib
+ from unittest.mock import patch, MagicMock
+ import sys
+ import os
+
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'src')))
+ import ragbot.engine
+ importlib.reload(ragbot.engine)
+
+
+ def build_test_engine(es_model, llm_client):
+     reranker_model = MagicMock()
+     reranker_tokenizer = MagicMock()
+     models = MagicMock()
+     return ragbot.engine.Engine(llms_client=llm_client, es_client=es_model, models=models,
+                                 reranker_model=reranker_model, reranker_tokenizer=reranker_tokenizer)
+
+
+ class TestEngine(unittest.TestCase):
+
+     @patch('ragbot.llm_client.LLMClient')
+     @patch('ragbot.model.Model')
+     @patch('elasticsearch.Elasticsearch')
+     def test_update_title_summary(self, Elasticsearch, Model, LLMClient):
+         list_of_docs = [
+             {"doc_id": 1, 'title': 'title1', 'summary': 'summary1', 'content': 'content1'},
+             {"doc_id": 2, 'title': 'title2', 'summary': 'summary2', 'content': 'content2'}
+         ]
+
+         es_client = Elasticsearch()
+         es_model = Model(es_client)
+         llm_client = LLMClient()
+         engine = build_test_engine(es_model, llm_client)
+         engine.update_docs(list_of_docs=list_of_docs, embed_only_fields=['title', 'summary'], delete_existing=False)
+         es_model.create_or_update_documents.assert_called_once_with(list_of_docs, False)
+
+     @patch('ragbot.llm_client.LLMClient')
+     @patch('ragbot.model.Model')
+     @patch('elasticsearch.Elasticsearch')
+     def test_update_content_without_delete(self, Elasticsearch, Model, LLMClient):
+         list_of_docs = [
+             {"doc_id": 1, 'title': 'title1', 'summary': 'summary1', 'content': 'content1'},
+             {"doc_id": 2, 'title': 'title2', 'summary': 'summary2', 'content': 'content2'}
+         ]
+         es_client = Elasticsearch()
+         es_model = Model(es_client)
+         llm_client = LLMClient()
+         engine = build_test_engine(es_model, llm_client)
+         engine.update_docs(list_of_docs, embed_only_fields=['content'], delete_existing=False)
+         es_model.create_or_update_documents.assert_called_once_with(list_of_docs, False)
+
+     @patch('ragbot.llm_client.LLMClient')
+     @patch('ragbot.model.Model')
+     @patch('elasticsearch.Elasticsearch')
+     def test_update_content_with_delete(self, Elasticsearch, Model, LLMClient):
+         list_of_docs = [
+             {"doc_id": 1, 'title': 'title1', 'summary': 'summary1', 'content': 'content1'},
+             {"doc_id": 2, 'title': 'title2', 'summary': 'summary2', 'content': 'content2'}
+         ]
+         reranker_model = MagicMock()
+         reranker_tokenizer = MagicMock()
+         llm_client = LLMClient()
+         models = MagicMock()
+         es_client = Elasticsearch()
+         es_model = Model(es_client)
+         engine = ragbot.engine.Engine(llms_client=llm_client, es_client=es_model, models=models,
+                                       reranker_model=reranker_model, reranker_tokenizer=reranker_tokenizer)
+         engine.update_docs(list_of_docs, embed_only_fields=['content'], delete_existing=True)
+         es_model.create_or_update_documents.assert_called_once_with(list_of_docs, True)
+
+     @patch('ragbot.llm_client.LLMClient')
+     @patch('ragbot.model.Model')
+     @patch('elasticsearch.Elasticsearch')
+     def test_reciprocal_rank_fusion(self, Elasticsearch, Model, LLMClient):
+         es_client = Elasticsearch()
+         es_model = Model(es_client)
+         llm_client = LLMClient()
+         engine = build_test_engine(es_model, llm_client)
+
+         ranking_lists = [
+             [1, 2, 3],
+             [2, 3, 4],
+             [3, 4, 5]
+         ]
+         expected_fused_list = [3, 2, 4, 1, 5]
+         fused_list = engine.reciprocal_rank_fusion(ranking_lists)
+         self.assertEqual(fused_list, expected_fused_list)
+
+     @patch.object(ragbot.engine.Engine, 'reciprocal_rank_fusion')
+     @patch('ragbot.llm_client.LLMClient')
+     @patch('ragbot.model.Model')
+     @patch('elasticsearch.Elasticsearch')
+     def test_search_documents(self, Elasticsearch, Model, LLMClient, mock_reciprocal_rank_fusion):
+         llm_client = LLMClient()
+         models = MagicMock()
+         es_client = Elasticsearch()
+         es_model = Model(es_client)
+         es_model.search.return_value = {
+             "title": [
+                 {'_source': {'page_id': 1, 'title': 'title1'}},
+                 {'_source': {'page_id': 2, 'title': 'title2'}},
+                 {'_source': {'page_id': 3, 'title': 'title3'}},
+             ],
+             "summary": [
+                 {'_source': {'page_id': 2, 'title': 'title2'}},
+                 {'_source': {'page_id': 3, 'title': 'title3'}},
+                 {'_source': {'page_id': 4, 'title': 'title4'}},
+             ],
+             "content": [
+                 {'_source': {'page_id': 3, 'title': 'title3'}},
+                 {'_source': {'page_id': 4, 'title': 'title4'}},
+                 {'_source': {'page_id': 5, 'title': 'title5'}}
+             ]
+         }
+         mock_reciprocal_rank_fusion.return_value = [3, 2, 4, 1, 5]
+         engine = build_test_engine(es_model, llm_client)
+
+         result = engine.search_documents("test query", 5)
+
+         es_model.search.assert_called_once()
+         mock_reciprocal_rank_fusion.assert_called_once()
+         self.assertEqual([
+             {'page_id': 3, 'title': 'title3'},
+             {'page_id': 2, 'title': 'title2'},
+             {'page_id': 4, 'title': 'title4'},
+             {'page_id': 1, 'title': 'title1'},
+             {'page_id': 5, 'title': 'title5'}
+         ], result)
+
+     @patch('ragbot.llm_client.LLMClient')
+     @patch('ragbot.model.Model')
+     @patch('elasticsearch.Elasticsearch')
+     def test_answer_query(self, Elasticsearch, Model, LLMClient):
+         es_client = Elasticsearch()
+         es_model = Model(es_client)
+         llm_client = LLMClient()
+         engine = build_test_engine(es_model, llm_client)
+
+         with patch.object(ragbot.engine.Engine, 'search_documents') as mock_search_documents:
+             mock_search_documents.return_value = [
+                 {'page_id': 3, 'title': 'title3'},
+                 {'page_id': 2, 'title': 'title2'},
+                 {'page_id': 4, 'title': 'title4'},
+                 {'page_id': 1, 'title': 'title1'},
+                 {'page_id': 5, 'title': 'title5'}
+             ]
+
+             llm_client.answer.return_value = ('answer', 0.5, 100)
+             actual_top_k_documents, actual_gpt_answer, actual_stats = engine.answer_query("test query", 5, 'gpt-4o')
+
+             expected_top_k_documents = [
+                 {'page_id': 3, 'title': 'title3'},
+                 {'page_id': 2, 'title': 'title2'},
+                 {'page_id': 4, 'title': 'title4'},
+                 {'page_id': 1, 'title': 'title1'},
+                 {'page_id': 5, 'title': 'title5'}
+             ]
+             expected_gpt_answer = llm_client.answer.return_value[0]
+             expected_stats = {
+                 "retrieval_time": 0,
+                 "gpt_model": 'gpt-4o',
+                 "gpt_time": llm_client.answer.return_value[1],
+                 "tokens": llm_client.answer.return_value[2]
+             }
+
+             self.assertEqual(expected_top_k_documents, actual_top_k_documents)
+             self.assertEqual(expected_gpt_answer, actual_gpt_answer)
+             self.assertEqual(expected_stats, actual_stats)
+
+
+ if __name__ == '__main__':
+     unittest.main()
+
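test_reciprocal_rank_fusion above pins the fused order [3, 2, 4, 1, 5] without showing the formula. Standard reciprocal rank fusion scores each document as the sum of 1/(k + rank) over the ranking lists; with the conventional k = 60 (an assumption — the package's own constant is not visible in this diff) the expected order falls out exactly:

```python
from collections import defaultdict

def reciprocal_rank_fusion(ranking_lists, k=60):
    # Each list contributes 1 / (k + rank) per document, rank counted from 1.
    scores = defaultdict(float)
    for ranking in ranking_lists:
        for rank, doc in enumerate(ranking, start=1):
            scores[doc] += 1.0 / (k + rank)
    # Highest combined score first.
    return sorted(scores, key=scores.get, reverse=True)

print(reciprocal_rank_fusion([[1, 2, 3], [2, 3, 4], [3, 4, 5]]))
# [3, 2, 4, 1, 5] -- matches expected_fused_list in the test above
```

The constant matters: with k = 0 the same inputs fuse to [3, 2, 1, 4, 5], so the test's expectation implicitly constrains k to a large value such as the customary 60.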
@@ -0,0 +1,155 @@
+ import unittest
+ from unittest.mock import patch
+ import sys
+ import os
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'src')))
+ from ragbot.model import Model, index_from_page_id
+
+ # Example of the script_score query shape used for vector search (kept for reference; not used by the tests below).
+ search_json = {
+     "script_score": {
+         "query": {
+             "exists": {
+                 "field": 'content-me5_large-v10'
+             }
+         },
+         "script": {
+             "source": "cosineSimilarity(params.query_vector, 'content-me5_large-v10') + 1.0",
+             "params": {
+                 "query_vector": [0.0, 0.11]
+             }
+         }
+     }
+ }
+
+
+ class TestModel(unittest.TestCase):
+
+     @patch('elasticsearch.Elasticsearch')
+     def test_create_index(self, EsMock):
+         es_mock = EsMock()
+         model = Model(es_mock)
+         expected_index_mapping = {
+             'properties': {
+                 'last_update': {'type': 'date'},
+                 'me5_large-v10_content_vectors': {'type': 'dense_vector', 'dims': 1024},
+                 'title': {'type': 'text'}, 'doc_id': {'type': 'integer'}, 'content': {'type': 'text'}}
+         }
+         es_mock.indices.exists.return_value = False
+         model.create_index()
+         self.assertTrue(es_mock.indices.create.called)
+         _, kwargs = es_mock.indices.create.call_args
+         self.assertEqual(expected_index_mapping, kwargs.get("mappings"))
+
+     @patch('elasticsearch.Elasticsearch')
+     def test_create_index_false(self, EsMock):
+         es_mock = EsMock()
+         model = Model(es_mock)
+         es_mock.indices.exists.return_value = True
+         model.create_index()
+         es_mock.indices.create.assert_not_called()
+
+     @patch('elasticsearch.Elasticsearch')
+     def test_create_or_update_document_no_delete(self, EsMock):
+         es_mock = EsMock()
+         model = Model(es_mock)
+         es_mock.search.return_value = {"hits": {"hits": []}}
+         new_doc = {"doc_id": 1, "title": "title", "content": "content"}
+         model.create_or_update_documents([new_doc], True)
+
+         es_mock.search.assert_called_with(
+             index=index_from_page_id(1),
+             body={
+                 "query": {
+                     "term": {
+                         "doc_id": {"value": 1}
+                     }
+                 }
+             })
+         self.assertEqual(0, es_mock.delete.call_count)
+         es_mock.index.assert_called_once()
+
+     @patch('elasticsearch.Elasticsearch')
+     def test_create_or_update_document_but_delete(self, EsMock):
+         es_mock = EsMock()
+         model = Model(es_mock)
+         es_mock.search.return_value = {"hits": {"hits": [{"_id": "1", "doc_id": 1, "title": "title", "content": "content"}]}}
+         new_doc = {"doc_id": 1, "title": "edited", "content": "edited"}
+         model.create_or_update_documents([new_doc], True)
+
+         es_mock.search.assert_called_with(
+             index=index_from_page_id(1),
+             body={
+                 "query": {
+                     "term": {
+                         "doc_id": {"value": 1}
+                     }
+                 }
+             })
+         self.assertEqual(1, es_mock.delete.call_count)
+         es_mock.index.assert_called_once()
+
+     @patch('elasticsearch.Elasticsearch')
+     def test_create_or_update_document_delete_false(self, EsMock):
+         es_mock = EsMock()
+         model = Model(es_mock)
+         es_mock.search.return_value = {
+             "hits": {"hits": [{"_id": "1", "doc_id": 1, "title": "title", "content": "content"}]}}
+         new_doc = {"doc_id": 1, "title": "edited", "content": "edited"}
+         model.create_or_update_documents([new_doc], False)
+
+         self.assertEqual(0, es_mock.search.call_count)
+         self.assertEqual(0, es_mock.delete.call_count)
+         es_mock.index.assert_called_once()
+
+     @patch('elasticsearch.Elasticsearch')
+     def test_search(self, EsMock):
+         es_mock = EsMock()
+         model = Model(es_mock)
+         es_mock.search.return_value = {
+             "hits": {
+                 "hits": [
+                     {"_id": "1", "_source": {"field": "value1"}},
+                     {"_id": "2", "_source": {"field": "value2"}}
+                 ]
+             }
+         }
+
+         embedded_search = {
+             "me5_large-v10": [0.1, 0.2, 0.3],
+             "model2": [0.4, 0.5, 0.6]
+         }
+
+         results = model.search(embedded_search, size=2)
+
+         # The query shape Model.search is expected to build (shown for reference; not asserted against the mock here).
+         expected_query = {
+             "size": 2,
+             "query": {
+                 "script_score": {
+                     "query": {
+                         "exists": {
+                             "field": "doc_id_me5_large-v10_vectors"
+                         }
+                     },
+                     "script": {
+                         "source": "cosineSimilarity(params.query_vector, 'doc_id_me5_large-v10_vectors') + 1.0",
+                         "params": {
+                             "query_vector": embedded_search["me5_large-v10"]
+                         }
+                     }
+                 }
+             }
+         }
+
+         expected_results = {
+             "content": [
+                 {"_id": "1", "_source": {"field": "value1"}},
+                 {"_id": "2", "_source": {"field": "value2"}}
+             ]
+         }
+         self.assertEqual(results["content"], expected_results["content"])
+
+
+ if __name__ == '__main__':
+     unittest.main()
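For orientation, the expected_query in test_search is a stock Elasticsearch script_score search over a dense_vector field. A hedged sketch of running such a query directly with the elasticsearch client (the endpoint, index name, and vector are placeholders; the field name is taken from the test):

```python
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # placeholder endpoint

query_vector = [0.1, 0.2, 0.3]  # an embedding from the sentence-transformers model
body = {
    "size": 2,
    "query": {
        "script_score": {
            # Score only documents that actually carry the vector field.
            "query": {"exists": {"field": "doc_id_me5_large-v10_vectors"}},
            "script": {
                # cosineSimilarity returns [-1, 1]; adding 1.0 keeps scores
                # non-negative, which script_score requires.
                "source": "cosineSimilarity(params.query_vector, 'doc_id_me5_large-v10_vectors') + 1.0",
                "params": {"query_vector": query_vector},
            },
        }
    },
}

response = es.search(index="ragbot-embeddings", body=body)  # placeholder index name
for hit in response["hits"]["hits"]:
    print(hit["_id"], hit["_source"])
```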