kolzchut-ragbot 1.4.2__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kolzchut_ragbot/model.py CHANGED
@@ -1,182 +1,182 @@
1
- import datetime
2
- import logging
3
- import os
4
- from .Document import factory as definitions_factory
5
-
6
- definitions_singleton = definitions_factory()
7
- EMBEDDING_INDEX = os.getenv("ES_EMBEDDING_INDEX", "embeddings")
8
- semantic_models = definitions_singleton.models.keys()
9
-
10
-
11
- def index_from_page_id(page_id: int):
12
- """
13
- Generates an index name based on the page ID.
14
-
15
- Args:
16
- page_id (int): The ID of the page.
17
-
18
- Returns:
19
- str: The generated index name.
20
- """
21
- index_postfix = round(page_id / 1000)
22
- return EMBEDDING_INDEX + "_" + str(index_postfix)
23
-
24
-
25
- def create_mapping():
26
- """
27
- Creates a mapping for the model in Elasticsearch.
28
- """
29
- vector_fields = {f'{semantic_model}_{name}_vectors': {"type": "dense_vector", "dims": 1024}
30
- for name, semantic_model in definitions_singleton.models.items()}
31
-
32
- data_fields = {}
33
- for field in definitions_singleton.saved_fields.keys():
34
- field_type = definitions_singleton.saved_fields[field]
35
- field_mapping = {"type": field_type}
36
- if field_type == "date":
37
- field_mapping["format"] = "yyyyMMddHHmmss"
38
- data_fields[f"{field}"] = field_mapping
39
-
40
- mappings = {
41
- "properties": {
42
- "last_update": {
43
- "type": "date",
44
- },
45
- **vector_fields,
46
- **data_fields,
47
- }
48
- }
49
- return mappings
50
-
51
-
52
- class Model:
53
- """
54
- Represents the model for creating, updating, and searching documents in Elasticsearch.
55
-
56
- Attributes:
57
- custom_result_selection_function (callable): A custom function for selecting search results.
58
- es_client: The Elasticsearch client instance.
59
-
60
- Methods:
61
- create_index():
62
- Creates an index for the model in Elasticsearch.
63
-
64
- create_or_update_documents(paragraphs_dicts: list[dict], update=False):
65
- Creates or updates documents in the Elasticsearch index.
66
-
67
- search(embedded_search: dict[str, list[float]], size=50) -> dict[str, list[dict]]:
68
- Searches for similar documents using cosine similarity.
69
- """
70
-
71
- custom_result_selection_function = None
72
-
73
- def __init__(self, es_client, custom_result_selection_function=None):
74
- """
75
- Initializes the Model instance.
76
-
77
- Args:
78
- es_client: The Elasticsearch client instance.
79
- custom_result_selection_function (callable, optional): A custom function for selecting search results.
80
- """
81
- self.es_client = es_client
82
- if custom_result_selection_function is not None:
83
- self.custom_result_selection_function = custom_result_selection_function
84
-
85
- def create_index(self, index_name):
86
- """
87
- Creates an index for the model in Elasticsearch.
88
- """
89
- mapping = create_mapping()
90
- if not self.es_client.indices.exists(index=index_name):
91
- self.es_client.indices.create(
92
- index=index_name,
93
- mappings=mapping
94
- )
95
-
96
- def create_or_update_documents(self, paragraphs_dicts: list[dict], update=False):
97
- """
98
- Creates or updates documents in the Elasticsearch index.
99
-
100
- Args:
101
- paragraphs_dicts (list[dict]): A list of dictionaries representing the paragraphs to be indexed.
102
- update (bool, optional): Whether to update existing documents. Default is False.
103
- """
104
-
105
- identifier = definitions_singleton.identifier
106
- print(f"Creating or updating documents in the index, {len(paragraphs_dicts)} paragraphs\n")
107
- # Identify the doc from the first paragraph - all paragraphs should have the same doc_id
108
- doc_id = paragraphs_dicts[0][identifier]
109
- index = index_from_page_id(int(doc_id))
110
-
111
- if update:
112
- try:
113
- query = {
114
- "query": {
115
- "match": {
116
- f"{identifier}": doc_id
117
- }
118
- }
119
- }
120
- self.es_client.delete_by_query(index=f"{EMBEDDING_INDEX}*", body=query)
121
-
122
- except Exception as e:
123
- logging.error(f"Error while searching for existing document: {e}")
124
- self.create_index(index)
125
- for i, doc_dict in enumerate(paragraphs_dicts):
126
- print(f"saving paragraph {i + 1} / {len(paragraphs_dicts)}")
127
- doc = {
128
- "last_update": datetime.datetime.now(),
129
- **doc_dict
130
- }
131
-
132
- self.es_client.index(index=index, body=doc)
133
-
134
- def search(self, embedded_search: dict[str, list[float]], size=50) -> dict[str, list[dict]]:
135
- """
136
- Searches for similar documents using cosine similarity.
137
-
138
- Args:
139
- embedded_search (dict[str, list[float]]): A dictionary containing the embedded search vectors.
140
- size (int, optional): The number of search results to return. Default is 50.
141
-
142
- Returns:
143
- dict[str, list[dict]]: A dictionary containing the search results.
144
- """
145
- results = {}
146
- for semantic_model, field in definitions_singleton.models.items():
147
- results[field] = [] if field not in results.keys() else results[field]
148
- body = {
149
- "script_score": {
150
- "query": {
151
- "exists": {
152
- "field": f'{field}_{semantic_model}_vectors'
153
- }
154
- },
155
- "script": {
156
- "source": f"cosineSimilarity(params.query_vector, '{field}_{semantic_model}_vectors') + 1.0",
157
- "params": {
158
- "query_vector": embedded_search[semantic_model]
159
- }
160
- }
161
- }
162
- }
163
- print(f"Searching for {field} using {semantic_model} on index {EMBEDDING_INDEX}\n")
164
- field_results = self.es_client.search(
165
- index=EMBEDDING_INDEX + "*",
166
- body={
167
- "size": size,
168
- "query": body
169
- })
170
- results[field] = results[field] + field_results["hits"]["hits"]
171
-
172
- return results
173
-
174
-
175
- model = None
176
-
177
-
178
- def es_client_factory(es_client) -> Model:
179
- global model
180
- if model is None:
181
- model = Model(es_client)
182
- return model
1
+ import datetime
2
+ import logging
3
+ import os
4
+ from .Document import factory as definitions_factory
5
+
6
# Singleton holding the package's document definitions (saved fields,
# identifier field, and configured semantic models).
definitions_singleton = definitions_factory()
# Base name for the Elasticsearch embedding indices; concrete indices are
# this base plus a numeric bucket suffix (see index_from_page_id).
EMBEDDING_INDEX = os.getenv("ES_EMBEDDING_INDEX", "embeddings")
# NOTE(review): these keys are unpacked as `name` in create_mapping() but as
# `semantic_model` in Model.search() — same values, inconsistent naming.
semantic_models = definitions_singleton.models.keys()
9
+
10
+
11
def index_from_page_id(page_id: int):
    """
    Build the embedding-index name for a given page ID.

    Pages are bucketed in groups of roughly 1000 and each bucket maps to
    its own index named ``<EMBEDDING_INDEX>_<bucket>``.

    Args:
        page_id (int): The ID of the page.

    Returns:
        str: The index name for the bucket containing this page.
    """
    # NOTE(review): round() (banker's rounding), not floor division — e.g.
    # page 1500 lands in bucket 2. Kept as-is: existing indices depend on it.
    bucket = round(page_id / 1000)
    return f"{EMBEDDING_INDEX}_{bucket}"
23
+
24
+
25
def create_mapping():
    """
    Build the Elasticsearch mapping used for every embedding index.

    Returns:
        dict: A ``properties`` mapping containing a ``last_update`` date
        field, one 1024-dim ``dense_vector`` field per configured semantic
        model, and one field per saved field from the document definitions.
    """
    # NOTE(review): the loop variables here are the reverse of Model.search
    # (key=name vs key=semantic_model), but both build '<value>_<key>_vectors',
    # so the resulting field names agree.
    vector_fields = {}
    for name, semantic_model in definitions_singleton.models.items():
        vector_fields[f'{semantic_model}_{name}_vectors'] = {
            "type": "dense_vector",
            "dims": 1024,
        }

    data_fields = {}
    for field, field_type in definitions_singleton.saved_fields.items():
        field_mapping = {"type": field_type}
        if field_type == "date":
            # Saved date fields are stored as compact timestamps.
            field_mapping["format"] = "yyyyMMddHHmmss"
        data_fields[field] = field_mapping

    return {
        "properties": {
            "last_update": {
                "type": "date",
            },
            **vector_fields,
            **data_fields,
        }
    }
50
+
51
+
52
class Model:
    """
    Represents the model for creating, updating, and searching documents in Elasticsearch.

    Attributes:
        custom_result_selection_function (callable): A custom function for selecting search results.
        es_client: The Elasticsearch client instance.

    Methods:
        create_index(index_name):
            Creates an index for the model in Elasticsearch.

        create_or_update_documents(paragraphs_dicts: list[dict], update=False):
            Creates or updates documents in the Elasticsearch index.

        search(embedded_search: dict[str, list[float]], size=50) -> dict[str, list[dict]]:
            Searches for similar documents using cosine similarity.
    """

    custom_result_selection_function = None

    def __init__(self, es_client, custom_result_selection_function=None):
        """
        Initializes the Model instance.

        Args:
            es_client: The Elasticsearch client instance.
            custom_result_selection_function (callable, optional): A custom function for selecting search results.
        """
        self.es_client = es_client
        if custom_result_selection_function is not None:
            self.custom_result_selection_function = custom_result_selection_function

    def create_index(self, index_name):
        """
        Creates the index in Elasticsearch with the model mapping, if it
        does not already exist.

        Args:
            index_name (str): Name of the index to create.
        """
        mapping = create_mapping()
        if not self.es_client.indices.exists(index=index_name):
            self.es_client.indices.create(
                index=index_name,
                mappings=mapping
            )

    def create_or_update_documents(self, paragraphs_dicts: list[dict], update=False):
        """
        Creates or updates documents in the Elasticsearch index.

        All paragraphs are assumed to belong to the same document; the target
        index is derived from the first paragraph's identifier field.

        Args:
            paragraphs_dicts (list[dict]): A list of dictionaries representing the paragraphs to be indexed.
            update (bool, optional): Whether to delete existing documents for
                this doc_id across all embedding indices first. Default is False.
        """
        # Guard: the original code raised IndexError on an empty list
        # (paragraphs_dicts[0] below); treat it as a no-op instead.
        if not paragraphs_dicts:
            logging.warning("create_or_update_documents called with no paragraphs")
            return

        identifier = definitions_singleton.identifier
        logging.info("Creating or updating documents in the index, %d paragraphs",
                     len(paragraphs_dicts))
        # Identify the doc from the first paragraph - all paragraphs should have the same doc_id
        doc_id = paragraphs_dicts[0][identifier]
        index = index_from_page_id(int(doc_id))

        if update:
            try:
                query = {
                    "query": {
                        "match": {
                            f"{identifier}": doc_id
                        }
                    }
                }
                # Stale paragraphs may live in any bucketed index, so delete
                # across the whole index pattern before re-indexing.
                self.es_client.delete_by_query(index=f"{EMBEDDING_INDEX}*", body=query)

            except Exception as e:
                # Best-effort cleanup: a failed delete must not block re-indexing.
                logging.error(f"Error while searching for existing document: {e}")
        self.create_index(index)
        for i, doc_dict in enumerate(paragraphs_dicts):
            logging.info("saving paragraph %d / %d", i + 1, len(paragraphs_dicts))
            doc = {
                # NOTE(review): naive local timestamp; the mapping declares a
                # plain "date" field — confirm timezone expectations.
                "last_update": datetime.datetime.now(),
                **doc_dict
            }

            self.es_client.index(index=index, body=doc)

    def search(self, embedded_search: dict[str, list[float]], size=50) -> dict[str, list[dict]]:
        """
        Searches for similar documents using cosine similarity.

        Args:
            embedded_search (dict[str, list[float]]): A dictionary containing the embedded search vectors,
                keyed by semantic model name.
            size (int, optional): The number of search results to return per field. Default is 50.

        Returns:
            dict[str, list[dict]]: Raw Elasticsearch hits, keyed by field name.
        """
        results = {}
        for semantic_model, field in definitions_singleton.models.items():
            # Multiple models may share a field; accumulate hits per field.
            results.setdefault(field, [])
            body = {
                "script_score": {
                    # Only score documents that actually carry this vector field.
                    "query": {
                        "exists": {
                            "field": f'{field}_{semantic_model}_vectors'
                        }
                    },
                    "script": {
                        # +1.0 shifts cosine similarity into [0, 2] so scores stay non-negative.
                        "source": f"cosineSimilarity(params.query_vector, '{field}_{semantic_model}_vectors') + 1.0",
                        "params": {
                            "query_vector": embedded_search[semantic_model]
                        }
                    }
                }
            }
            logging.info("Searching for %s using %s on index %s",
                         field, semantic_model, EMBEDDING_INDEX)
            field_results = self.es_client.search(
                index=EMBEDDING_INDEX + "*",
                body={
                    "size": size,
                    "query": body
                })
            results[field] = results[field] + field_results["hits"]["hits"]

        return results
173
+
174
+
175
# Process-wide singleton instance managed by es_client_factory().
model = None


def es_client_factory(es_client) -> Model:
    """
    Lazily create and cache a single shared Model instance.

    Args:
        es_client: The Elasticsearch client used to construct the Model
            on first call; ignored on subsequent calls.

    Returns:
        Model: The cached singleton instance.
    """
    global model
    if model is not None:
        return model
    model = Model(es_client)
    return model
@@ -0,0 +1,76 @@
1
+ Metadata-Version: 2.4
2
+ Name: kolzchut-ragbot
3
+ Version: 1.6.0
4
+ Summary: A search engine using machine learning models and Elasticsearch for advanced document retrieval.
5
+ Home-page: https://github.com/shmuelrob/rag-bot
6
+ Author: Shmuel Robinov
7
+ Author-email: shmuel_robinov@webiks.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: elasticsearch==8.17.1
14
+ Requires-Dist: sentence-transformers==3.4.1
15
+ Requires-Dist: torch==2.6.0
16
+ Requires-Dist: transformers==4.48.3
17
+ Dynamic: author
18
+ Dynamic: author-email
19
+ Dynamic: classifier
20
+ Dynamic: description
21
+ Dynamic: description-content-type
22
+ Dynamic: home-page
23
+ Dynamic: requires-dist
24
+ Dynamic: requires-python
25
+ Dynamic: summary
26
+
27
+ # kolzchut-ragbot
28
+
29
+ ## Overview
30
+
31
+ This project is a search engine that uses machine learning models and Elasticsearch to provide advanced document retrieval.
32
+ You can use [kolzchut-ragbot](https://github.com/shmuelrob/rag-bot) to demonstrate the engine's document retrieval abilities.
33
+
34
+ ## Features
35
+
36
+ - Document representation and validation
37
+ - Document embedding and indexing in Elasticsearch
38
+ - Advanced search using machine learning model
39
+ - Integration with LLM (Large Language Model) client for query answering
40
+
41
+ ## Installation
42
+
43
+ ### From PyPI
44
+
45
+ ```bash
46
+ pip install kolzchut-ragbot
47
+ ```
48
+
49
+ ### From Source
50
+
51
+ 1. Clone the repository:
52
+
53
+ ```bash
54
+ git clone https://github.com/shmuelrob/rag-bot.git
55
+ cd rag-bot
56
+ ```
57
+
58
+ 2. Create a virtual environment and activate it:
59
+
60
+ ```bash
61
+ python -m venv venv
62
+ source venv/bin/activate # On Windows use: venv\Scripts\activate
63
+ ```
64
+
65
+ 3. Install the required dependencies:
66
+
67
+ ```bash
68
+ pip install -r requirements.txt
69
+ ```
70
+
71
+ ## Configuration
72
+
73
+ Set the following environment variables:
74
+
75
+ - `ES_EMBEDDING_INDEX`: The name of the Elasticsearch index for embeddings.
76
+ - `TOKENIZER_LOCATION`: The location of the tokenizer model.
@@ -0,0 +1,12 @@
1
+ kolzchut_ragbot/Document.py,sha256=5OyBBTZyAJFM_1Pjs3SUC-_s5zEJ5U6wjhw12_FFkdE,3621
2
+ kolzchut_ragbot/IntegrateService.py,sha256=rcwUY2RkclCY3l8BGAmNbstdxhxwhLO9oA8BofqLyts,96
3
+ kolzchut_ragbot/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ kolzchut_ragbot/config.py,sha256=uILFvgn9W92-NRaKXYtaoQXpn3KOWKK8SZYRsIAa5Yw,133
5
+ kolzchut_ragbot/engine.py,sha256=PiobcVxrf4cMRaDVZvAadwyOLPx2wAUnpxRqftTNFYU,13848
6
+ kolzchut_ragbot/get_full_documents_utilities.py,sha256=YWljmGWM6h1ghLDCAUnDdhmn-0k6R_t7b1g7wSojzvg,1882
7
+ kolzchut_ragbot/llm_client.py,sha256=Frp7CL0OIlQA6ltohrGWedI6uD6MpGg6TbpZTBE0qIo,341
8
+ kolzchut_ragbot/model.py,sha256=HCi3r4YztPknnbgTOA7I-GVaqxn8CzrTeLFkEg-7fg0,6320
9
+ kolzchut_ragbot-1.6.0.dist-info/METADATA,sha256=pr5ZSZOXk44Yj7TdUPgdlFTLJJG6EjvCUQ73k1vjCRM,1999
10
+ kolzchut_ragbot-1.6.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
11
+ kolzchut_ragbot-1.6.0.dist-info/top_level.txt,sha256=NTZoY4GGw3v_7jm0MgcdHw8simoZ78PsR7Meqmkgd_Q,16
12
+ kolzchut_ragbot-1.6.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (78.1.0)
2
+ Generator: setuptools (80.9.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,67 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: kolzchut-ragbot
3
- Version: 1.4.2
4
- Summary: A search engine using machine learning models and Elasticsearch for advanced document retrieval.
5
- Home-page: https://github.com/shmuelrob/rag-bot
6
- Author: Shmuel Robinov
7
- Author-email: shmuel_robinov@webiks.com
8
- Classifier: Programming Language :: Python :: 3
9
- Classifier: License :: OSI Approved :: MIT License
10
- Classifier: Operating System :: OS Independent
11
- Requires-Python: >=3.10
12
- Description-Content-Type: text/markdown
13
- Requires-Dist: elasticsearch==8.17.1
14
- Requires-Dist: sentence-transformers==3.4.1
15
- Requires-Dist: torch==2.6.0
16
- Requires-Dist: transformers==4.48.3
17
- Dynamic: author
18
- Dynamic: author-email
19
- Dynamic: classifier
20
- Dynamic: description
21
- Dynamic: description-content-type
22
- Dynamic: home-page
23
- Dynamic: requires-dist
24
- Dynamic: requires-python
25
- Dynamic: summary
26
-
27
- # **Webiks-Hebrew-RAGbot**
28
-
29
- ## **Overview**
30
-
31
- This project is a search engine that uses machine learning models and Elasticsearch to provide advanced document retrieval.
32
- You can use [Webiks-Hebrew-RAGbot-Demo](https://github.com/NNLP-IL/Webiks-Hebrew-RAGbot-Demo) to demonstrate the engine's document retrieval abilities.
33
-
34
- ## **Features**
35
-
36
- Document representation and validation
37
- Document embedding and indexing in Elasticsearch
38
- Advanced search using machine learning model
39
- Integration with LLM (Large Language Model) client for query answering
40
-
41
- ## **Installation**
42
-
43
- 1. Clone the repository:
44
-
45
- `git clone https://github.com/NNLP-IL/Webiks-Hebrew-RAGbot.git`
46
-
47
- `cd Webiks-Hebrew-RAGbot`
48
-
49
- 2. Create a virtual environment and activate it:  
50
-
51
- `python -m venv venv`
52
-
53
- `source venv/bin/activate`
54
-
55
- On Windows use `venv\Scripts\activate`
56
-
57
- 3. Install the required dependencies:  
58
-
59
- `pip install -r requirements.txt`
60
-
61
- ## **Configuration**
62
-
63
- Set the following environment variables:  
64
-
65
- ES\_EMBEDDING\_INDEX: The name of the Elasticsearch index for embeddings.
66
-
67
- TOKENIZER\_LOCATION: The location of the tokenizer model.
@@ -1,11 +0,0 @@
1
- kolzchut_ragbot/Document.py,sha256=ySawnD06HA0zHjHp4Y_CPjMMZqLp8onaEgd1dGP5sbs,3722
2
- kolzchut_ragbot/IntegrateService.py,sha256=CqB9vW6W5oj6Ig3crEa6hXqwro21z97UaG9ngxFTzYs,100
3
- kolzchut_ragbot/__init__.py,sha256=KKAc2xjCl5Aui2Cj0FWyvJ51nmnFv7MspLMqOYb-QHA,26
4
- kolzchut_ragbot/config.py,sha256=pcKVJVJ8P2YximjTrmVlrocHXSmzmNu_DFzNoPLa22E,138
5
- kolzchut_ragbot/engine.py,sha256=V8WUWyqvBWbGt-rRRf8G6BEyD-4GjsmtJrxBb6aPon8,10154
6
- kolzchut_ragbot/llm_client.py,sha256=q_cUZq645P7i1PliYzpJRTWlsoSECVIhE-y9wU5eRtQ,352
7
- kolzchut_ragbot/model.py,sha256=M7i9B-zzwa-ATblY-5c7gmbkOXKwS8wWmYMP8l0HE40,6502
8
- kolzchut_ragbot-1.4.2.dist-info/METADATA,sha256=YctMSApfbXBBvx7d0APy28lDWmulO3cfHJKjKOZ_LYQ,2024
9
- kolzchut_ragbot-1.4.2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
10
- kolzchut_ragbot-1.4.2.dist-info/top_level.txt,sha256=NTZoY4GGw3v_7jm0MgcdHw8simoZ78PsR7Meqmkgd_Q,16
11
- kolzchut_ragbot-1.4.2.dist-info/RECORD,,