kolzchut-ragbot 1.4.1__tar.gz → 1.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. kolzchut_ragbot-1.5.0/PKG-INFO +76 -0
  2. kolzchut_ragbot-1.5.0/README.md +50 -0
  3. {kolzchut_ragbot-1.4.1 → kolzchut_ragbot-1.5.0}/kolzchut_ragbot/Document.py +101 -101
  4. {kolzchut_ragbot-1.4.1 → kolzchut_ragbot-1.5.0}/kolzchut_ragbot/IntegrateService.py +4 -4
  5. kolzchut_ragbot-1.5.0/kolzchut_ragbot/__init__.py +0 -0
  6. {kolzchut_ragbot-1.4.1 → kolzchut_ragbot-1.5.0}/kolzchut_ragbot/config.py +5 -5
  7. {kolzchut_ragbot-1.4.1 → kolzchut_ragbot-1.5.0}/kolzchut_ragbot/engine.py +254 -246
  8. {kolzchut_ragbot-1.4.1 → kolzchut_ragbot-1.5.0}/kolzchut_ragbot/llm_client.py +11 -11
  9. {kolzchut_ragbot-1.4.1 → kolzchut_ragbot-1.5.0}/kolzchut_ragbot/model.py +182 -179
  10. kolzchut_ragbot-1.5.0/kolzchut_ragbot.egg-info/PKG-INFO +76 -0
  11. {kolzchut_ragbot-1.4.1 → kolzchut_ragbot-1.5.0}/kolzchut_ragbot.egg-info/SOURCES.txt +1 -0
  12. {kolzchut_ragbot-1.4.1 → kolzchut_ragbot-1.5.0}/pyproject.toml +33 -33
  13. {kolzchut_ragbot-1.4.1 → kolzchut_ragbot-1.5.0}/setup.cfg +4 -4
  14. {kolzchut_ragbot-1.4.1 → kolzchut_ragbot-1.5.0}/setup.py +30 -30
  15. {kolzchut_ragbot-1.4.1 → kolzchut_ragbot-1.5.0}/test/test_configs.py +41 -41
  16. kolzchut_ragbot-1.5.0/test/test_docs.py +48 -0
  17. {kolzchut_ragbot-1.4.1 → kolzchut_ragbot-1.5.0}/test/test_document.py +46 -54
  18. {kolzchut_ragbot-1.4.1 → kolzchut_ragbot-1.5.0}/test/test_engine.py +159 -172
  19. {kolzchut_ragbot-1.4.1 → kolzchut_ragbot-1.5.0}/test/test_model.py +143 -153
  20. kolzchut_ragbot-1.4.1/PKG-INFO +0 -67
  21. kolzchut_ragbot-1.4.1/README.md +0 -41
  22. kolzchut_ragbot-1.4.1/kolzchut_ragbot/__init__.py +0 -2
  23. kolzchut_ragbot-1.4.1/kolzchut_ragbot.egg-info/PKG-INFO +0 -67
  24. {kolzchut_ragbot-1.4.1 → kolzchut_ragbot-1.5.0}/kolzchut_ragbot.egg-info/dependency_links.txt +0 -0
  25. {kolzchut_ragbot-1.4.1 → kolzchut_ragbot-1.5.0}/kolzchut_ragbot.egg-info/requires.txt +0 -0
  26. {kolzchut_ragbot-1.4.1 → kolzchut_ragbot-1.5.0}/kolzchut_ragbot.egg-info/top_level.txt +0 -0
@@ -0,0 +1,76 @@
1
+ Metadata-Version: 2.4
2
+ Name: kolzchut-ragbot
3
+ Version: 1.5.0
4
+ Summary: A search engine using machine learning models and Elasticsearch for advanced document retrieval.
5
+ Home-page: https://github.com/shmuelrob/rag-bot
6
+ Author: Shmuel Robinov
7
+ Author-email: shmuel_robinov@webiks.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: elasticsearch==8.17.1
14
+ Requires-Dist: sentence-transformers==3.4.1
15
+ Requires-Dist: torch==2.6.0
16
+ Requires-Dist: transformers==4.48.3
17
+ Dynamic: author
18
+ Dynamic: author-email
19
+ Dynamic: classifier
20
+ Dynamic: description
21
+ Dynamic: description-content-type
22
+ Dynamic: home-page
23
+ Dynamic: requires-dist
24
+ Dynamic: requires-python
25
+ Dynamic: summary
26
+
27
+ # kolzchut-ragbot
28
+
29
+ ## Overview
30
+
31
+ This project is a search engine that uses machine learning models and Elasticsearch to provide advanced document retrieval.
32
+ You can use [kolzchut-ragbot](https://github.com/shmuelrob/rag-bot) to demonstrate the engine's document retrieval abilities.
33
+
34
+ ## Features
35
+
36
+ - Document representation and validation
37
+ - Document embedding and indexing in Elasticsearch
38
+ - Advanced search using machine learning models
39
+ - Integration with LLM (Large Language Model) client for query answering
40
+
41
+ ## Installation
42
+
43
+ ### From PyPI
44
+
45
+ ```bash
46
+ pip install kolzchut-ragbot
47
+ ```
48
+
49
+ ### From Source
50
+
51
+ 1. Clone the repository:
52
+
53
+ ```bash
54
+ git clone https://github.com/shmuelrob/rag-bot.git
55
+ cd rag-bot
56
+ ```
57
+
58
+ 2. Create a virtual environment and activate it:
59
+
60
+ ```bash
61
+ python -m venv venv
62
+ source venv/bin/activate # On Windows use: venv\Scripts\activate
63
+ ```
64
+
65
+ 3. Install the required dependencies:
66
+
67
+ ```bash
68
+ pip install -r requirements.txt
69
+ ```
70
+
71
+ ## Configuration
72
+
73
+ Set the following environment variables:
74
+
75
+ - `ES_EMBEDDING_INDEX`: The name of the Elasticsearch index for embeddings.
76
+ - `TOKENIZER_LOCATION`: The location of the tokenizer model.
@@ -0,0 +1,50 @@
1
+ # kolzchut-ragbot
2
+
3
+ ## Overview
4
+
5
+ This project is a search engine that uses machine learning models and Elasticsearch to provide advanced document retrieval.
6
+ You can use [kolzchut-ragbot](https://github.com/shmuelrob/rag-bot) to demonstrate the engine's document retrieval abilities.
7
+
8
+ ## Features
9
+
10
+ - Document representation and validation
11
+ - Document embedding and indexing in Elasticsearch
12
+ - Advanced search using machine learning models
13
+ - Integration with LLM (Large Language Model) client for query answering
14
+
15
+ ## Installation
16
+
17
+ ### From PyPI
18
+
19
+ ```bash
20
+ pip install kolzchut-ragbot
21
+ ```
22
+
23
+ ### From Source
24
+
25
+ 1. Clone the repository:
26
+
27
+ ```bash
28
+ git clone https://github.com/shmuelrob/rag-bot.git
29
+ cd rag-bot
30
+ ```
31
+
32
+ 2. Create a virtual environment and activate it:
33
+
34
+ ```bash
35
+ python -m venv venv
36
+ source venv/bin/activate # On Windows use: venv\Scripts\activate
37
+ ```
38
+
39
+ 3. Install the required dependencies:
40
+
41
+ ```bash
42
+ pip install -r requirements.txt
43
+ ```
44
+
45
+ ## Configuration
46
+
47
+ Set the following environment variables:
48
+
49
+ - `ES_EMBEDDING_INDEX`: The name of the Elasticsearch index for embeddings.
50
+ - `TOKENIZER_LOCATION`: The location of the tokenizer model.
@@ -1,101 +1,101 @@
1
- import json
2
- import os
3
-
4
- DEFINITIONS_FILE = os.getenv("DOCUMENT_DEFINITION_CONFIG", "example-conf.json")
5
-
6
-
7
- class DocumentFieldDefinition:
8
- """
9
- Represents the definition of a document field.
10
-
11
- Attributes:
12
- field_name (str): The name of the field.
13
- required (bool): Indicates if the field is required. Default is False.
14
- """
15
- def __init__(self, field_name: str, required: bool = False):
16
- self.field_name = field_name
17
- self.required = required
18
-
19
-
20
- class DocumentDefinitions:
21
- """
22
- Represents the definitions for a document.
23
-
24
- Attributes:
25
- saved_fields (dict[str, DocumentFieldDefinition]): A dictionary of saved fields.
26
- models (dict[str, str]): A dictionary of models.
27
- identifier (str): The identifier field.
28
- field_for_llm (str, optional): The field for LLM. Default is None.
29
- """
30
- def __init__(self, saved_fields: dict[str, DocumentFieldDefinition], models: dict[str, str],
31
- identifier: str, field_for_llm: str = None):
32
- self.saved_fields = saved_fields
33
- self.models = models
34
- self.identifier = identifier
35
- self.field_for_llm = field_for_llm
36
-
37
-
38
- def initialize_definitions():
39
- """
40
- Initializes the document definitions by reading the configuration file.
41
-
42
- Raises:
43
- ValueError: If the identifier field is not one of the saved fields or if any model field is not one of the saved fields.
44
-
45
- Returns:
46
- DocumentDefinitions: The initialized document definitions.
47
- """
48
- with open(DEFINITIONS_FILE, 'r', encoding='utf-8') as f:
49
- definitions = json.load(f)
50
-
51
- saved_fields = definitions['saved_fields']
52
- models = definitions['models']
53
- identifier_field = definitions['identifier_field']
54
- field_for_llm = definitions.get('field_for_llm', None)
55
- if identifier_field not in saved_fields.keys():
56
- raise ValueError("identifier_field must be one of the saved fields, check the configuration file")
57
-
58
- for embedded_field in models.values():
59
- if embedded_field not in saved_fields.keys():
60
- raise ValueError(f"{embedded_field} must be one of the saved fields {saved_fields.keys()}, check the configuration file")
61
-
62
- return DocumentDefinitions(saved_fields, models, identifier_field, field_for_llm)
63
-
64
-
65
- definitions_singleton = None
66
-
67
-
68
- def factory():
69
- """
70
- Factory method to get the singleton instance of DocumentDefinitions.
71
-
72
- Returns:
73
- DocumentDefinitions: The singleton instance of document definitions.
74
- """
75
- global definitions_singleton
76
- if definitions_singleton is None:
77
- definitions_singleton = initialize_definitions()
78
- return definitions_singleton
79
-
80
-
81
- class Document:
82
- """
83
- Represents a document.
84
-
85
- Attributes:
86
- page_id (str): The ID of the page.
87
- fields (dict): The fields of the document.
88
-
89
- Raises:
90
- ValueError: If the fields do not match the required fields or if a required field is missing.
91
- """
92
- def __init__(self, page_id: str, fields: dict):
93
- definitions = factory()
94
- self.page_id = page_id
95
- if fields.keys() != definitions.saved_fields.keys():
96
- raise ValueError("fields do not match the required fields")
97
- for defined_field in definitions.saved_fields.values():
98
- if defined_field.required and defined_field.field_name not in fields:
99
- raise ValueError(f"field {defined_field.field_name} is required")
100
- if defined_field.field_name in fields:
101
- setattr(self, defined_field.field_name, fields[defined_field.field_name])
1
+ import json
2
+ import os
3
+
4
+ DEFINITIONS_FILE = os.getenv("DOCUMENT_DEFINITION_CONFIG", "example-conf.json")
5
+
6
+
7
+ class DocumentFieldDefinition:
8
+ """
9
+ Represents the definition of a document field.
10
+
11
+ Attributes:
12
+ field_name (str): The name of the field.
13
+ required (bool): Indicates if the field is required. Default is False.
14
+ """
15
+ def __init__(self, field_name: str, required: bool = False):
16
+ self.field_name = field_name
17
+ self.required = required
18
+
19
+
20
+ class DocumentDefinitions:
21
+ """
22
+ Represents the definitions for a document.
23
+
24
+ Attributes:
25
+ saved_fields (dict[str, DocumentFieldDefinition]): A dictionary of saved fields.
26
+ models (dict[str, str]): A dictionary of models.
27
+ identifier (str): The identifier field.
28
+ field_for_llm (str, optional): The field for LLM. Default is None.
29
+ """
30
+ def __init__(self, saved_fields: dict[str, DocumentFieldDefinition], models: dict[str, str],
31
+ identifier: str, field_for_llm: str = None):
32
+ self.saved_fields = saved_fields
33
+ self.models = models
34
+ self.identifier = identifier
35
+ self.field_for_llm = field_for_llm
36
+
37
+
38
+ def initialize_definitions():
39
+ """
40
+ Initializes the document definitions by reading the configuration file.
41
+
42
+ Raises:
43
+ ValueError: If the identifier field is not one of the saved fields or if any model field is not one of the saved fields.
44
+
45
+ Returns:
46
+ DocumentDefinitions: The initialized document definitions.
47
+ """
48
+ with open(DEFINITIONS_FILE, 'r', encoding='utf-8') as f:
49
+ definitions = json.load(f)
50
+
51
+ saved_fields = definitions['saved_fields']
52
+ models = definitions['models']
53
+ identifier_field = definitions['identifier_field']
54
+ field_for_llm = definitions.get('field_for_llm', None)
55
+ if identifier_field not in saved_fields.keys():
56
+ raise ValueError("identifier_field must be one of the saved fields, check the configuration file")
57
+
58
+ for embedded_field in models.values():
59
+ if embedded_field not in saved_fields.keys():
60
+ raise ValueError(f"{embedded_field} must be one of the saved fields {saved_fields.keys()}, check the configuration file")
61
+
62
+ return DocumentDefinitions(saved_fields, models, identifier_field, field_for_llm)
63
+
64
+
65
+ definitions_singleton = None
66
+
67
+
68
+ def factory():
69
+ """
70
+ Factory method to get the singleton instance of DocumentDefinitions.
71
+
72
+ Returns:
73
+ DocumentDefinitions: The singleton instance of document definitions.
74
+ """
75
+ global definitions_singleton
76
+ if definitions_singleton is None:
77
+ definitions_singleton = initialize_definitions()
78
+ return definitions_singleton
79
+
80
+
81
+ class Document:
82
+ """
83
+ Represents a document.
84
+
85
+ Attributes:
86
+ page_id (str): The ID of the page.
87
+ fields (dict): The fields of the document.
88
+
89
+ Raises:
90
+ ValueError: If the fields do not match the required fields or if a required field is missing.
91
+ """
92
+ def __init__(self, page_id: str, fields: dict):
93
+ definitions = factory()
94
+ self.page_id = page_id
95
+ if fields.keys() != definitions.saved_fields.keys():
96
+ raise ValueError("fields do not match the required fields")
97
+ for defined_field in definitions.saved_fields.values():
98
+ if defined_field.required and defined_field.field_name not in fields:
99
+ raise ValueError(f"field {defined_field.field_name} is required")
100
+ if defined_field.field_name in fields:
101
+ setattr(self, defined_field.field_name, fields[defined_field.field_name])
@@ -1,4 +1,4 @@
1
- class IntegrateService:
2
-
3
- def on_update_docs(self, _docs):
4
- raise NotImplementedError
1
+ class IntegrateService:
2
+
3
+ def on_update_docs(self, _docs):
4
+ raise NotImplementedError
File without changes
@@ -1,5 +1,5 @@
1
- import os
2
-
3
- EMBEDDING_INDEX = os.getenv("ES_EMBEDDING_INDEX", "embeddings")
4
- MODELS_LOCATION = os.getenv("MODELS_LOCATION", "models")
5
-
1
+ import os
2
+
3
+ EMBEDDING_INDEX = os.getenv("ES_EMBEDDING_INDEX", "embeddings")
4
+ MODELS_LOCATION = os.getenv("MODELS_LOCATION", "models")
5
+