core-semantic-search-app 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- core_semantic_search_app-1.0.0/LICENSE.md +35 -0
- core_semantic_search_app-1.0.0/MANIFEST.in +5 -0
- core_semantic_search_app-1.0.0/PKG-INFO +36 -0
- core_semantic_search_app-1.0.0/README.rst +26 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/__init__.py +4 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/admin.py +15 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/apps.py +54 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/commons/__init__.py +0 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/commons/exceptions.py +9 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/components/__init__.py +0 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/components/data/__init__.py +0 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/components/data/watch.py +30 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/components/document/__init__.py +0 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/components/document/api.py +279 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/components/document/models.py +22 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/components/model_settings/__init__.py +0 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/components/model_settings/admin_site.py +49 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/components/model_settings/api.py +23 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/components/model_settings/forms.py +44 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/components/model_settings/models.py +95 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/menus.py +14 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/migrations/0001_initial.py +34 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/migrations/0002_modelsettings.py +75 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/migrations/__init__.py +0 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/rest/__init__.py +0 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/rest/urls.py +14 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/rest/views.py +141 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/settings.py +23 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/static/core_semantic_search_app/css/search_box.css +41 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/static/core_semantic_search_app/js/search_box.js +229 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/static/core_semantic_search_app/js/search_box.raw.js +1 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/tasks.py +18 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/templates/core_semantic_search_app/user/index.html +45 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/templates/core_semantic_search_app/user/modals/settings.html +53 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/urls.py +19 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/utils/__init__.py +0 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/utils/chunking_utils.py +201 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/utils/model_utils/__init__.py +0 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/utils/model_utils/model_api.py +112 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/utils/model_utils/model_client.py +60 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/utils/model_utils/response.py +62 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/views/__init__.py +0 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/views/user/__init__.py +0 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app/views/user/views.py +59 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app.egg-info/PKG-INFO +36 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app.egg-info/SOURCES.txt +74 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app.egg-info/dependency_links.txt +1 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app.egg-info/requires.txt +7 -0
- core_semantic_search_app-1.0.0/core_semantic_search_app.egg-info/top_level.txt +2 -0
- core_semantic_search_app-1.0.0/pyproject.toml +7 -0
- core_semantic_search_app-1.0.0/requirements.core.txt +1 -0
- core_semantic_search_app-1.0.0/requirements.txt +3 -0
- core_semantic_search_app-1.0.0/setup.cfg +4 -0
- core_semantic_search_app-1.0.0/setup.py +71 -0
- core_semantic_search_app-1.0.0/tests/__init__.py +0 -0
- core_semantic_search_app-1.0.0/tests/components/__init__.py +0 -0
- core_semantic_search_app-1.0.0/tests/components/data/__init__.py +0 -0
- core_semantic_search_app-1.0.0/tests/components/data/tests_unit.py +44 -0
- core_semantic_search_app-1.0.0/tests/components/document/__init__.py +0 -0
- core_semantic_search_app-1.0.0/tests/components/document/tests_unit.py +564 -0
- core_semantic_search_app-1.0.0/tests/components/document/tests_unit_tasks.py +30 -0
- core_semantic_search_app-1.0.0/tests/components/model_settings/__init__.py +0 -0
- core_semantic_search_app-1.0.0/tests/components/model_settings/tests_unit.py +234 -0
- core_semantic_search_app-1.0.0/tests/rest/__init__.py +0 -0
- core_semantic_search_app-1.0.0/tests/rest/tests_permissions.py +77 -0
- core_semantic_search_app-1.0.0/tests/rest/tests_unit.py +213 -0
- core_semantic_search_app-1.0.0/tests/test_settings.py +69 -0
- core_semantic_search_app-1.0.0/tests/tests_unit_apps.py +42 -0
- core_semantic_search_app-1.0.0/tests/urls.py +9 -0
- core_semantic_search_app-1.0.0/tests/utils/__init__.py +0 -0
- core_semantic_search_app-1.0.0/tests/utils/tests_unit.py +532 -0
- core_semantic_search_app-1.0.0/tests/utils/tests_unit_model_api.py +134 -0
- core_semantic_search_app-1.0.0/tests/utils/tests_unit_model_client.py +92 -0
- core_semantic_search_app-1.0.0/tests/views/__init__.py +0 -0
- core_semantic_search_app-1.0.0/tests/views/user/__init__.py +0 -0
- core_semantic_search_app-1.0.0/tests/views/user/tests_unit.py +64 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# NIST Software Licensing Statement
|
|
2
|
+
|
|
3
|
+
NIST-developed software is provided by NIST as a public service.
|
|
4
|
+
You may use, copy, and distribute copies of the software in any
|
|
5
|
+
medium, provided that you keep intact this entire notice. You may
|
|
6
|
+
improve, modify, and create derivative works of the software or
|
|
7
|
+
any portion of the software, and you may copy and distribute such
|
|
8
|
+
modifications or works. Modified works should carry a notice
|
|
9
|
+
stating that you changed the software and should note the date
|
|
10
|
+
and nature of any such change. Please explicitly acknowledge the
|
|
11
|
+
National Institute of Standards and Technology as the source of
|
|
12
|
+
the software.
|
|
13
|
+
|
|
14
|
+
NIST-developed software is expressly provided "AS IS." NIST MAKES
|
|
15
|
+
NO WARRANTY OF ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY
|
|
16
|
+
OPERATION OF LAW, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
|
|
17
|
+
WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE,
|
|
18
|
+
NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS NOR
|
|
19
|
+
WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED
|
|
20
|
+
OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES
|
|
21
|
+
NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE
|
|
22
|
+
SOFTWARE OR THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE
|
|
23
|
+
CORRECTNESS, ACCURACY, RELIABILITY, OR USEFULNESS OF THE
|
|
24
|
+
SOFTWARE.
|
|
25
|
+
|
|
26
|
+
You are solely responsible for determining the appropriateness of
|
|
27
|
+
using and distributing the software and you assume all risks
|
|
28
|
+
associated with its use, including but not limited to the risks
|
|
29
|
+
and costs of program errors, compliance with applicable laws,
|
|
30
|
+
damage to or loss of data, programs or equipment, and the
|
|
31
|
+
unavailability or interruption of operation. This software is not
|
|
32
|
+
intended to be used in any situation where a failure could cause
|
|
33
|
+
risk of injury or damage to property. The software developed by
|
|
34
|
+
NIST employees is not subject to copyright protection within the
|
|
35
|
+
United States.
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: core_semantic_search_app
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Semantic Search utils for the curator core project
|
|
5
|
+
Home-page: https://github.com/usnistgov/core_semantic_search_app
|
|
6
|
+
Author: NIST IT Lab
|
|
7
|
+
Author-email: itl_inquiries@nist.gov
|
|
8
|
+
Provides-Extra: sentence_transformers
|
|
9
|
+
License-File: LICENSE.md
|
|
10
|
+
|
|
11
|
+
========================
|
|
12
|
+
Core Semantic Search App
|
|
13
|
+
========================
|
|
14
|
+
|
|
15
|
+
Semantic search for the curator core project.
|
|
16
|
+
|
|
17
|
+
Quick start
|
|
18
|
+
===========
|
|
19
|
+
|
|
20
|
+
1. Add "core_semantic_search_app" to your INSTALLED_APPS setting
|
|
21
|
+
----------------------------------------------------------------
|
|
22
|
+
|
|
23
|
+
.. code:: python
|
|
24
|
+
|
|
25
|
+
INSTALLED_APPS = [
|
|
26
|
+
...
|
|
27
|
+
'core_semantic_search_app',
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
2. Include the core_semantic_search_app URLconf in your project urls.py
|
|
32
|
+
-----------------------------------------------------------------------
|
|
33
|
+
|
|
34
|
+
.. code:: python
|
|
35
|
+
|
|
36
|
+
re_path(r'^semantic-search/', include('core_semantic_search_app.urls')),
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
========================
|
|
2
|
+
Core Semantic Search App
|
|
3
|
+
========================
|
|
4
|
+
|
|
5
|
+
Semantic search for the curator core project.
|
|
6
|
+
|
|
7
|
+
Quick start
|
|
8
|
+
===========
|
|
9
|
+
|
|
10
|
+
1. Add "core_semantic_search_app" to your INSTALLED_APPS setting
|
|
11
|
+
----------------------------------------------------------------
|
|
12
|
+
|
|
13
|
+
.. code:: python
|
|
14
|
+
|
|
15
|
+
INSTALLED_APPS = [
|
|
16
|
+
...
|
|
17
|
+
'core_semantic_search_app',
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
2. Include the core_semantic_search_app URLconf in your project urls.py
|
|
22
|
+
-----------------------------------------------------------------------
|
|
23
|
+
|
|
24
|
+
.. code:: python
|
|
25
|
+
|
|
26
|
+
re_path(r'^semantic-search/', include('core_semantic_search_app.urls')),
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
""" Url router for the administration site
|
|
2
|
+
"""
|
|
3
|
+
|
|
4
|
+
from django.contrib import admin
|
|
5
|
+
|
|
6
|
+
from core_semantic_search_app.components.document.models import Document
|
|
7
|
+
from core_semantic_search_app.components.model_settings.admin_site import (
|
|
8
|
+
CustomModelSettingsAdmin,
|
|
9
|
+
)
|
|
10
|
+
from core_semantic_search_app.components.model_settings.models import (
|
|
11
|
+
ModelSettings,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
# Expose the indexed documents and the model settings in the admin site;
# the settings model uses its custom admin class.
admin.site.register(Document)
admin.site.register(ModelSettings, admin_class=CustomModelSettingsAdmin)
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
""" Apps file for core_semantic_search_app
|
|
2
|
+
"""
|
|
3
|
+
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
from django.apps import AppConfig
|
|
7
|
+
from django.db.models.signals import post_save, post_delete
|
|
8
|
+
|
|
9
|
+
from core_main_app.utils.databases.backend import uses_postgresql_backend
|
|
10
|
+
from core_semantic_search_app.commons.exceptions import SemanticSearchError
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class SemanticSearchAppConfig(AppConfig):
    """Application configuration for core_semantic_search_app"""

    name = "core_semantic_search_app"

    def ready(self):
        """Check the settings and connect the signals once the app is loaded

        Both steps are skipped when "migrate" appears in the command line
        arguments.

        Returns:

        """
        running_migrate = "migrate" in sys.argv
        if not running_migrate:
            _check_settings()
            _init_signals()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _check_settings():
    """Ensure the project runs on a PostgreSQL database backend

    Raises:
        SemanticSearchError: when uses_postgresql_backend() is falsy.

    Returns:

    """
    postgresql_in_use = uses_postgresql_backend()
    if postgresql_in_use:
        return
    raise SemanticSearchError("PostgreSQL with Pgvector is required.")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _init_signals():
    """Connect the Data save/delete signals to the semantic search watchers

    Returns:

    """
    # Imported here rather than at module level, as in the original code.
    from core_main_app.components.data.models import Data
    from core_semantic_search_app.components.data.watch import (
        post_save_data,
        post_delete_data,
    )

    receivers = (
        (post_save, post_save_data),
        (post_delete, post_delete_data),
    )
    for signal, receiver in receivers:
        signal.connect(receiver, sender=Data)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
""" Watchers for the data collection
|
|
2
|
+
"""
|
|
3
|
+
|
|
4
|
+
from core_semantic_search_app.components.document.api import (
|
|
5
|
+
index_documents_from_data,
|
|
6
|
+
delete_documents_with_data_id,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def post_save_data(sender, instance, **kwargs):
    """Signal receiver run after a Data object is saved.

    Hands the saved object over to index_documents_from_data.

    Args:
        sender: Class sending the signal.
        instance: Data object that was saved.
        **kwargs: Extra signal arguments (unused).

    """
    saved_data = instance
    index_documents_from_data(data=saved_data)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def post_delete_data(sender, instance, **kwargs):
    """Signal receiver run after a Data object is deleted.

    Removes the documents whose metadata reference the deleted object.

    Args:
        sender: Class sending the signal.
        instance: Data object that was deleted.
        **kwargs: Extra signal arguments (unused).

    """
    deleted_data_id = instance.id
    delete_documents_with_data_id(deleted_data_id)
|
|
File without changes
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
""" Document API
|
|
2
|
+
"""
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from django.db.models import OuterRef, Exists, IntegerField
|
|
8
|
+
from django.db.models.fields.json import KeyTextTransform
|
|
9
|
+
from django.db.models.functions import Cast
|
|
10
|
+
from pgvector.django import CosineDistance, L2Distance, MaxInnerProduct
|
|
11
|
+
|
|
12
|
+
from core_main_app.access_control.api import has_perm_administration
|
|
13
|
+
from core_main_app.access_control.decorators import access_control
|
|
14
|
+
from core_main_app.commons.exceptions import ApiError
|
|
15
|
+
from core_main_app.components.data import api as data_api
|
|
16
|
+
from core_main_app.components.workspace import api as workspace_api
|
|
17
|
+
from core_semantic_search_app import tasks as semantic_search_tasks
|
|
18
|
+
from core_semantic_search_app.components.document.models import Document
|
|
19
|
+
from core_semantic_search_app.components.model_settings.models import (
|
|
20
|
+
ModelSettings,
|
|
21
|
+
)
|
|
22
|
+
from core_semantic_search_app.settings import (
|
|
23
|
+
INSTALLED_APPS,
|
|
24
|
+
)
|
|
25
|
+
from core_semantic_search_app.utils.chunking_utils import chunk_json_dict
|
|
26
|
+
|
|
27
|
+
if "core_linked_records_app" in INSTALLED_APPS:
|
|
28
|
+
from core_linked_records_app.system.pid_path import (
|
|
29
|
+
api as system_pid_path_api,
|
|
30
|
+
)
|
|
31
|
+
from core_linked_records_app.utils.dict import (
|
|
32
|
+
is_dot_notation_in_dictionary,
|
|
33
|
+
get_value_from_dot_notation,
|
|
34
|
+
)
|
|
35
|
+
logger = logging.getLogger(__name__)
|
|
36
|
+
|
|
37
|
+
# Supported vector function names mapped to the pgvector expression class
# used to order documents against a query embedding.
VECTOR_FUNCTIONS = dict(
    cosine_similarity=CosineDistance,
    l2_distance=L2Distance,
    max_inner_product=MaxInnerProduct,
)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# NOTE: called from task
|
|
45
|
+
def generate_documents_from_data(data):
    """Split the content of a data into chunks and build one Document per chunk

    Each document carries the title, id and (when it can be resolved) PID of
    the data in its metadata. The documents are built but not saved here.

    Args:
        data: Data object to convert.

    Returns:
        list of Document objects, one per text chunk.

    """
    data_pid = None
    if "core_linked_records_app" in INSTALLED_APPS:
        try:
            pid_path = system_pid_path_api.get_pid_path_by_template(
                data.template,
            ).path

            # The PID stays None when the path is absent from the content
            content_dict = data.get_dict_content()
            if is_dot_notation_in_dictionary(content_dict, pid_path):
                data_pid = get_value_from_dot_notation(content_dict, pid_path)
        except Exception as e:
            # Best effort: a failed PID lookup is logged, not propagated
            logger.error(
                f"Unable to get data PID during data to document conversion: {str(e)}"
            )

    model_settings = ModelSettings.get()

    # Only the "VALUES" strategy restricts chunking to the configured fields
    if model_settings.document_index_strategy == "VALUES":
        target_keys = model_settings.document_index_fields
    else:
        target_keys = None

    text_chunks = chunk_json_dict(
        json_dict=data.get_dict_content(),
        chunk_size=model_settings.sliding_window_chunk_length,
        chunk_overlap=model_settings.sliding_window_chunk_overlap,
        target_keys=target_keys,
    )

    return [
        Document(
            content=text_chunk,
            meta={
                "title": data.title,
                "data_id": data.id,
                "data_pid": data_pid,
            },
        )
        for text_chunk in text_chunks
    ]
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# NOTE: called from data watch
|
|
99
|
+
def delete_documents_with_data_id(data_id):
    """Remove every indexed document whose metadata point to the given data

    Args:
        data_id: id of the data whose documents are deleted.

    Returns:

    """
    # Look up the matching documents and delete them in one go
    _get_documents_by_data_id(data_id=data_id).delete()
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# NOTE: called from data watch
|
|
115
|
+
def index_documents_from_data(data):
    """Refresh the indexed documents of a data

    Nothing happens when no model settings or no embedding model is
    configured. Otherwise the previous documents of the data are deleted, and
    a write_documents task is queued only if the template name matches the
    configured filter and the data belongs to a public workspace.

    Args:
        data: Data object to index.

    Returns:

    """
    model_settings = ModelSettings.get()
    if not (model_settings and model_settings.embedding_models.keys()):
        return

    # Stale documents are removed before any eligibility check
    delete_documents_with_data_id(data.id)

    template_matches = _check_template_name(
        data, model_settings.document_index_template_name_filter
    )
    # Skip data with a non-matching template or without a workspace
    if not template_matches or not data.workspace:
        return

    # Only data from a public workspace is indexed
    public_workspace_ids = (
        workspace_api.get_all_public_workspaces().values_list("id", flat=True)
    )
    if data.workspace.id in public_workspace_ids:
        # Indexing itself runs asynchronously in a task
        semantic_search_tasks.write_documents.apply_async((data.id,))
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
@access_control(has_perm_administration)
def reindex(user):
    """Reindex the full knowledge base

    Deletes every indexed document, then queues a write_documents task for
    each public data whose template name matches the configured filter.

    Args:
        user: user requesting the reindex, checked by the access_control
            decorator and used to list the data.

    Returns:

    """
    # Drop all currently indexed documents
    Document.objects.all().delete()

    # Get model settings from database
    model_settings = ModelSettings.get()
    if not model_settings:
        # Same guard as index_documents_from_data: without settings there is
        # no template filter to apply, so stop here instead of failing with
        # an AttributeError on the first record.
        logger.warning("No model settings found: nothing to reindex.")
        return

    # Find all public workspaces
    public_workspaces = workspace_api.get_all_public_workspaces().values_list(
        "id", flat=True
    )
    # Get all public data
    all_data = data_api.get_all(user).filter(workspace__in=public_workspaces)

    # The filter is the same for every record, so read it once
    template_name_filter = model_settings.document_index_template_name_filter
    for data in all_data:
        # Skip records whose template name does not match the pattern
        if not _check_template_name(data, template_name_filter):
            continue
        # Index documents in tasks
        semantic_search_tasks.write_documents.apply_async((data.id,))
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def query(
    query_embedding=None,
    top_k=10,
    threshold=0.8,
    vector_function="cosine_similarity",
    data_filters_qs=None,
):
    """Query the indexed documents

    Args:
        query_embedding: embedding of the query. When empty or None, the
            documents are neither ordered nor scored.
        top_k: maximum number of documents to return (no limit when falsy).
        threshold: score limit (ignored when falsy). Scores must be below it
            for "l2_distance" and above it for the other vector functions.
        vector_function: one of the keys of VECTOR_FUNCTIONS.
        data_filters_qs: optional queryset of data; only documents whose
            meta "data_id" matches a row of this queryset are kept.

    Raises:
        ApiError: when vector_function is not a key of VECTOR_FUNCTIONS.

    Returns:
        Document queryset.

    """
    if vector_function not in list(VECTOR_FUNCTIONS.keys()):
        raise ApiError(
            f"Vector function should be in: {list(VECTOR_FUNCTIONS.keys())}."
        )
    # Get all documents
    queryset = Document.objects.all()

    if data_filters_qs is not None:
        # Pull data_id from meta field
        data_id_expr = Cast(
            KeyTextTransform("data_id", "meta"),
            output_field=IntegerField(),
        )
        # Filter on extracted data_id
        queryset = queryset.annotate(_data_id=data_id_expr).filter(
            Exists(data_filters_qs.filter(id=OuterRef("_data_id")))
        )

    if query_embedding:
        # Order documents by distance to query
        # https://github.com/pgvector/pgvector-python?tab=readme-ov-file#django
        queryset = queryset.order_by(
            VECTOR_FUNCTIONS[vector_function]("embedding", query_embedding)
        )

        # Compute the score between query and documents
        # https://github.com/pgvector/pgvector?tab=readme-ov-file#distances
        if vector_function == "cosine_similarity":
            queryset = queryset.annotate(
                score=1 - CosineDistance("embedding", query_embedding)
            )
        elif vector_function == "max_inner_product":
            queryset = queryset.annotate(
                score=-1 * MaxInnerProduct("embedding", query_embedding)
            )
        elif vector_function == "l2_distance":
            queryset = queryset.annotate(
                score=L2Distance("embedding", query_embedding)
            )

        # Filter score with provided threshold. This must stay inside the
        # query_embedding branch: "score" only exists once it has been
        # annotated above, and filtering on it otherwise fails.
        if threshold:
            if vector_function == "l2_distance":
                queryset = queryset.filter(score__lt=threshold)
            else:
                queryset = queryset.filter(score__gt=threshold)

    # Keep the top k results
    if top_k:
        queryset = queryset.all()[:top_k]

    return queryset
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def _check_template_name(data, pattern):
    """Search the title of the data's template version manager for a pattern

    Args:
        data: Data object whose template is inspected.
        pattern: regular expression searched in the title.

    Returns:
        the match object returned by re.search, or None when nothing matches.

    """
    template_title = data.template.version_manager.title
    return re.search(pattern, template_title)
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def _get_documents_by_data_id(data_id):
    """Select the documents whose meta "data_id" equals the given id

    Args:
        data_id: id of the data; converted with int() before filtering.

    Returns:
        Document queryset.

    """
    numeric_id = int(data_id)
    return Document.objects.filter(meta__data_id=numeric_id)
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
""" Document models
|
|
2
|
+
"""
|
|
3
|
+
|
|
4
|
+
from django.db import models
|
|
5
|
+
from django.db.models import JSONField
|
|
6
|
+
from pgvector.django import VectorField
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Document(models.Model):
    """Indexed document: a text content, its embedding and JSON metadata"""

    # Embedding vector (pgvector field)
    embedding = VectorField()
    # Text content, limited to 4000 characters
    content = models.CharField(unique=False, max_length=4000)
    # JSON metadata, an empty dict by default
    meta = JSONField(default=dict)

    def __str__(self):
        """Return the "title" entry of the metadata, "Untitled" when absent

        Returns:

        """
        title = self.meta.get("title", "Untitled")
        return title
|
|
File without changes
|
core_semantic_search_app-1.0.0/core_semantic_search_app/components/model_settings/admin_site.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
""" Custom admin site for the Model Settings model
|
|
2
|
+
"""
|
|
3
|
+
|
|
4
|
+
from django.contrib import admin
|
|
5
|
+
from django.contrib import messages
|
|
6
|
+
|
|
7
|
+
from core_semantic_search_app.components.document.api import reindex
|
|
8
|
+
from core_semantic_search_app.components.model_settings.forms import (
|
|
9
|
+
ModelSettingsAdminForm,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@admin.action(description="Reindex the knowledge base")
def reindex_action(model_admin, request, queryset):
    """Admin action reindexing the knowledge base, reserved to superusers

    Any other user gets a "Permission denied." error message instead.

    Args:
        model_admin: admin class the action is run from.
        request: current request.
        queryset: selected objects (unused).

    Returns:

    """
    if request.user.is_superuser:
        reindex(request.user)
        return

    model_admin.message_user(request, "Permission denied.", messages.ERROR)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class CustomModelSettingsAdmin(admin.ModelAdmin):
    """Admin for the model settings, with a reindex action"""

    form = ModelSettingsAdminForm
    actions = [reindex_action]

    def has_add_permission(self, request):
        """Refuse adding an object as soon as one already exists

        Args:
            request: current request.

        Returns:
            False when an object exists, the default permission otherwise.

        """
        settings_exist = self.model.objects.exists()
        return False if settings_exist else super().has_add_permission(request)
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
""" ModelSetting api
|
|
2
|
+
"""
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def get_api_key(model_dict):
    """Resolve the API key of a model configuration

    The environment variable named by "api_key_env" wins when it is set to a
    non-empty value; otherwise the "api_key" entry is returned, or "no-key"
    when that entry is missing.

    Args:
        model_dict: model configuration dictionary.

    Returns:
        API key.

    """
    env_var_name = model_dict.get("api_key_env")
    if env_var_name:
        env_value = os.getenv(env_var_name)
        if env_value:
            return env_value
    # No usable value in the environment: fall back on the dictionary
    return model_dict.get("api_key", "no-key")
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
""" Model Settings forms
|
|
2
|
+
"""
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
|
|
6
|
+
from django import forms
|
|
7
|
+
|
|
8
|
+
from core_semantic_search_app.components.model_settings.models import (
|
|
9
|
+
ModelSettings,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class PrettyJSONEncoder(json.JSONEncoder):
    """JSON encoder that always indents by 4 spaces and keeps the key order"""

    def __init__(self, *args, indent=None, sort_keys=False, **kwargs):
        """Build the encoder with a fixed layout

        Args:
            *args: positional arguments forwarded to json.JSONEncoder.
            indent: accepted for compatibility with json.dumps and ignored;
                the indentation is always 4.
            sort_keys: accepted for compatibility with json.dumps and
                ignored; keys are never sorted.
            **kwargs: other keyword arguments forwarded to json.JSONEncoder.

        """
        # Both parameters are named so that the values passed by json.dumps
        # are captured here and do not clash with the fixed ones below. They
        # default to the json.dumps defaults so the encoder can also be
        # instantiated directly.
        super().__init__(*args, indent=4, sort_keys=False, **kwargs)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ModelSettingsAdminForm(forms.ModelForm):
    """Admin form of the model settings"""

    class Meta:
        model = ModelSettings
        fields = "__all__"
        labels = {
            "sliding_window_chunk_length": "Chunk size (characters)",
            "sliding_window_chunk_overlap": "Chunk overlap (characters)",
        }

    def __init__(self, *args, **kwargs):
        """Set the JSON encoder and the placeholder of the embedding models field

        Args:
            *args: positional arguments forwarded to forms.ModelForm.
            **kwargs: keyword arguments forwarded to forms.ModelForm.

        """
        super().__init__(*args, **kwargs)

        # Example configuration shown while the field is empty
        placeholder = """{"modelName":{
"model": "modelName:version",
"base_url": "http://localhost:8080/v1",
"api_key": "",
"api_key_env": "",
"ssl_verify": true,
"proxies": {}
}
}
"""
        embedding_models_field = self.fields["embedding_models"]
        embedding_models_field.encoder = PrettyJSONEncoder
        embedding_models_field.widget.attrs.update({"placeholder": placeholder})
|