scout 3.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scout/__init__.py +1 -0
- scout/__main__.py +5 -0
- scout/client.py +269 -0
- scout/constants.py +7 -0
- scout/exceptions.py +21 -0
- scout/models.py +223 -0
- scout/search.py +135 -0
- scout/serializers.py +100 -0
- scout/server.py +217 -0
- scout/tests.py +991 -0
- scout/validator.py +83 -0
- scout/views.py +472 -0
- scout-3.0.4.dist-info/METADATA +140 -0
- scout-3.0.4.dist-info/RECORD +18 -0
- scout-3.0.4.dist-info/WHEEL +5 -0
- scout-3.0.4.dist-info/entry_points.txt +2 -0
- scout-3.0.4.dist-info/licenses/LICENSE +19 -0
- scout-3.0.4.dist-info/top_level.txt +2 -0
scout/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = '3.0.4'
|
scout/__main__.py
ADDED
scout/client.py
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import json
|
|
3
|
+
try:
|
|
4
|
+
from email.generator import _make_boundary as choose_boundary
|
|
5
|
+
except ImportError:
|
|
6
|
+
from mimetools import choose_boundary
|
|
7
|
+
import mimetypes
|
|
8
|
+
import os
|
|
9
|
+
try:
|
|
10
|
+
from urllib.parse import urlencode
|
|
11
|
+
except ImportError:
|
|
12
|
+
from urllib import urlencode
|
|
13
|
+
try:
|
|
14
|
+
from urllib.request import Request
|
|
15
|
+
from urllib.request import urlopen
|
|
16
|
+
except ImportError:
|
|
17
|
+
from urllib2 import Request
|
|
18
|
+
from urllib2 import urlopen
|
|
19
|
+
import zlib
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
ENDPOINT = None
KEY = None


class Scout(object):
    """
    Thin HTTP client for the REST API of a Scout server.

    Every request goes to ``endpoint`` plus a relative URL. When an API key
    is configured it is sent in the ``key`` request header.
    """
    def __init__(self, endpoint=ENDPOINT, key=KEY):
        """
        :param endpoint: Base URL of the server; trailing slashes are removed.
        :param key: Optional API key sent with every request.
        :raises ValueError: if no endpoint is given and ``ENDPOINT`` is unset.
        """
        # The signature defaults are bound when the class is created, so look
        # the module-level values up again at call time. This lets callers
        # assign ENDPOINT / KEY after import and still get them picked up.
        if endpoint is None:
            endpoint = ENDPOINT
        if key is None:
            key = KEY
        if endpoint is None:
            raise ValueError('An endpoint is required, e.g. '
                             'Scout("http://localhost:8000").')
        self.endpoint = endpoint.rstrip('/')
        self.key = key

    # -- internal helpers ---------------------------------------------------

    def _request_headers(self, content_type=None):
        """Build the header dict, adding the API key when one is set."""
        headers = {}
        if content_type:
            headers['Content-Type'] = content_type
        if self.key:
            headers['key'] = self.key
        return headers

    @staticmethod
    def _add_query(url, params):
        """Append url-encoded ``params`` to ``url`` with the right separator."""
        if not params:
            return url
        query = urlencode(params, True)
        if '?' not in url:
            return url + '?' + query
        if url.endswith(('?', '&')):
            return url + query
        # The URL already carries a query string: separate with "&" instead
        # of gluing the new parameters onto the last existing value.
        return url + '&' + query

    @staticmethod
    def _fetch(request):
        """Send ``request`` and return the raw body, closing the response."""
        fh = urlopen(request)
        try:
            return fh.read()
        finally:
            fh.close()

    @staticmethod
    def _loads(raw):
        """Decode a (possibly bytes) JSON response body."""
        if isinstance(raw, bytes):
            raw = raw.decode('utf-8')
        return json.loads(raw)

    @staticmethod
    def _to_bytes(value):
        """Coerce text or bytes-like ``value`` to ``bytes`` (text as UTF-8)."""
        if isinstance(value, bytes):
            return value
        if hasattr(value, 'encode'):
            return value.encode('utf-8')
        return bytes(value)

    @staticmethod
    def _encode_multipart(boundary, json_data, form_files):
        """
        Build the multipart request body as bytes.

        ``form_files`` is a list of ``(field_name, filename, mimetype, data)``
        tuples. Everything is encoded to bytes before joining so that binary
        file contents can be combined with the textual part headers.
        """
        to_bytes = Scout._to_bytes
        part_boundary = '--' + boundary
        parts = [
            part_boundary,
            'Content-Disposition: form-data; name="data"',
            '',
            json.dumps(json_data)]
        for field_name, filename, mimetype, data in form_files:
            parts.extend((
                part_boundary,
                'Content-Disposition: file; name="%s"; filename="%s"' % (
                    field_name, filename),
                'Content-Type: %s' % mimetype,
                '',
                data))
        parts.append('--' + boundary + '--')
        parts.append('')
        return b'\r\n'.join(to_bytes(part) for part in parts)

    # -- low-level HTTP verbs -----------------------------------------------

    def get_full_url(self, url):
        """Return the absolute URL for a server-relative ``url``."""
        return self.endpoint + url

    def get_raw(self, url, **kwargs):
        """GET ``url`` (``kwargs`` become query parameters); return raw body."""
        url = self._add_query(url, kwargs)
        request = Request(self.get_full_url(url),
                          headers=self._request_headers('application/json'))
        return self._fetch(request)

    def get(self, url, **kwargs):
        """GET ``url`` and return the decoded JSON response."""
        return self._loads(self.get_raw(url, **kwargs))

    def post(self, url, data=None, files=None):
        """POST ``data`` as JSON, or as multipart when ``files`` is given."""
        if files:
            return self.post_files(url, data, files)
        return self.post_json(url, data)

    def post_json(self, url, data=None):
        """POST ``data`` (default ``{}``) as JSON; return the JSON response."""
        payload = self._to_bytes(json.dumps(data or {}))
        request = Request(self.get_full_url(url), data=payload,
                          headers=self._request_headers('application/json'))
        return self._loads(self._fetch(request))

    def post_files(self, url, json_data, files=None):
        """
        POST ``json_data`` plus one or more files as multipart/form-data.

        :param files: dict mapping filename to a file-like object (anything
            with ``read()``) or to the content itself (text or bytes).
        :raises ValueError: if ``files`` is empty or not a dict.
        """
        if not files or not isinstance(files, dict):
            raise ValueError('One or more files is required. Files should be '
                             'passed as a dictionary of filename: file-like-'
                             'object.')
        boundary = choose_boundary()
        form_files = []
        for i, (filename, file_obj) in enumerate(files.items()):
            try:
                data = file_obj.read()
            except AttributeError:
                # Not file-like: treat the object itself as the content.
                data = file_obj
            mimetype = mimetypes.guess_type(filename)[0]
            form_files.append((
                'file_%s' % i,
                filename,
                mimetype or 'application/octet-stream',
                self._to_bytes(data)))

        body = self._encode_multipart(boundary, json_data, form_files)
        headers = self._request_headers(
            'multipart/form-data; boundary=%s' % boundary)
        request = Request(self.get_full_url(url), data=body, headers=headers)
        return self._loads(self._fetch(request))

    def delete(self, url):
        """Send a DELETE request to ``url``; return the JSON response."""
        request = Request(self.get_full_url(url),
                          headers=self._request_headers())
        # Overriding get_method works on both urllib2 and urllib.request.
        request.get_method = lambda: 'DELETE'
        return self._loads(self._fetch(request))

    # -- indexes ------------------------------------------------------------

    def get_indexes(self, **kwargs):
        """Return the ``indexes`` entry of the server's root listing."""
        return self.get('/', **kwargs)['indexes']

    def create_index(self, name):
        """Create an index called ``name``."""
        return self.post('/', {'name': name})

    def rename_index(self, old_name, new_name):
        """Rename index ``old_name`` to ``new_name``."""
        return self.post('/%s/' % old_name, {'name': new_name})

    def delete_index(self, name):
        """Delete the index called ``name``."""
        return self.delete('/%s/' % name)

    def get_index(self, name, **kwargs):
        """Fetch index ``name``; ``kwargs`` are sent as query parameters."""
        return self.get('/%s/' % name, **kwargs)

    # -- documents ----------------------------------------------------------

    def get_documents(self, **kwargs):
        """List documents; ``kwargs`` are sent as query parameters."""
        return self.get('/documents/', **kwargs)

    def create_document(self, content, indexes, identifier=None,
                        attachments=None, **metadata):
        """
        Create a document in one or more indexes.

        A single index is wrapped in a list. Extra keyword arguments are sent
        as the document's metadata; ``attachments`` uses the same format as
        the ``files`` argument of :py:meth:`post_files`.
        """
        if not isinstance(indexes, (list, tuple)):
            indexes = [indexes]
        post_data = {
            'content': content,
            'identifier': identifier,
            'indexes': indexes,
            'metadata': metadata}
        return self.post('/documents/', post_data, attachments)

    def update_document(self, document_id=None, content=None, indexes=None,
                        metadata=None, identifier=None, attachments=None):
        """
        Update the document addressed by ``document_id`` or ``identifier``.

        Only the fields that are not ``None`` are sent.

        :raises ValueError: if neither ``document_id`` nor ``identifier`` is
            given, or if there is nothing to update.
        """
        if not document_id and not identifier:
            raise ValueError('`document_id` must be provided.')

        data = {}
        if content is not None:
            data['content'] = content
        if indexes is not None:
            if not isinstance(indexes, (list, tuple)):
                indexes = [indexes]
            data['indexes'] = indexes
        if metadata is not None:
            data['metadata'] = metadata

        if not data and not attachments:
            raise ValueError('Nothing to update.')

        # Fall back to the identifier so that a call without document_id does
        # not post to "/documents/None/".
        return self.post('/documents/%s/' % (document_id or identifier),
                         data, attachments)

    def delete_document(self, document_id=None):
        """Delete a document. Raises ValueError without ``document_id``."""
        if not document_id:
            raise ValueError('`document_id` must be provided.')

        return self.delete('/documents/%s/' % document_id)

    def get_document(self, document_id=None):
        """Fetch a document. Raises ValueError without ``document_id``."""
        if not document_id:
            raise ValueError('`document_id` must be provided.')

        return self.get('/documents/%s/' % document_id)

    # -- attachments --------------------------------------------------------

    def attach_files(self, document_id, attachments):
        """Upload ``attachments`` (filename -> file) to a document."""
        return self.post_files('/documents/%s/attachments/' % document_id,
                               {}, attachments)

    def detach_file(self, document_id, filename):
        """Remove attachment ``filename`` from a document."""
        return self.delete('/documents/%s/attachments/%s/' %
                           (document_id, filename))

    def update_file(self, document_id, filename, file_object):
        """Upload new content for attachment ``filename``."""
        return self.post_files('/documents/%s/attachments/%s/' %
                               (document_id, filename),
                               {}, {filename: file_object})

    def get_attachments(self, document_id, **kwargs):
        """List a document's attachments; ``kwargs`` are query parameters."""
        return self.get('/documents/%s/attachments/' % document_id, **kwargs)

    def get_attachment(self, document_id, filename):
        """Fetch the JSON description of one attachment."""
        return self.get('/documents/%s/attachments/%s/' %
                        (document_id, filename))

    def download_attachment(self, document_id, filename):
        """Return the raw (undecoded) response body of the download URL."""
        return self.get_raw('/documents/%s/attachments/%s/download/' %
                            (document_id, filename))

    def search_attachments(self, **kwargs):
        """Search attachments; ``kwargs`` are sent as query parameters."""
        return self.get('/documents/attachments/', **kwargs)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
class SearchProvider(object):
    """
    Base class describing how an object is turned into a search document.

    Subclasses override the hooks below; each hook raises
    ``NotImplementedError`` until it is overridden.
    """

    def content(self, obj):
        """Return the searchable content for ``obj``."""
        raise NotImplementedError()

    def identifier(self, obj):
        """Return the identifier to store for ``obj``."""
        raise NotImplementedError()

    def metadata(self, obj):
        """Return the metadata to store for ``obj``."""
        raise NotImplementedError()
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
class SearchSite(object):
    """
    Registry that maps model classes to search providers and pushes objects
    to a single index through a client.

    ``client`` must offer ``create_document(content, index, **metadata)`` and
    ``delete_document(identifier)``, the two calls made by this class.
    """
    def __init__(self, client, index):
        self.client = client
        self.index = index
        # model class -> list of provider *instances*
        self.registry = {}

    def register(self, model_class, search_provider):
        """Instantiate ``search_provider`` and register it for the class."""
        self.registry.setdefault(model_class, []).append(search_provider())

    def unregister(self, model_class, search_provider=None):
        """
        Remove providers for ``model_class``.

        Without ``search_provider`` the class is dropped from the registry;
        otherwise only instances of that provider class are removed (the
        class stays registered, possibly with an empty provider list).
        """
        if search_provider is None:
            self.registry.pop(model_class, None)
        elif model_class in self.registry:
            self.registry[model_class] = [
                sp for sp in self.registry[model_class]
                if not isinstance(sp, search_provider)]

    def store(self, obj):
        """
        Create one document per provider registered for ``type(obj)``.

        Returns ``False`` when the exact type of ``obj`` is not registered,
        ``True`` otherwise. Providers that do not implement ``metadata`` or
        ``identifier`` are tolerated.
        """
        providers = self.registry.get(type(obj))
        if providers is None:
            return False

        for provider in providers:
            content = provider.content(obj)
            try:
                # Copy the mapping: the identifier is added below and must
                # not leak into a dict owned (and maybe reused) by the
                # provider. A provider returning None means "no metadata".
                metadata = dict(provider.metadata(obj) or {})
            except NotImplementedError:
                metadata = {}

            try:
                identifier = provider.identifier(obj)
            except NotImplementedError:
                pass
            else:
                metadata['identifier'] = identifier

            self.client.create_document(content, self.index, **metadata)

        return True

    def remove(self, obj):
        """
        Delete the document of every provider registered for ``type(obj)``.

        Returns ``False`` when the exact type of ``obj`` is not registered,
        ``True`` otherwise. A provider without ``identifier`` raises
        ``NotImplementedError`` here.
        """
        providers = self.registry.get(type(obj))
        if providers is None:
            return False

        for provider in providers:
            self.client.delete_document(provider.identifier(obj))

        return True
|
scout/constants.py
ADDED
scout/exceptions.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from flask import jsonify
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class InvalidSearchException(ValueError):
    """Raised for an unusable search query; a ``ValueError`` subclass."""
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class InvalidRequestException(Exception):
    """
    Error carrying a message and an HTTP status code for the client.

    :param error_message: Text placed under the ``error`` key of the JSON
        response body.
    :param code: HTTP status code; any falsy value falls back to 400.
    """
    def __init__(self, error_message, code=None):
        # Initialise the base class so that str(exc), exc.args and tracebacks
        # show the message instead of an empty exception.
        super(InvalidRequestException, self).__init__(error_message)
        self.error_message = error_message
        self.code = code or 400

    def response(self):
        """Return a ``(json_response, status_code)`` pair for Flask."""
        return jsonify({'error': self.error_message}), self.code
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def error(message, code=None):
    """
    Abort the current request by raising ``InvalidRequestException``.

    ``message`` and ``code`` are handed to the exception unchanged; the
    exception falls back to status 400 ("Bad request") when no code is
    given. This function never returns.
    """
    failure = InvalidRequestException(message, code=code)
    raise failure
|
scout/models.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import datetime
|
|
3
|
+
import hashlib
|
|
4
|
+
import mimetypes
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
from peewee import *
|
|
8
|
+
from playhouse.fields import CompressedField
|
|
9
|
+
from playhouse.sqlite_ext import *
|
|
10
|
+
try:
|
|
11
|
+
from playhouse.sqlite_ext import CSqliteExtDatabase as SqliteExtDatabase
|
|
12
|
+
except ImportError:
|
|
13
|
+
pass
|
|
14
|
+
try:
|
|
15
|
+
from werkzeug import secure_filename
|
|
16
|
+
except ImportError:
|
|
17
|
+
from werkzeug.utils import secure_filename
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Text type of the running interpreter: ``unicode`` on Python 2, ``str``
# otherwise. The name ``unicode`` is only evaluated on Python 2.
unicode_type = unicode if sys.version_info[0] == 2 else str
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
database = SqliteExtDatabase(None, regexp_function=True)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class Document(FTSModel):
    """
    Searchable content, stored in a full-text-search virtual table.

    A document can belong to any number of indexes through the
    `IndexDocument` junction table. Being an FTS virtual table, `Document`
    supports no secondary indexes and every column is *Text* regardless of
    its declared type, so the internal SQLite `docid` column is what relates
    documents to indexes, metadata and attachments.
    """
    content = SearchField()
    identifier = SearchField()

    class Meta:
        database = database
        options = {'prefix': [2, 3], 'tokenize': 'porter unicode61'}
        table_name = 'main_document'

    @classmethod
    def all(cls):
        """Select documents with ``docid``, ``content`` and ``identifier``."""
        columns = (Document.docid, Document.content, Document.identifier)
        return Document.select(*columns)

    def get_metadata(self):
        """Return this document's metadata rows as a ``{key: value}`` dict."""
        pairs = (Metadata
                 .select(Metadata.key, Metadata.value)
                 .where(Metadata.document == self.docid)
                 .tuples())
        return dict(pairs)

    def set_metadata(self, metadata):
        """Insert-or-replace one metadata row per item of ``metadata``."""
        rows = [
            {'key': name, 'value': val, 'document': self.docid}
            for name, val in metadata.items()]
        Metadata.replace_many(rows).execute()

    def delete_metadata(self):
        """Delete every metadata row that belongs to this document."""
        query = Metadata.delete().where(Metadata.document == self.docid)
        query.execute()

    # ``doc.metadata`` reads, ``doc.metadata = {...}`` writes and
    # ``del doc.metadata`` clears, via the three methods above.
    metadata = property(get_metadata, set_metadata, delete_metadata)

    def get_indexes(self):
        """Return a query for the indexes that contain this document."""
        linked = Index.select().join(IndexDocument)
        return linked.where(IndexDocument.document == self.docid)

    def attach(self, filename, data):
        """
        Attach ``data`` to this document under a sanitised ``filename``.

        Text is encoded as UTF-8. The content is stored as a `BlobData` row
        keyed by the base64 SHA-256 digest, and an `Attachment` row ties the
        filename to that digest. If the document already has an attachment
        with this filename, its hash and mimetype are updated instead.
        Returns the `Attachment`.
        """
        filename = secure_filename(filename)
        if isinstance(data, unicode_type):
            data = data.encode('utf-8')

        digest = hashlib.sha256(data).digest()
        data_hash = base64.b64encode(digest)
        try:
            with database.atomic():
                BlobData.create(hash=data_hash, data=data)
        except IntegrityError:
            # Deliberately ignored -- presumably a blob with this hash (the
            # primary key of BlobData) is already stored.
            pass

        mimetype = mimetypes.guess_type(filename)[0] or 'text/plain'
        try:
            with database.atomic():
                attachment = Attachment.create(
                    document=self,
                    filename=filename,
                    hash=data_hash,
                    mimetype=mimetype)
        except IntegrityError:
            # (document, filename) is unique: repoint the existing row.
            same_file = ((Attachment.document == self) &
                         (Attachment.filename == filename))
            attachment = Attachment.get(same_file)
            attachment.hash = data_hash
            attachment.mimetype = mimetype
            attachment.save(only=[Attachment.hash, Attachment.mimetype])

        return attachment

    def detach(self, filename):
        """Delete this document's attachment rows named ``filename``."""
        same_file = ((Attachment.document == self) &
                     (Attachment.filename == filename))
        return Attachment.delete().where(same_file).execute()
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class BaseModel(Model):
    """Base class binding a model to the module-level ``database``."""

    class Meta:
        database = database
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class Attachment(BaseModel):
    """
    Links a named file on a `Document` to stored content via its hash.
    """
    document = ForeignKeyField(Document, backref='attachments')
    hash = TextField()
    filename = TextField(index=True)
    mimetype = TextField()
    timestamp = DateTimeField(default=datetime.datetime.now, index=True)

    class Meta:
        # A document can hold only one attachment per filename.
        indexes = ((('document', 'filename'), True),)

    @property
    def blob(self):
        """The `BlobData` row for ``hash``; looked up once, then cached."""
        if hasattr(self, '_blob'):
            return self._blob
        self._blob = BlobData.get(BlobData.hash == self.hash)
        return self._blob

    @property
    def length(self):
        """``len()`` of the blob's data."""
        payload = self.blob.data
        return len(payload)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class BlobData(BaseModel):
    """
    Content-addressable BLOB: ``hash`` is the primary key and ``data`` is
    kept in a zlib-compressed field.
    """
    hash = TextField(primary_key=True)
    data = CompressedField(algorithm='zlib', compression_level=6)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
class Metadata(BaseModel):
    """
    One key/value pair attached to an indexed `Document`.

    Metadata can also be used to filter search results.
    """
    document = ForeignKeyField(Document, backref='metadata_set')
    key = TextField()
    value = TextField()

    class Meta:
        table_name = 'main_metadata'
        indexes = (
            # Unique: a document holds each key at most once.
            (('document', 'key'), True),
            # Non-unique lookup index on key/value pairs.
            (('key', 'value'), False),
        )
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
class Index(BaseModel):
    """
    A uniquely named collection of documents, with helpers for storing
    content in it and listing what it contains.
    """
    name = TextField(unique=True)

    class Meta:
        table_name = 'main_index'

    def add_to_index(self, document):
        """Link ``document`` to this index; an existing link is kept."""
        with database.atomic():
            try:
                IndexDocument.create(document=document, index=self)
            except IntegrityError:
                # Deliberately ignored -- presumably the unique
                # (index, document) link already exists.
                pass

    def index(self, content, document=None, identifier=None, **metadata):
        """
        Store ``content`` in this index and return the document.

        Without ``document`` a new `Document` is created. With one, its
        metadata is deleted and its content and identifier are overwritten.
        Either way the document is linked to this index, and any extra
        keyword arguments are saved as its metadata.
        """
        if document is not None:
            del document.metadata
            overwrite = Document.update(content=content,
                                        identifier=identifier)
            overwrite.where(Document.docid == document.docid).execute()
        else:
            document = Document.create(content=content,
                                       identifier=identifier)

        self.add_to_index(document)
        if metadata:
            document.metadata = metadata
        return document

    @property
    def documents(self):
        """Query selecting the documents linked to this index."""
        linked = Document.all().join(IndexDocument)
        return linked.where(IndexDocument.index == self)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
class IndexDocument(BaseModel):
    """Junction table linking an `Index` to a `Document`."""
    index = ForeignKeyField(Index)
    document = ForeignKeyField(Document)

    class Meta:
        table_name = 'main_index_document'
        # Unique: a document is linked to a given index at most once.
        indexes = ((('index', 'document'), True),)
|
scout/search.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
try:
|
|
2
|
+
from functools import reduce
|
|
3
|
+
except ImportError:
|
|
4
|
+
pass
|
|
5
|
+
import operator
|
|
6
|
+
|
|
7
|
+
from peewee import fn
|
|
8
|
+
from peewee import Select
|
|
9
|
+
|
|
10
|
+
from .constants import PROTECTED_KEYS
|
|
11
|
+
from .constants import SEARCH_BM25
|
|
12
|
+
from .constants import SEARCH_NONE
|
|
13
|
+
from .constants import SEARCH_SIMPLE
|
|
14
|
+
from .exceptions import InvalidSearchException
|
|
15
|
+
from .exceptions import error
|
|
16
|
+
from .models import Document
|
|
17
|
+
from .models import Index
|
|
18
|
+
from .models import IndexDocument
|
|
19
|
+
from .models import Metadata
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class DocumentSearch(object):
    """
    Builds `Document` queries from a search phrase, optional index and
    metadata filters, and ranking / ordering options.
    """
    def search(self, phrase, index=None, ranking='bm25', ordering=None,
               **filters):
        """
        Build the query for ``phrase``.

        :param phrase: Full-text query; ``'*'`` selects every document and
            disables ranking.
        :param index: An index, or a list / tuple / ``Select`` of indexes,
            to restrict the search to.
        :param ranking: Ranking algorithm name; ``SEARCH_NONE`` disables it.
        :param ordering: Iterable of sort keys, ``-`` prefix for descending.
        :param filters: Metadata filters, see
            :py:meth:`_build_filter_expression`.
        :raises InvalidSearchException: if ``phrase`` is blank.
        """
        phrase = phrase.strip()
        if not phrase:
            raise InvalidSearchException('Must provide a search query.')
        elif phrase == '*' or ranking == SEARCH_NONE:
            ranking = None

        query = Document.select()
        if phrase != '*':
            query = query.where(Document.match(phrase))

        # Allow filtering by index(es).
        if index is not None:
            query = query.join(IndexDocument)
            if isinstance(index, (list, tuple, Select)):
                query = query.where(IndexDocument.index << index)
            else:
                query = query.where(IndexDocument.index == index)

        # Allow filtering by metadata.
        metadata_expr = self.get_metadata_filter_expression(filters)
        if metadata_expr is not None:
            query = query.where(metadata_expr)

        # Allow sorting and ranking.
        return self.apply_rank_and_sort(query, ranking, ordering or ())

    def get_metadata_filter_expression(self, filters):
        """
        AND together one filter expression per non-protected key.

        Keys listed in ``PROTECTED_KEYS`` are skipped. Returns ``None`` when
        no usable filter remains.
        """
        valid_keys = [key for key in filters if key not in PROTECTED_KEYS]
        if not valid_keys:
            return None
        # Build expressions for the valid keys only, so that a protected key
        # never ends up as a metadata condition.
        return reduce(operator.and_, [
            self._build_filter_expression(key, filters[key])
            for key in valid_keys])

    @staticmethod
    def _build_filter_expression(key, values):
        """
        Translate one ``key[__op]`` filter into an ``EXISTS`` sub-query on
        `Metadata`.

        The operation defaults to ``eq``. When ``values`` is a list or tuple
        the per-value conditions are OR-ed together. An unknown operation is
        reported through ``error()``.
        """
        def in_(lhs, rhs):
            # "a, b,c" -> value IN ('a', 'b', 'c')
            return lhs << ([i.strip() for i in rhs.split(',')])

        # ``operator.pow`` applies peewee's ``**`` column operator with a
        # %-wildcard pattern (LIKE-style match; see the peewee operator docs).
        operations = {
            'eq': operator.eq,
            'ne': operator.ne,
            'ge': operator.ge,
            'gt': operator.gt,
            'le': operator.le,
            'lt': operator.lt,
            'in': in_,
            'contains': lambda l, r: operator.pow(l, '%%%s%%' % r),
            'startswith': lambda l, r: operator.pow(l, '%s%%' % r),
            'endswith': lambda l, r: operator.pow(l, '%%%s' % r),
            'regex': lambda l, r: l.regexp(r),
        }
        if '__' in key:
            key, op = key.rsplit('__', 1)
            if op not in operations:
                error('Unrecognized operation: %s. Supported operations are:'
                      '\n%s' % (op, '\n'.join(sorted(operations.keys()))))
        else:
            op = 'eq'

        op = operations[op]
        if isinstance(values, (list, tuple)):
            expr = reduce(operator.or_, [
                ((Metadata.key == key) & op(Metadata.value, value))
                for value in values])
        else:
            expr = ((Metadata.key == key) & op(Metadata.value, values))

        return fn.EXISTS(Metadata.select().where(
            expr &
            (Metadata.document == Document.docid)))

    def apply_rank_and_sort(self, query, ranking, ordering, sort_options=None,
                            sort_default='id'):
        """
        Add the ranking column (if any) and the ordering to ``query``.

        :param sort_options: Mapping of sort key to column; defaults to the
            document's content, id and identifier. It is never modified.
        :param sort_default: Key used when ``ordering`` yields nothing; it
            becomes ``'score'`` whenever a ranking is applied.
        """
        if sort_options:
            # Work on a copy: 'score' may be added below and must not show
            # up in the caller's mapping.
            sort_options = dict(sort_options)
        else:
            sort_options = {
                'content': Document.content,
                'id': Document.docid,
                'identifier': Document.identifier,
            }
        if ranking is not None:
            rank = self.get_rank_expression(ranking)
            sort_options['score'] = rank
            sort_default = 'score'

            # Add score to the selected columns.
            query = query.select(*query._returning + [rank.alias('score')])

        return self.apply_sorting(query, ordering, sort_options, sort_default)

    def get_rank_expression(self, ranking):
        """
        Return the rank expression for ``ranking``.

        Both supported algorithms weight the content field 1.0 and the
        identifier field 0.0. Unknown names are reported via ``error()``.
        """
        if ranking == SEARCH_BM25:
            # Search only the content field, do not search the identifiers.
            return Document.bm25(1.0, 0.0)
        elif ranking == SEARCH_SIMPLE:
            # Search only the content field, do not search the identifiers.
            return Document.rank(1.0, 0.0)
        else:
            error('Unrecognized ranking: "%s"' % ranking)

    def apply_sorting(self, query, ordering, mapping, default):
        """
        Order ``query`` by the keys in ``ordering``.

        Keys are stripped of whitespace; a leading ``-`` means descending.
        Keys missing from ``mapping`` are ignored, and ``mapping[default]``
        is used when nothing usable remains.
        """
        accum = []
        for part in ordering:
            identifier = part.strip()
            is_desc = identifier.startswith('-')
            identifier = identifier.lstrip('-')
            if identifier in mapping:
                value = mapping[identifier]
                accum.append(value.desc() if is_desc else value)

        if not accum:
            accum = [mapping[default]]

        return query.order_by(*accum)
|