scout 3.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scout/__init__.py ADDED
@@ -0,0 +1 @@
1
# Version string of the ``scout`` package.
+ __version__ = '3.0.4'
scout/__main__.py ADDED
@@ -0,0 +1,5 @@
1
+ from scout.server import main
2
+
3
+
4
# Call ``scout.server.main`` only when this module is executed as the main
# program (this file is ``scout/__main__.py``), never on plain import.
+ if __name__ == '__main__':
5
+ main()
scout/client.py ADDED
@@ -0,0 +1,269 @@
1
+ import base64
2
+ import json
3
+ try:
4
+ from email.generator import _make_boundary as choose_boundary
5
+ except ImportError:
6
+ from mimetools import choose_boundary
7
+ import mimetypes
8
+ import os
9
+ try:
10
+ from urllib.parse import urlencode
11
+ except ImportError:
12
+ from urllib import urlencode
13
+ try:
14
+ from urllib.request import Request
15
+ from urllib.request import urlopen
16
+ except ImportError:
17
+ from urllib2 import Request
18
+ from urllib2 import urlopen
19
+ import zlib
20
+
21
+
22
# Module-level fallbacks used by ``Scout()`` when no explicit endpoint / key
# is passed to the constructor.
ENDPOINT = None
KEY = None


class Scout(object):
    """
    Thin HTTP client for the Scout search server's JSON API.

    Paths handed to the request helpers are appended to ``endpoint``. When a
    ``key`` is configured it is sent with every request in a ``key`` header.
    """

    def __init__(self, endpoint=ENDPOINT, key=KEY):
        """
        :param endpoint: Base URL of the server. Trailing slashes are removed.
            Falls back to the module-level ``ENDPOINT`` when omitted.
        :param key: Optional value for the ``key`` request header. Falls back
            to the module-level ``KEY`` when omitted.
        :raises ValueError: if no endpoint is available. Previously this
            surfaced as an ``AttributeError`` on ``None.rstrip``.
        """
        if endpoint is None:
            endpoint = ENDPOINT
        if endpoint is None:
            raise ValueError('An `endpoint` URL must be provided.')
        self.endpoint = endpoint.rstrip('/')
        self.key = KEY if key is None else key

    # -- low-level helpers --------------------------------------------------

    def get_full_url(self, url):
        """Return ``url`` (a path such as ``/documents/``) joined to the endpoint."""
        return self.endpoint + url

    def _headers(self, content_type=None):
        """Build the request headers, adding the ``key`` header when set."""
        headers = {}
        if content_type:
            headers['Content-Type'] = content_type
        if self.key:
            headers['key'] = self.key
        return headers

    @staticmethod
    def _fetch(request):
        """Open ``request``, return the raw body and always close the response."""
        response = urlopen(request)
        try:
            return response.read()
        finally:
            response.close()

    @staticmethod
    def _load_json(raw):
        """Decode a raw response body (bytes or text) into Python objects."""
        if isinstance(raw, bytes):
            raw = raw.decode('utf-8')
        return json.loads(raw)

    @staticmethod
    def _as_bytes(value):
        """Return ``value`` as bytes, UTF-8 encoding text when necessary."""
        if isinstance(value, bytes):
            return value
        if hasattr(value, 'encode'):
            return value.encode('utf-8')
        return bytes(value)

    @staticmethod
    def _encode_multipart(boundary, json_data, files):
        """
        Serialize ``json_data`` plus ``files`` into a multipart request body.

        :param boundary: Boundary string (without the leading ``--``).
        :param json_data: JSON-serializable object sent as the ``data`` field.
        :param files: Dict of ``filename -> file-like object or raw content``.
        :returns: The encoded body as ``bytes``.
        """
        as_bytes = Scout._as_bytes
        delimiter = as_bytes('--' + boundary)
        parts = [
            delimiter,
            b'Content-Disposition: form-data; name="data"',
            b'',
            as_bytes(json.dumps(json_data))]

        for i, (filename, file_obj) in enumerate(files.items()):
            try:
                payload = file_obj.read()
            except AttributeError:
                # Not file-like: treat the object itself as the content.
                payload = file_obj
            mimetype = mimetypes.guess_type(filename)[0]
            parts.extend((
                delimiter,
                as_bytes('Content-Disposition: file; name="%s"; filename="%s"'
                         % ('file_%s' % i, filename)),
                as_bytes('Content-Type: %s' %
                         (mimetype or 'application/octet-stream')),
                b'',
                # File content stays binary; joining it with text headers is
                # what used to raise TypeError on Python 3.
                as_bytes(payload)))

        parts.append(delimiter + b'--')
        parts.append(b'')
        return b'\r\n'.join(parts)

    # -- HTTP verbs ---------------------------------------------------------

    def get_raw(self, url, **kwargs):
        """
        GET ``url`` and return the undecoded response body.

        Keyword arguments are URL-encoded (sequences expand to repeated
        parameters) and appended to the query string, using ``&`` when the
        URL already carries parameters.
        """
        if kwargs:
            if '?' not in url:
                separator = '?'
            elif url.endswith(('?', '&')):
                separator = ''
            else:
                separator = '&'
            url = url + separator + urlencode(kwargs, True)
        request = Request(self.get_full_url(url),
                          headers=self._headers('application/json'))
        return self._fetch(request)

    def get(self, url, **kwargs):
        """GET ``url`` and return the decoded JSON response."""
        return self._load_json(self.get_raw(url, **kwargs))

    def post(self, url, data=None, files=None):
        """POST ``data`` as JSON, or as multipart when ``files`` is given."""
        if files:
            return self.post_files(url, data, files)
        return self.post_json(url, data)

    def post_json(self, url, data=None):
        """POST ``data`` (default ``{}``) as a JSON body; return decoded JSON."""
        body = self._as_bytes(json.dumps(data or {}))
        request = Request(self.get_full_url(url), data=body,
                          headers=self._headers('application/json'))
        return self._load_json(self._fetch(request))

    def post_files(self, url, json_data, files=None):
        """
        POST ``json_data`` together with one or more files.

        :param files: Non-empty dict of ``filename -> file-like object`` (or
            raw ``bytes`` / text content).
        :raises ValueError: if ``files`` is empty or not a dict.
        """
        if not files or not isinstance(files, dict):
            raise ValueError('One or more files is required. Files should be '
                             'passed as a dictionary of filename: file-like-'
                             'object.')
        boundary = choose_boundary()
        body = self._encode_multipart(boundary, json_data, files)
        headers = self._headers(
            'multipart/form-data; boundary=%s' % boundary)
        request = Request(self.get_full_url(url), data=body, headers=headers)
        return self._load_json(self._fetch(request))

    def delete(self, url):
        """Send a DELETE request to ``url`` and return the decoded JSON."""
        request = Request(self.get_full_url(url), headers=self._headers())
        # Request defaults to GET when no body is attached; force DELETE.
        request.get_method = lambda: 'DELETE'
        return self._load_json(self._fetch(request))

    # -- indexes ------------------------------------------------------------

    def get_indexes(self, **kwargs):
        return self.get('/', **kwargs)['indexes']

    def create_index(self, name):
        return self.post('/', {'name': name})

    def rename_index(self, old_name, new_name):
        return self.post('/%s/' % old_name, {'name': new_name})

    def delete_index(self, name):
        return self.delete('/%s/' % name)

    def get_index(self, name, **kwargs):
        return self.get('/%s/' % name, **kwargs)

    # -- documents ----------------------------------------------------------

    def get_documents(self, **kwargs):
        return self.get('/documents/', **kwargs)

    def create_document(self, content, indexes, identifier=None,
                        attachments=None, **metadata):
        """
        Create a document in one or more indexes.

        ``indexes`` may be a single name or a list/tuple of names; extra
        keyword arguments are sent as the document's metadata.
        """
        if not isinstance(indexes, (list, tuple)):
            indexes = [indexes]
        post_data = {
            'content': content,
            'identifier': identifier,
            'indexes': indexes,
            'metadata': metadata}
        return self.post('/documents/', post_data, attachments)

    def update_document(self, document_id=None, content=None, indexes=None,
                        metadata=None, identifier=None, attachments=None):
        """
        Update an existing document.

        The document is addressed by ``document_id`` or, when that is not
        given, by ``identifier``. Only the fields that are not ``None`` are
        sent.

        :raises ValueError: if neither ``document_id`` nor ``identifier`` is
            given, or if there is nothing to update.
        """
        if not document_id and not identifier:
            raise ValueError('`document_id` must be provided.')

        data = {}
        if content is not None:
            data['content'] = content
        if indexes is not None:
            if not isinstance(indexes, (list, tuple)):
                indexes = [indexes]
            data['indexes'] = indexes
        if metadata is not None:
            data['metadata'] = metadata

        if not data and not attachments:
            raise ValueError('Nothing to update.')

        # Fall back to the identifier so the URL is never '/documents/None/'.
        target = document_id or identifier
        return self.post('/documents/%s/' % target, data, attachments)

    def delete_document(self, document_id=None):
        if not document_id:
            raise ValueError('`document_id` must be provided.')

        return self.delete('/documents/%s/' % document_id)

    def get_document(self, document_id=None):
        if not document_id:
            raise ValueError('`document_id` must be provided.')

        return self.get('/documents/%s/' % document_id)

    # -- attachments --------------------------------------------------------

    def attach_files(self, document_id, attachments):
        return self.post_files('/documents/%s/attachments/' % document_id,
                               {}, attachments)

    def detach_file(self, document_id, filename):
        return self.delete('/documents/%s/attachments/%s/' %
                           (document_id, filename))

    def update_file(self, document_id, filename, file_object):
        return self.post_files('/documents/%s/attachments/%s/' %
                               (document_id, filename),
                               {}, {filename: file_object})

    def get_attachments(self, document_id, **kwargs):
        return self.get('/documents/%s/attachments/' % document_id, **kwargs)

    def get_attachment(self, document_id, filename):
        return self.get('/documents/%s/attachments/%s/' %
                        (document_id, filename))

    def download_attachment(self, document_id, filename):
        """Return the raw (undecoded) content of an attachment."""
        return self.get_raw('/documents/%s/attachments/%s/download/' %
                            (document_id, filename))

    def search_attachments(self, **kwargs):
        return self.get('/documents/attachments/', **kwargs)
206
+
207
+ def search_attachments(self, **kwargs):
208
+ return self.get('/documents/attachments/', **kwargs)
209
+
210
+
211
class SearchProvider(object):
    """
    Interface describing how an object is turned into a search document.

    Every hook raises ``NotImplementedError`` here; subclasses override the
    ones they support.
    """

    def content(self, obj):
        """Return the searchable content for ``obj``."""
        raise NotImplementedError()

    def identifier(self, obj):
        """Return the identifier for ``obj``."""
        raise NotImplementedError()

    def metadata(self, obj):
        """Return the metadata for ``obj``."""
        raise NotImplementedError()
220
+
221
+
222
class SearchSite(object):
    """
    Registry mapping model classes to search providers.

    ``store`` and ``remove`` look providers up by the *exact* type of the
    object and forward the provider output to ``client`` for ``index``.
    """

    def __init__(self, client, index):
        """
        :param client: Object exposing ``create_document`` and
            ``delete_document``.
        :param index: Index (or indexes) passed to ``client.create_document``.
        """
        self.client = client
        self.index = index
        self.registry = {}

    def register(self, model_class, search_provider):
        """Instantiate ``search_provider`` and attach it to ``model_class``."""
        self.registry.setdefault(model_class, []).append(search_provider())

    def unregister(self, model_class, search_provider=None):
        """
        Remove providers registered for ``model_class``.

        Without ``search_provider`` the whole entry is dropped; otherwise
        only instances of that provider class are removed.
        """
        if search_provider is None:
            self.registry.pop(model_class, None)
        elif model_class in self.registry:
            self.registry[model_class] = [
                provider for provider in self.registry[model_class]
                if not isinstance(provider, search_provider)]

    def store(self, obj):
        """
        Create one document per provider registered for ``type(obj)``.

        :returns: ``False`` when the type is not registered, else ``True``.
        """
        providers = self.registry.get(type(obj))
        if providers is None:
            return False

        for provider in providers:
            content = provider.content(obj)
            try:
                # Copy the mapping: the identifier is added below and must not
                # leak into a dict the provider owns (and may hand out again).
                metadata = dict(provider.metadata(obj) or {})
            except NotImplementedError:
                metadata = {}

            try:
                metadata['identifier'] = provider.identifier(obj)
            except NotImplementedError:
                # Providers without identifiers simply omit the field.
                pass

            self.client.create_document(content, self.index, **metadata)

        return True

    def remove(self, obj):
        """
        Delete the document of every provider registered for ``type(obj)``.

        :returns: ``False`` when the type is not registered, else ``True``.
        """
        providers = self.registry.get(type(obj))
        if providers is None:
            return False

        for provider in providers:
            self.client.delete_document(provider.identifier(obj))

        return True
scout/constants.py ADDED
@@ -0,0 +1,7 @@
1
# Names of the supported ranking modes.
SEARCH_BM25 = 'bm25'
SEARCH_SIMPLE = 'simple'
SEARCH_NONE = 'none'
RANKING_CHOICES = (SEARCH_BM25, SEARCH_SIMPLE, SEARCH_NONE)

# Request parameter names that are excluded when search filters are turned
# into metadata filters.
PROTECTED_KEYS = {
    'page',
    'q',
    'key',
    'ranking',
    'identifier',
    'index',
    'ordering',
}
scout/exceptions.py ADDED
@@ -0,0 +1,21 @@
1
+ from flask import jsonify
2
+
3
+
4
class InvalidSearchException(ValueError):
    """Raised when a search cannot be performed with the given input."""
5
+
6
+
7
class InvalidRequestException(Exception):
    """
    Error carrying a message and an HTTP status code for the client.

    :param error_message: Message placed under the ``error`` key of the
        JSON response.
    :param code: HTTP status code; any falsy value falls back to 400.
    """

    def __init__(self, error_message, code=None):
        # Initialise the base class so ``str(exc)``, ``exc.args`` and
        # tracebacks carry the message instead of being empty.
        super(InvalidRequestException, self).__init__(error_message)
        self.error_message = error_message
        self.code = code or 400

    def response(self):
        """Return a ``(json_response, status_code)`` pair."""
        return jsonify({'error': self.error_message}), self.code
14
+
15
+
16
def error(message, code=None):
    """
    Abort the current request by raising an ``InvalidRequestException``.

    The exception carries ``message`` and the HTTP status ``code``; when no
    code is supplied the exception defaults to 400 "Bad request".
    """
    failure = InvalidRequestException(message, code=code)
    raise failure
scout/models.py ADDED
@@ -0,0 +1,223 @@
1
+ import base64
2
+ import datetime
3
+ import hashlib
4
+ import mimetypes
5
+ import sys
6
+
7
+ from peewee import *
8
+ from playhouse.fields import CompressedField
9
+ from playhouse.sqlite_ext import *
10
+ try:
11
+ from playhouse.sqlite_ext import CSqliteExtDatabase as SqliteExtDatabase
12
+ except ImportError:
13
+ pass
14
+ try:
15
+ from werkzeug import secure_filename
16
+ except ImportError:
17
+ from werkzeug.utils import secure_filename
18
+
19
+
20
# Text type of the running interpreter: ``unicode`` on Python 2, ``str``
# otherwise. The conditional expression only evaluates the selected branch,
# so ``unicode`` is never looked up on Python 3.
unicode_type = unicode if sys.version_info[0] == 2 else str
24
+
25
+
26
# Module-wide database handle referenced by the ``Meta`` of the models below.
# It is constructed with ``None`` in place of a database name -- presumably
# peewee's deferred initialisation, with the real path supplied later (TODO
# confirm against the server start-up code) -- and with
# ``regexp_function=True``.
+ database = SqliteExtDatabase(None, regexp_function=True)
27
+
28
+
29
class Document(FTSModel):
    """
    The :py:class:`Document` class contains content which should be indexed
    for search. Documents can be associated with any number of indexes via
    the `IndexDocument` junction table. Because `Document` is implemented
    as an FTS virtual table, it does not support any secondary indexes, and
    all columns have *Text* type, regardless of their declared type. For that
    reason we will utilize the internal SQLite `docid` column to relate
    documents to indexes.
    """
    content = SearchField()
    identifier = SearchField()

    class Meta:
        database = database
        options = {
            'prefix': [2, 3],
            'tokenize': 'porter unicode61'}
        table_name = 'main_document'

    @classmethod
    def all(cls):
        """Return a query selecting ``docid``, ``content`` and ``identifier``."""
        return Document.select(Document.docid, Document.content,
                               Document.identifier)

    def get_metadata(self):
        """Return this document's metadata rows as a ``{key: value}`` dict."""
        pairs = (Metadata
                 .select(Metadata.key, Metadata.value)
                 .where(Metadata.document == self.docid)
                 .tuples())
        return dict(pairs)

    def set_metadata(self, metadata):
        """
        Insert-or-replace one metadata row per ``key: value`` pair.

        Existing keys that are not mentioned are left untouched; an empty
        mapping is a no-op rather than an INSERT with no rows.
        """
        if not metadata:
            return
        rows = [
            {'key': key, 'value': value, 'document': self.docid}
            for key, value in metadata.items()]
        Metadata.replace_many(rows).execute()

    def delete_metadata(self):
        """Delete every metadata row belonging to this document."""
        Metadata.delete().where(Metadata.document == self.docid).execute()

    # ``doc.metadata`` reads, ``doc.metadata = {...}`` upserts and
    # ``del doc.metadata`` clears.
    metadata = property(get_metadata, set_metadata, delete_metadata)

    def get_indexes(self):
        """Return a query for the indexes this document is linked to."""
        return (Index
                .select()
                .join(IndexDocument)
                .where(IndexDocument.document == self.docid))

    def attach(self, filename, data):
        """
        Attach ``data`` to this document under a sanitized ``filename``.

        The content is stored in ``BlobData`` keyed by the base64-encoded
        SHA-256 digest of the data. If an attachment with the same filename
        already exists for the document, its hash and mimetype are updated
        instead.

        :param filename: Name for the attachment; passed through
            ``secure_filename`` before use.
        :param data: Attachment content; text is encoded as UTF-8.
        :returns: The created or updated ``Attachment``.
        """
        filename = secure_filename(filename)
        if isinstance(data, unicode_type):
            data = data.encode('utf-8')
        data_hash = base64.b64encode(hashlib.sha256(data).digest())
        try:
            with database.atomic():
                BlobData.create(hash=data_hash, data=data)
        except IntegrityError:
            # Most likely a blob with this hash is already stored; the
            # existing row is reused.
            pass

        mimetype = mimetypes.guess_type(filename)[0] or 'text/plain'
        try:
            with database.atomic():
                attachment = Attachment.create(
                    document=self,
                    filename=filename,
                    hash=data_hash,
                    mimetype=mimetype)
        except IntegrityError:
            # (document, filename) is unique: re-point the existing
            # attachment at the new content.
            attachment = (Attachment
                          .get((Attachment.document == self) &
                               (Attachment.filename == filename)))
            attachment.hash = data_hash
            attachment.mimetype = mimetype
            attachment.save(only=[Attachment.hash, Attachment.mimetype])

        return attachment

    def detach(self, filename):
        """
        Delete this document's attachment named ``filename``.

        :returns: Whatever the delete query's ``execute()`` returns.
        """
        return (Attachment
                .delete()
                .where((Attachment.document == self) &
                       (Attachment.filename == filename))
                .execute())
114
+
115
+
116
class BaseModel(Model):
    """Common base class binding models to the module-level ``database``."""

    class Meta:
        database = database
119
+
120
+
121
class Attachment(BaseModel):
    """
    Links a stored BLOB (referenced by ``hash``) to a ``Document`` under a
    filename. A document can hold at most one attachment per filename.
    """
    document = ForeignKeyField(Document, backref='attachments')
    hash = TextField()
    filename = TextField(index=True)
    mimetype = TextField()
    timestamp = DateTimeField(default=datetime.datetime.now, index=True)

    class Meta:
        # Unique composite index on (document, filename).
        indexes = ((('document', 'filename'), True),)

    @property
    def blob(self):
        """The ``BlobData`` row matching ``hash``; fetched once, then cached."""
        if hasattr(self, '_blob'):
            return self._blob
        self._blob = BlobData.get(BlobData.hash == self.hash)
        return self._blob

    @property
    def length(self):
        """``len()`` of the blob's ``data`` attribute."""
        payload = self.blob.data
        return len(payload)
145
+
146
+
147
class BlobData(BaseModel):
    """
    Content-addressable BLOB: ``hash`` is the primary key and ``data`` holds
    the zlib-compressed content.
    """
    hash = TextField(primary_key=True)
    data = CompressedField(algorithm='zlib', compression_level=6)
151
+
152
+
153
class Metadata(BaseModel):
    """
    A single key/value pair attached to an indexed `Document`. Besides
    describing the document, these pairs can be used to filter search
    results.
    """
    document = ForeignKeyField(Document, backref='metadata_set')
    key = TextField()
    value = TextField()

    class Meta:
        table_name = 'main_metadata'
        indexes = (
            # One value per key for each document.
            (('document', 'key'), True),
            # Non-unique index on (key, value).
            (('key', 'value'), False),
        )
169
+
170
+
171
class Index(BaseModel):
    """
    Indexes contain any number of documents and expose a clean API for
    searching and storing content.
    """
    name = TextField(unique=True)

    class Meta:
        table_name = 'main_index'

    def add_to_index(self, document):
        """
        Link ``document`` to this index.

        An ``IntegrityError`` from the insert is ignored; given the unique
        (index, document) constraint this most likely means the link
        already exists.
        """
        # Catch the error outside the atomic block (as ``Document.attach``
        # does) so the failed savepoint/transaction is rolled back before
        # the error is swallowed.
        try:
            with database.atomic():
                IndexDocument.create(index=self, document=document)
        except IntegrityError:
            pass

    def index(self, content, document=None, identifier=None, **metadata):
        """
        Store ``content`` in this index.

        :param content: Text to index.
        :param document: Existing ``Document`` to overwrite. When given, its
            metadata is cleared and its content/identifier are replaced;
            when ``None`` a new document is created.
        :param identifier: Identifier stored with the document.
        :param metadata: Key/value pairs stored as the document's metadata.
        :returns: The created or updated ``Document``.
        """
        if document is None:
            document = Document.create(
                content=content,
                identifier=identifier)
        else:
            del document.metadata
            (Document
             .update(
                 content=content,
                 identifier=identifier)
             .where(Document.docid == document.docid)
             .execute())
            # Keep the returned instance in step with the row just written.
            document.content = content
            document.identifier = identifier

        self.add_to_index(document)
        if metadata:
            document.metadata = metadata
        return document

    @property
    def documents(self):
        """Query for the documents linked to this index."""
        return (Document
                .all()
                .join(IndexDocument)
                .where(IndexDocument.index == self))
213
+
214
+
215
class IndexDocument(BaseModel):
    """Junction table linking an ``Index`` to a ``Document``."""
    index = ForeignKeyField(Index)
    document = ForeignKeyField(Document)

    class Meta:
        table_name = 'main_index_document'
        # A document can be linked to a given index only once.
        indexes = ((('index', 'document'), True),)
scout/search.py ADDED
@@ -0,0 +1,135 @@
1
+ try:
2
+ from functools import reduce
3
+ except ImportError:
4
+ pass
5
+ import operator
6
+
7
+ from peewee import fn
8
+ from peewee import Select
9
+
10
+ from .constants import PROTECTED_KEYS
11
+ from .constants import SEARCH_BM25
12
+ from .constants import SEARCH_NONE
13
+ from .constants import SEARCH_SIMPLE
14
+ from .exceptions import InvalidSearchException
15
+ from .exceptions import error
16
+ from .models import Document
17
+ from .models import Index
18
+ from .models import IndexDocument
19
+ from .models import Metadata
20
+
21
+
22
class DocumentSearch(object):
    """Builds ``Document`` queries for full-text search with filtering,
    ranking and ordering."""

    def search(self, phrase, index=None, ranking='bm25', ordering=None,
               **filters):
        """
        Build the query for a search.

        :param phrase: Search phrase; ``'*'`` selects documents without a
            full-text match (and disables ranking).
        :param index: Optional index, or list/tuple/``Select`` of indexes,
            to restrict the search to.
        :param ranking: One of the ranking constants; ``'none'`` disables
            ranking.
        :param ordering: Optional iterable of sort keys, each optionally
            prefixed with ``-`` for descending order.
        :param filters: Metadata filters (``key`` or ``key__op``); protected
            request parameter names are ignored.
        :raises InvalidSearchException: if ``phrase`` is blank.
        """
        phrase = phrase.strip()
        if not phrase:
            raise InvalidSearchException('Must provide a search query.')

        match_all = phrase == '*'
        if match_all or ranking == SEARCH_NONE:
            ranking = None

        query = Document.select()
        if not match_all:
            query = query.where(Document.match(phrase))

        # Allow filtering by index(es).
        if index is not None:
            query = query.join(IndexDocument)
            if isinstance(index, (list, tuple, Select)):
                query = query.where(IndexDocument.index << index)
            else:
                query = query.where(IndexDocument.index == index)

        # Allow filtering by metadata.
        metadata_expr = self.get_metadata_filter_expression(filters)
        if metadata_expr is not None:
            query = query.where(metadata_expr)

        # Allow sorting and ranking.
        return self.apply_rank_and_sort(query, ranking, ordering or ())

    def get_metadata_filter_expression(self, filters):
        """
        AND together one filter expression per non-protected key.

        :returns: The combined expression, or ``None`` when ``filters``
            holds no usable key.
        """
        # Only keys outside PROTECTED_KEYS are metadata filters; request
        # parameters such as ``page`` must never become a metadata lookup.
        expressions = [
            self._build_filter_expression(key, values)
            for key, values in filters.items()
            if key not in PROTECTED_KEYS]
        if expressions:
            return reduce(operator.and_, expressions)
        return None

    @staticmethod
    def _build_filter_expression(key, values):
        """
        Build an ``EXISTS`` sub-query matching documents by metadata.

        ``key`` may end in ``__<op>`` to select the comparison (default
        ``eq``). ``values`` may be a single value or a list/tuple, in which
        case the per-value conditions are OR-ed together. An unknown
        operation is reported through ``error()``.
        """
        def in_(lhs, rhs):
            # ``rhs`` is a comma-separated list of accepted values.
            return lhs << ([i.strip() for i in rhs.split(',')])

        # ``operator.pow`` applies ``**`` to the field, which is how the
        # pattern-style comparisons below are spelled on peewee fields.
        operations = {
            'eq': operator.eq,
            'ne': operator.ne,
            'ge': operator.ge,
            'gt': operator.gt,
            'le': operator.le,
            'lt': operator.lt,
            'in': in_,
            'contains': lambda l, r: operator.pow(l, '%%%s%%' % r),
            'startswith': lambda l, r: operator.pow(l, '%s%%' % r),
            'endswith': lambda l, r: operator.pow(l, '%%%s' % r),
            'regex': lambda l, r: l.regexp(r),
        }
        if '__' in key:
            key, op = key.rsplit('__', 1)
            if op not in operations:
                error('Unrecognized operation: %s. Supported operations are:'
                      '\n%s' % (op, '\n'.join(sorted(operations.keys()))))
        else:
            op = 'eq'

        op = operations[op]
        if isinstance(values, (list, tuple)):
            expr = reduce(operator.or_, [
                ((Metadata.key == key) & op(Metadata.value, value))
                for value in values])
        else:
            expr = ((Metadata.key == key) & op(Metadata.value, values))

        return fn.EXISTS(Metadata.select().where(
            expr &
            (Metadata.document == Document.docid)))

    def apply_rank_and_sort(self, query, ranking, ordering, sort_options=None,
                            sort_default='id'):
        """
        Add the score column (when ranking) and the ORDER BY clause.

        :param sort_options: Mapping of sort key -> sortable expression;
            defaults to the document's content, id and identifier. The
            mapping passed in is never modified.
        :param sort_default: Key used when ``ordering`` yields nothing; it
            becomes ``'score'`` whenever ranking is active.
        """
        if sort_options:
            # Work on a copy: ``score`` is added below.
            sort_options = dict(sort_options)
        else:
            sort_options = {
                'content': Document.content,
                'id': Document.docid,
                'identifier': Document.identifier,
            }

        if ranking is not None:
            rank = self.get_rank_expression(ranking)
            sort_options['score'] = rank
            sort_default = 'score'

            # Add score to the selected columns.
            columns = list(query._returning) + [rank.alias('score')]
            query = query.select(*columns)

        return self.apply_sorting(query, ordering, sort_options, sort_default)

    def get_rank_expression(self, ranking):
        """Return the rank expression for ``ranking``; unknown values are
        reported through ``error()``."""
        if ranking == SEARCH_BM25:
            # Search only the content field, do not search the identifiers.
            return Document.bm25(1.0, 0.0)
        elif ranking == SEARCH_SIMPLE:
            # Search only the content field, do not search the identifiers.
            return Document.rank(1.0, 0.0)
        else:
            error('Unrecognized ranking: "%s"' % ranking)

    def apply_sorting(self, query, ordering, mapping, default):
        """
        Order ``query`` by the recognised keys in ``ordering``.

        Keys are stripped of whitespace; a leading ``-`` requests descending
        order. Keys missing from ``mapping`` are ignored, and when nothing
        usable remains the query is ordered by ``mapping[default]``.
        """
        clauses = []
        for part in ordering:
            name = part.strip()
            descending = name.startswith('-')
            name = name.lstrip('-')
            if name in mapping:
                column = mapping[name]
                clauses.append(column.desc() if descending else column)

        if not clauses:
            clauses = [mapping[default]]

        return query.order_by(*clauses)