sup 0.10.2 → 0.11

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of sup might be problematic. Click here for more details.

@@ -1,605 +0,0 @@
1
- ENV["XAPIAN_FLUSH_THRESHOLD"] = "1000"
2
-
3
- require 'xapian'
4
- require 'set'
5
-
6
- module Redwood
7
-
8
- # This index implementation uses Xapian for searching and storage. It
9
- # tends to be slightly faster than Ferret for indexing and significantly faster
10
- # for searching due to precomputing thread membership.
11
- class XapianIndex < BaseIndex
12
- STEM_LANGUAGE = "english"
13
- INDEX_VERSION = '1'
14
-
15
- ## dates are converted to integers for xapian, and are used for document ids,
16
- ## so we must ensure they're reasonably valid. this typically only affect
17
- ## spam.
18
- MIN_DATE = Time.at 0
19
- MAX_DATE = Time.at(2**31-1)
20
-
21
- HookManager.register "custom-search", <<EOS
22
- Executes before a string search is applied to the index,
23
- returning a new search string.
24
- Variables:
25
- subs: The string being searched.
26
- EOS
27
-
28
- def initialize dir=BASE_DIR
29
- super
30
-
31
- @index_mutex = Monitor.new
32
- end
33
-
34
- def load_index
35
- path = File.join(@dir, 'xapian')
36
- if File.exists? path
37
- @xapian = Xapian::WritableDatabase.new(path, Xapian::DB_OPEN)
38
- db_version = @xapian.get_metadata 'version'
39
- db_version = '0' if db_version.empty?
40
- if db_version != INDEX_VERSION
41
- fail "This Sup version expects a v#{INDEX_VERSION} index, but you have an existing v#{db_version} index. Please downgrade to your previous version and dump your labels before upgrading to this version (then run sup-sync --restore)."
42
- end
43
- else
44
- @xapian = Xapian::WritableDatabase.new(path, Xapian::DB_CREATE)
45
- @xapian.set_metadata 'version', INDEX_VERSION
46
- end
47
- @enquire = Xapian::Enquire.new @xapian
48
- @enquire.weighting_scheme = Xapian::BoolWeight.new
49
- @enquire.docid_order = Xapian::Enquire::ASCENDING
50
- end
51
-
52
- def save_index
53
- info "Flushing Xapian updates to disk. This may take a while..."
54
- @xapian.flush
55
- end
56
-
57
- def optimize
58
- end
59
-
60
- def size
61
- synchronize { @xapian.doccount }
62
- end
63
-
64
- def contains_id? id
65
- synchronize { find_docid(id) && true }
66
- end
67
-
68
- def source_for_id id
69
- synchronize { get_entry(id)[:source_id] }
70
- end
71
-
72
- def delete id
73
- synchronize { @xapian.delete_document mkterm(:msgid, id) }
74
- end
75
-
76
- def build_message id
77
- entry = synchronize { get_entry id }
78
- return unless entry
79
-
80
- source = SourceManager[entry[:source_id]]
81
- raise "invalid source #{entry[:source_id]}" unless source
82
-
83
- m = Message.new :source => source, :source_info => entry[:source_info],
84
- :labels => entry[:labels], :snippet => entry[:snippet]
85
-
86
- mk_person = lambda { |x| Person.new(*x.reverse!) }
87
- entry[:from] = mk_person[entry[:from]]
88
- entry[:to].map!(&mk_person)
89
- entry[:cc].map!(&mk_person)
90
- entry[:bcc].map!(&mk_person)
91
-
92
- m.load_from_index! entry
93
- m
94
- end
95
-
96
- def add_message m; sync_message m, true end
97
- def update_message m; sync_message m, true end
98
- def update_message_state m; sync_message m, false end
99
-
100
- def num_results_for query={}
101
- xapian_query = build_xapian_query query
102
- matchset = run_query xapian_query, 0, 0, 100
103
- matchset.matches_estimated
104
- end
105
-
106
- EACH_ID_PAGE = 100
107
- def each_id query={}
108
- offset = 0
109
- page = EACH_ID_PAGE
110
-
111
- xapian_query = build_xapian_query query
112
- while true
113
- ids = run_query_ids xapian_query, offset, (offset+page)
114
- ids.each { |id| yield id }
115
- break if ids.size < page
116
- offset += page
117
- end
118
- end
119
-
120
- def each_id_by_date query={}
121
- each_id(query) { |id| yield id, lambda { build_message id } }
122
- end
123
-
124
- def each_message_in_thread_for m, opts={}
125
- # TODO thread by subject
126
- return unless doc = find_doc(m.id)
127
- queue = doc.value(THREAD_VALUENO).split(',')
128
- msgids = [m.id]
129
- seen_threads = Set.new
130
- seen_messages = Set.new [m.id]
131
- while not queue.empty?
132
- thread_id = queue.pop
133
- next if seen_threads.member? thread_id
134
- return false if opts[:skip_killed] && thread_killed?(thread_id)
135
- seen_threads << thread_id
136
- docs = term_docids(mkterm(:thread, thread_id)).map { |x| @xapian.document x }
137
- docs.each do |doc|
138
- msgid = doc.value MSGID_VALUENO
139
- next if seen_messages.member? msgid
140
- msgids << msgid
141
- seen_messages << msgid
142
- queue.concat doc.value(THREAD_VALUENO).split(',')
143
- end
144
- end
145
- msgids.each { |id| yield id, lambda { build_message id } }
146
- true
147
- end
148
-
149
- def load_contacts emails, opts={}
150
- contacts = Set.new
151
- num = opts[:num] || 20
152
- each_id_by_date :participants => emails do |id,b|
153
- break if contacts.size >= num
154
- m = b.call
155
- ([m.from]+m.to+m.cc+m.bcc).compact.each { |p| contacts << [p.name, p.email] }
156
- end
157
- contacts.to_a.compact.map { |n,e| Person.new n, e }[0...num]
158
- end
159
-
160
- # TODO share code with the Ferret index
161
- def parse_query s
162
- query = {}
163
-
164
- subs = HookManager.run("custom-search", :subs => s) || s
165
- subs = subs.gsub(/\b(to|from):(\S+)\b/) do
166
- field, value = $1, $2
167
- email_field, name_field = %w(email name).map { |x| "#{field}_#{x}" }
168
- if(p = ContactManager.contact_for(value))
169
- "#{email_field}:#{p.email}"
170
- elsif value == "me"
171
- '(' + AccountManager.user_emails.map { |e| "#{email_field}:#{e}" }.join(' OR ') + ')'
172
- else
173
- "(#{email_field}:#{value} OR #{name_field}:#{value})"
174
- end
175
- end
176
-
177
- ## if we see a label:deleted or a label:spam term anywhere in the query
178
- ## string, we set the extra load_spam or load_deleted options to true.
179
- ## bizarre? well, because the query allows arbitrary parenthesized boolean
180
- ## expressions, without fully parsing the query, we can't tell whether
181
- ## the user is explicitly directing us to search spam messages or not.
182
- ## e.g. if the string is -(-(-(-(-label:spam)))), does the user want to
183
- ## search spam messages or not?
184
- ##
185
- ## so, we rely on the fact that turning these extra options ON turns OFF
186
- ## the adding of "-label:deleted" or "-label:spam" terms at the very
187
- ## final stage of query processing. if the user wants to search spam
188
- ## messages, not adding that is the right thing; if he doesn't want to
189
- ## search spam messages, then not adding it won't have any effect.
190
- query[:load_spam] = true if subs =~ /\blabel:spam\b/
191
- query[:load_deleted] = true if subs =~ /\blabel:deleted\b/
192
-
193
- ## gmail style "is" operator
194
- subs = subs.gsub(/\b(is|has):(\S+)\b/) do
195
- field, label = $1, $2
196
- case label
197
- when "read"
198
- "-label:unread"
199
- when "spam"
200
- query[:load_spam] = true
201
- "label:spam"
202
- when "deleted"
203
- query[:load_deleted] = true
204
- "label:deleted"
205
- else
206
- "label:#{$2}"
207
- end
208
- end
209
-
210
- ## gmail style attachments "filename" and "filetype" searches
211
- subs = subs.gsub(/\b(filename|filetype):(\((.+?)\)\B|(\S+)\b)/) do
212
- field, name = $1, ($3 || $4)
213
- case field
214
- when "filename"
215
- debug "filename: translated #{field}:#{name} to attachment:\"#{name.downcase}\""
216
- "attachment:\"#{name.downcase}\""
217
- when "filetype"
218
- debug "filetype: translated #{field}:#{name} to attachment_extension:#{name.downcase}"
219
- "attachment_extension:#{name.downcase}"
220
- end
221
- end
222
-
223
- if $have_chronic
224
- lastdate = 2<<32 - 1
225
- firstdate = 0
226
- subs = subs.gsub(/\b(before|on|in|during|after):(\((.+?)\)\B|(\S+)\b)/) do
227
- field, datestr = $1, ($3 || $4)
228
- realdate = Chronic.parse datestr, :guess => false, :context => :past
229
- if realdate
230
- case field
231
- when "after"
232
- debug "chronic: translated #{field}:#{datestr} to #{realdate.end}"
233
- "date:#{realdate.end.to_i}..#{lastdate}"
234
- when "before"
235
- debug "chronic: translated #{field}:#{datestr} to #{realdate.begin}"
236
- "date:#{firstdate}..#{realdate.end.to_i}"
237
- else
238
- debug "chronic: translated #{field}:#{datestr} to #{realdate}"
239
- "date:#{realdate.begin.to_i}..#{realdate.end.to_i}"
240
- end
241
- else
242
- raise ParseError, "can't understand date #{datestr.inspect}"
243
- end
244
- end
245
- end
246
-
247
- ## limit:42 restrict the search to 42 results
248
- subs = subs.gsub(/\blimit:(\S+)\b/) do
249
- lim = $1
250
- if lim =~ /^\d+$/
251
- query[:limit] = lim.to_i
252
- ''
253
- else
254
- raise ParseError, "non-numeric limit #{lim.inspect}"
255
- end
256
- end
257
-
258
- debug "translated query: #{subs.inspect}"
259
-
260
- qp = Xapian::QueryParser.new
261
- qp.database = @xapian
262
- qp.stemmer = Xapian::Stem.new(STEM_LANGUAGE)
263
- qp.stemming_strategy = Xapian::QueryParser::STEM_SOME
264
- qp.default_op = Xapian::Query::OP_AND
265
- qp.add_valuerangeprocessor(Xapian::NumberValueRangeProcessor.new(DATE_VALUENO, 'date:', true))
266
- NORMAL_PREFIX.each { |k,v| qp.add_prefix k, v }
267
- BOOLEAN_PREFIX.each { |k,v| qp.add_boolean_prefix k, v }
268
- xapian_query = qp.parse_query(subs, Xapian::QueryParser::FLAG_PHRASE|Xapian::QueryParser::FLAG_BOOLEAN|Xapian::QueryParser::FLAG_LOVEHATE|Xapian::QueryParser::FLAG_WILDCARD, PREFIX['body'])
269
-
270
- debug "parsed xapian query: #{xapian_query.description}"
271
-
272
- raise ParseError if xapian_query.nil? or xapian_query.empty?
273
- query[:qobj] = xapian_query
274
- query[:text] = s
275
- query
276
- end
277
-
278
- private
279
-
280
- # Stemmed
281
- NORMAL_PREFIX = {
282
- 'subject' => 'S',
283
- 'body' => 'B',
284
- 'from_name' => 'FN',
285
- 'to_name' => 'TN',
286
- 'name' => 'N',
287
- 'attachment' => 'A',
288
- }
289
-
290
- # Unstemmed
291
- BOOLEAN_PREFIX = {
292
- 'type' => 'K',
293
- 'from_email' => 'FE',
294
- 'to_email' => 'TE',
295
- 'email' => 'E',
296
- 'date' => 'D',
297
- 'label' => 'L',
298
- 'source_id' => 'I',
299
- 'attachment_extension' => 'O',
300
- 'msgid' => 'Q',
301
- 'thread' => 'H',
302
- 'ref' => 'R',
303
- }
304
-
305
- PREFIX = NORMAL_PREFIX.merge BOOLEAN_PREFIX
306
-
307
- MSGID_VALUENO = 0
308
- THREAD_VALUENO = 1
309
- DATE_VALUENO = 2
310
-
311
- MAX_TERM_LENGTH = 245
312
-
313
- # Xapian can very efficiently sort in ascending docid order. Sup always wants
314
- # to sort by descending date, so this method maps between them. In order to
315
- # handle multiple messages per second, we use a logistic curve centered
316
- # around MIDDLE_DATE so that the slope (docid/s) is greatest in this time
317
- # period. A docid collision is not an error - the code will pick the next
318
- # smallest unused one.
319
- DOCID_SCALE = 2.0**32
320
- TIME_SCALE = 2.0**27
321
- MIDDLE_DATE = Time.gm(2011)
322
- def assign_docid m, truncated_date
323
- t = (truncated_date.to_i - MIDDLE_DATE.to_i).to_f
324
- docid = (DOCID_SCALE - DOCID_SCALE/(Math::E**(-(t/TIME_SCALE)) + 1)).to_i
325
- while docid > 0 and docid_exists? docid
326
- docid -= 1
327
- end
328
- docid > 0 ? docid : nil
329
- end
330
-
331
- # XXX is there a better way?
332
- def docid_exists? docid
333
- begin
334
- @xapian.doclength docid
335
- true
336
- rescue RuntimeError #Xapian::DocNotFoundError
337
- raise unless $!.message =~ /DocNotFoundError/
338
- false
339
- end
340
- end
341
-
342
- def term_docids term
343
- @xapian.postlist(term).map { |x| x.docid }
344
- end
345
-
346
- def find_docid id
347
- docids = term_docids(mkterm(:msgid,id))
348
- fail unless docids.size <= 1
349
- docids.first
350
- end
351
-
352
- def find_doc id
353
- return unless docid = find_docid(id)
354
- @xapian.document docid
355
- end
356
-
357
- def get_id docid
358
- return unless doc = @xapian.document(docid)
359
- doc.value MSGID_VALUENO
360
- end
361
-
362
- def get_entry id
363
- return unless doc = find_doc(id)
364
- Marshal.load doc.data
365
- end
366
-
367
- def thread_killed? thread_id
368
- not run_query(Q.new(Q::OP_AND, mkterm(:thread, thread_id), mkterm(:label, :Killed)), 0, 1).empty?
369
- end
370
-
371
- def synchronize &b
372
- @index_mutex.synchronize &b
373
- end
374
-
375
- def run_query xapian_query, offset, limit, checkatleast=0
376
- synchronize do
377
- @enquire.query = xapian_query
378
- @enquire.mset(offset, limit-offset, checkatleast)
379
- end
380
- end
381
-
382
- def run_query_ids xapian_query, offset, limit
383
- matchset = run_query xapian_query, offset, limit
384
- matchset.matches.map { |r| r.document.value MSGID_VALUENO }
385
- end
386
-
387
- Q = Xapian::Query
388
- def build_xapian_query opts
389
- labels = ([opts[:label]] + (opts[:labels] || [])).compact
390
- neglabels = [:spam, :deleted, :killed].reject { |l| (labels.include? l) || opts.member?("load_#{l}".intern) }
391
- pos_terms, neg_terms = [], []
392
-
393
- pos_terms << mkterm(:type, 'mail')
394
- pos_terms.concat(labels.map { |l| mkterm(:label,l) })
395
- pos_terms << opts[:qobj] if opts[:qobj]
396
- pos_terms << mkterm(:source_id, opts[:source_id]) if opts[:source_id]
397
-
398
- if opts[:participants]
399
- participant_terms = opts[:participants].map { |p| mkterm(:email,:any, (Redwood::Person === p) ? p.email : p) }
400
- pos_terms << Q.new(Q::OP_OR, participant_terms)
401
- end
402
-
403
- neg_terms.concat(neglabels.map { |l| mkterm(:label,l) })
404
-
405
- pos_query = Q.new(Q::OP_AND, pos_terms)
406
- neg_query = Q.new(Q::OP_OR, neg_terms)
407
-
408
- if neg_query.empty?
409
- pos_query
410
- else
411
- Q.new(Q::OP_AND_NOT, [pos_query, neg_query])
412
- end
413
- end
414
-
415
- def sync_message m, overwrite
416
- doc = synchronize { find_doc(m.id) }
417
- existed = doc != nil
418
- doc ||= Xapian::Document.new
419
- do_index_static = overwrite || !existed
420
- old_entry = !do_index_static && doc.entry
421
- snippet = do_index_static ? m.snippet : old_entry[:snippet]
422
-
423
- entry = {
424
- :message_id => m.id,
425
- :source_id => m.source.id,
426
- :source_info => m.source_info,
427
- :date => m.date,
428
- :snippet => snippet,
429
- :labels => m.labels.to_a,
430
- :from => [m.from.email, m.from.name],
431
- :to => m.to.map { |p| [p.email, p.name] },
432
- :cc => m.cc.map { |p| [p.email, p.name] },
433
- :bcc => m.bcc.map { |p| [p.email, p.name] },
434
- :subject => m.subj,
435
- :refs => m.refs.to_a,
436
- :replytos => m.replytos.to_a,
437
- }
438
-
439
- if do_index_static
440
- doc.clear_terms
441
- doc.clear_values
442
- index_message_static m, doc, entry
443
- end
444
-
445
- index_message_threading doc, entry, old_entry
446
- index_message_labels doc, entry[:labels], (do_index_static ? [] : old_entry[:labels])
447
- doc.entry = entry
448
-
449
- synchronize do
450
- unless docid = existed ? doc.docid : assign_docid(m, truncate_date(m.date))
451
- # Could be triggered by spam
452
- warn "docid underflow, dropping #{m.id.inspect}"
453
- return
454
- end
455
- @xapian.replace_document docid, doc
456
- end
457
-
458
- m.labels.each { |l| LabelManager << l }
459
- true
460
- end
461
-
462
- ## Index content that can't be changed by the user
463
- def index_message_static m, doc, entry
464
- # Person names are indexed with several prefixes
465
- person_termer = lambda do |d|
466
- lambda do |p|
467
- ["#{d}_name", "name", "body"].each do |x|
468
- doc.index_text p.name, PREFIX[x]
469
- end if p.name
470
- [d, :any].each { |x| doc.add_term mkterm(:email, x, p.email) }
471
- end
472
- end
473
-
474
- person_termer[:from][m.from] if m.from
475
- (m.to+m.cc+m.bcc).each(&(person_termer[:to]))
476
-
477
- # Full text search content
478
- subject_text = m.indexable_subject
479
- body_text = m.indexable_body
480
- doc.index_text subject_text, PREFIX['subject']
481
- doc.index_text subject_text, PREFIX['body']
482
- doc.index_text body_text, PREFIX['body']
483
- m.attachments.each { |a| doc.index_text a, PREFIX['attachment'] }
484
-
485
- # Miscellaneous terms
486
- doc.add_term mkterm(:date, m.date) if m.date
487
- doc.add_term mkterm(:type, 'mail')
488
- doc.add_term mkterm(:msgid, m.id)
489
- doc.add_term mkterm(:source_id, m.source.id)
490
- m.attachments.each do |a|
491
- a =~ /\.(\w+)$/ or next
492
- doc.add_term mkterm(:attachment_extension, $1)
493
- end
494
-
495
- # Date value for range queries
496
- date_value = begin
497
- Xapian.sortable_serialise m.date.to_i
498
- rescue TypeError
499
- Xapian.sortable_serialise 0
500
- end
501
-
502
- doc.add_value MSGID_VALUENO, m.id
503
- doc.add_value DATE_VALUENO, date_value
504
- end
505
-
506
- def index_message_labels doc, new_labels, old_labels
507
- return if new_labels == old_labels
508
- added = new_labels.to_a - old_labels.to_a
509
- removed = old_labels.to_a - new_labels.to_a
510
- added.each { |t| doc.add_term mkterm(:label,t) }
511
- removed.each { |t| doc.remove_term mkterm(:label,t) }
512
- end
513
-
514
- ## Assign a set of thread ids to the document. This is a hybrid of the runtime
515
- ## search done by the Ferret index and the index-time union done by previous
516
- ## versions of the Xapian index. We first find the thread ids of all messages
517
- ## with a reference to or from us. If that set is empty, we use our own
518
- ## message id. Otherwise, we use all the thread ids we previously found. In
519
- ## the common case there's only one member in that set, but if we're the
520
- ## missing link between multiple previously unrelated threads we can have
521
- ## more. XapianIndex#each_message_in_thread_for follows the thread ids when
522
- ## searching so the user sees a single unified thread.
523
- def index_message_threading doc, entry, old_entry
524
- return if old_entry && (entry[:refs] == old_entry[:refs]) && (entry[:replytos] == old_entry[:replytos])
525
- children = term_docids(mkterm(:ref, entry[:message_id])).map { |docid| @xapian.document docid }
526
- parent_ids = entry[:refs] + entry[:replytos]
527
- parents = parent_ids.map { |id| find_doc id }.compact
528
- thread_members = SavingHash.new { [] }
529
- (children + parents).each do |doc2|
530
- thread_ids = doc2.value(THREAD_VALUENO).split ','
531
- thread_ids.each { |thread_id| thread_members[thread_id] << doc2 }
532
- end
533
- thread_ids = thread_members.empty? ? [entry[:message_id]] : thread_members.keys
534
- thread_ids.each { |thread_id| doc.add_term mkterm(:thread, thread_id) }
535
- parent_ids.each { |ref| doc.add_term mkterm(:ref, ref) }
536
- doc.add_value THREAD_VALUENO, (thread_ids * ',')
537
- end
538
-
539
- def truncate_date date
540
- if date < MIN_DATE
541
- debug "warning: adjusting too-low date #{date} for indexing"
542
- MIN_DATE
543
- elsif date > MAX_DATE
544
- debug "warning: adjusting too-high date #{date} for indexing"
545
- MAX_DATE
546
- else
547
- date
548
- end
549
- end
550
-
551
- # Construct a Xapian term
552
- def mkterm type, *args
553
- case type
554
- when :label
555
- PREFIX['label'] + args[0].to_s.downcase
556
- when :type
557
- PREFIX['type'] + args[0].to_s.downcase
558
- when :date
559
- PREFIX['date'] + args[0].getutc.strftime("%Y%m%d%H%M%S")
560
- when :email
561
- case args[0]
562
- when :from then PREFIX['from_email']
563
- when :to then PREFIX['to_email']
564
- when :any then PREFIX['email']
565
- else raise "Invalid email term type #{args[0]}"
566
- end + args[1].to_s.downcase
567
- when :source_id
568
- PREFIX['source_id'] + args[0].to_s.downcase
569
- when :attachment_extension
570
- PREFIX['attachment_extension'] + args[0].to_s.downcase
571
- when :msgid, :ref, :thread
572
- PREFIX[type.to_s] + args[0][0...(MAX_TERM_LENGTH-1)]
573
- else
574
- raise "Invalid term type #{type}"
575
- end
576
- end
577
- end
578
-
579
- end
580
-
581
## Convenience extensions to Xapian::Document used by the Sup index.
class Xapian::Document
  ## Deserialize the entry hash stored in the document's data blob.
  def entry
    Marshal.load data
  end

  ## Serialize +x+ into the document's data blob.
  def entry=(x)
    self.data = Marshal.dump x
  end

  ## Index +text+ under +prefix+ using a stemming term generator.
  def index_text text, prefix, weight=1
    term_generator = Xapian::TermGenerator.new
    term_generator.stemmer = Xapian::Stem.new(Redwood::XapianIndex::STEM_LANGUAGE)
    term_generator.document = self
    term_generator.index_text text, weight, prefix
  end

  alias old_add_term add_term

  ## Add +term+ with a wdf increment of zero, silently dropping terms too
  ## long for Xapian to store.
  def add_term term
    if term.length > Redwood::XapianIndex::MAX_TERM_LENGTH
      warn "dropping excessively long term #{term}"
    else
      old_add_term term, 0
    end
  end
end