hobix 0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +18 -0
- data/README +18 -0
- data/Rakefile +96 -0
- data/bin/hobix +94 -0
- data/contrib/blosxom-to-hobix.rb +253 -0
- data/contrib/txp-to-hobix.rb +56 -0
- data/contrib/webrick-all-mine.rb +20 -0
- data/doc/CHANGELOG +285 -0
- data/doc/rdoc/classes/Hobix/API.html +382 -0
- data/doc/rdoc/classes/Hobix/Article.html +111 -0
- data/doc/rdoc/classes/Hobix/BaseContent.html +692 -0
- data/doc/rdoc/classes/Hobix/BaseEntry.html +218 -0
- data/doc/rdoc/classes/Hobix/BaseFacet.html +205 -0
- data/doc/rdoc/classes/Hobix/BaseOutput.html +122 -0
- data/doc/rdoc/classes/Hobix/BasePlugin.html +201 -0
- data/doc/rdoc/classes/Hobix/BaseProperties/ClassMethods.html +243 -0
- data/doc/rdoc/classes/Hobix/BaseProperties.html +218 -0
- data/doc/rdoc/classes/Hobix/BasePublish.html +157 -0
- data/doc/rdoc/classes/Hobix/BaseStorage.html +417 -0
- data/doc/rdoc/classes/Hobix/BixWik/Entry.html +196 -0
- data/doc/rdoc/classes/Hobix/BixWik/IndexEntry.html +170 -0
- data/doc/rdoc/classes/Hobix/BixWik/WikiRedCloth.html +111 -0
- data/doc/rdoc/classes/Hobix/BixWik.html +418 -0
- data/doc/rdoc/classes/Hobix/BixWikPlugin.html +158 -0
- data/doc/rdoc/classes/Hobix/CommandLine.html +1970 -0
- data/doc/rdoc/classes/Hobix/Comment.html +113 -0
- data/doc/rdoc/classes/Hobix/Config.html +212 -0
- data/doc/rdoc/classes/Hobix/DataMarsh.html +667 -0
- data/doc/rdoc/classes/Hobix/Entry.html +178 -0
- data/doc/rdoc/classes/Hobix/EntryEnum.html +162 -0
- data/doc/rdoc/classes/Hobix/Enumerable.html +170 -0
- data/doc/rdoc/classes/Hobix/Facets/WikiEdit.html +180 -0
- data/doc/rdoc/classes/Hobix/Facets.html +111 -0
- data/doc/rdoc/classes/Hobix/LinkList.html +182 -0
- data/doc/rdoc/classes/Hobix/Out/Quick.html +412 -0
- data/doc/rdoc/classes/Hobix/Out.html +119 -0
- data/doc/rdoc/classes/Hobix/Page.html +381 -0
- data/doc/rdoc/classes/Hobix/Trackback.html +113 -0
- data/doc/rdoc/classes/Hobix/UriStr.html +198 -0
- data/doc/rdoc/classes/Hobix/WebApp/QueryString.html +207 -0
- data/doc/rdoc/classes/Hobix/WebApp/QueryValidationFailure.html +111 -0
- data/doc/rdoc/classes/Hobix/WebApp.html +1383 -0
- data/doc/rdoc/classes/Hobix/Weblog/AuthorNotFound.html +111 -0
- data/doc/rdoc/classes/Hobix/Weblog.html +2082 -0
- data/doc/rdoc/classes/Hobix.html +399 -0
- data/doc/rdoc/classes/Kernel.html +139 -0
- data/doc/rdoc/classes/Regexp.html +154 -0
- data/doc/rdoc/classes/YAML/Omap.html +144 -0
- data/doc/rdoc/classes/YAML.html +111 -0
- data/doc/rdoc/created.rid +1 -0
- data/doc/rdoc/files/COPYING.html +129 -0
- data/doc/rdoc/files/README.html +131 -0
- data/doc/rdoc/files/doc/CHANGELOG.html +101 -0
- data/doc/rdoc/files/lib/hobix/api_rb.html +119 -0
- data/doc/rdoc/files/lib/hobix/article_rb.html +126 -0
- data/doc/rdoc/files/lib/hobix/base_rb.html +128 -0
- data/doc/rdoc/files/lib/hobix/bixwik_rb.html +126 -0
- data/doc/rdoc/files/lib/hobix/commandline_rb.html +140 -0
- data/doc/rdoc/files/lib/hobix/comments_rb.html +126 -0
- data/doc/rdoc/files/lib/hobix/config_rb.html +125 -0
- data/doc/rdoc/files/lib/hobix/datamarsh_rb.html +108 -0
- data/doc/rdoc/files/lib/hobix/entry_rb.html +118 -0
- data/doc/rdoc/files/lib/hobix/linklist_rb.html +127 -0
- data/doc/rdoc/files/lib/hobix/publisher_rb.html +126 -0
- data/doc/rdoc/files/lib/hobix/trackbacks_rb.html +128 -0
- data/doc/rdoc/files/lib/hobix/webapp_rb.html +127 -0
- data/doc/rdoc/files/lib/hobix/weblog_rb.html +135 -0
- data/doc/rdoc/files/lib/hobix_rb.html +127 -0
- data/doc/rdoc/fr_class_index.html +67 -0
- data/doc/rdoc/fr_file_index.html +44 -0
- data/doc/rdoc/fr_method_index.html +307 -0
- data/doc/rdoc/index.html +24 -0
- data/doc/rdoc/rdoc-style.css +208 -0
- data/git_hobix_update.php +13 -0
- data/lib/hobix/api.rb +91 -0
- data/lib/hobix/article.rb +22 -0
- data/lib/hobix/base.rb +480 -0
- data/lib/hobix/bixwik.rb +200 -0
- data/lib/hobix/commandline.rb +677 -0
- data/lib/hobix/comments.rb +98 -0
- data/lib/hobix/config.rb +39 -0
- data/lib/hobix/datamarsh.rb +110 -0
- data/lib/hobix/entry.rb +84 -0
- data/lib/hobix/facets/comments.rb +99 -0
- data/lib/hobix/facets/publisher.rb +314 -0
- data/lib/hobix/facets/trackbacks.rb +80 -0
- data/lib/hobix/linklist.rb +81 -0
- data/lib/hobix/out/atom.rb +101 -0
- data/lib/hobix/out/erb.rb +64 -0
- data/lib/hobix/out/okaynews.rb +55 -0
- data/lib/hobix/out/quick.rb +314 -0
- data/lib/hobix/out/rdf.rb +97 -0
- data/lib/hobix/out/redrum.rb +26 -0
- data/lib/hobix/out/rss.rb +128 -0
- data/lib/hobix/plugin/akismet.rb +196 -0
- data/lib/hobix/plugin/bloglines.rb +73 -0
- data/lib/hobix/plugin/calendar.rb +212 -0
- data/lib/hobix/plugin/flickr.rb +110 -0
- data/lib/hobix/plugin/recent_comments.rb +84 -0
- data/lib/hobix/plugin/sections.rb +91 -0
- data/lib/hobix/plugin/tags.rb +60 -0
- data/lib/hobix/publish/ping.rb +53 -0
- data/lib/hobix/publish/replicate.rb +283 -0
- data/lib/hobix/publisher.rb +18 -0
- data/lib/hobix/search/dictionary.rb +141 -0
- data/lib/hobix/search/porter_stemmer.rb +203 -0
- data/lib/hobix/search/simple.rb +209 -0
- data/lib/hobix/search/vector.rb +100 -0
- data/lib/hobix/storage/filesys.rb +408 -0
- data/lib/hobix/trackbacks.rb +93 -0
- data/lib/hobix/util/objedit.rb +193 -0
- data/lib/hobix/util/patcher.rb +155 -0
- data/lib/hobix/webapp/cli.rb +195 -0
- data/lib/hobix/webapp/htmlform.rb +107 -0
- data/lib/hobix/webapp/message.rb +177 -0
- data/lib/hobix/webapp/urigen.rb +141 -0
- data/lib/hobix/webapp/webrick-servlet.rb +90 -0
- data/lib/hobix/webapp.rb +723 -0
- data/lib/hobix/weblog.rb +893 -0
- data/lib/hobix.rb +230 -0
- data/share/default-blog/hobix.yaml +16 -0
- data/share/default-blog/htdocs/site.css +174 -0
- data/share/default-blog/skel/entry.html.quick +0 -0
- data/share/default-blog/skel/index.atom.atom +0 -0
- data/share/default-blog/skel/index.html.quick-summary +0 -0
- data/share/default-blog/skel/index.xml.rss +0 -0
- data/share/default-blog/skel/index.yaml.okaynews +0 -0
- data/share/default-blog/skel/monthly.html.quick-archive +0 -0
- data/share/default-blog/skel/section.html.quick-archive +0 -0
- data/share/default-blog/skel/yearly.html.quick-archive +0 -0
- data/share/default-blog-modes.yaml +7 -0
- data/share/default-blog.apache-cgi.patch +8 -0
- data/share/default-blog.apache-ssi.patch +38 -0
- data/share/default-blog.apache2-ssi.patch +3 -0
- data/share/default-blog.cgi.patch +8 -0
- data/share/default-blog.comments.patch +5 -0
- data/share/default-blog.prototype.patch +766 -0
- data/share/default-blog.publisher.patch +5 -0
- data/share/default-blog.wiki.patch +29 -0
- data/share/publisher/css/control.css +90 -0
- data/share/publisher/css/form.css +238 -0
- data/share/publisher/css/form.import.css +72 -0
- data/share/publisher/css/main-menu.css +134 -0
- data/share/publisher/i/hobix-emblazen-1.png +0 -0
- data/share/publisher/i/hobix-emblazen-2.png +0 -0
- data/share/publisher/i/hobix-emblazen-3.png +0 -0
- data/share/publisher/i/hobix-emblazen-4.png +0 -0
- data/share/publisher/i/hobix-emblazen-5.png +0 -0
- data/share/publisher/i/hobix-emblazen-6.png +0 -0
- data/share/publisher/i/hobix-emblazen-7.png +0 -0
- data/share/publisher/index.erb +66 -0
- data/share/publisher/js/controls.js +261 -0
- data/share/publisher/js/dragdrop.js +476 -0
- data/share/publisher/js/effects.js +570 -0
- data/share/publisher/js/prototype.js +1011 -0
- metadata +230 -0
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
#! /local/ruby/bin/ruby
|
|
2
|
+
#
|
|
3
|
+
# $Id$
|
|
4
|
+
#
|
|
5
|
+
# Lifted from SimpleSearch by Chad Fowler / Dave Thomas / Allen Condit / perhaps other unseeable folks in the distance ...
|
|
6
|
+
#
|
|
7
|
+
# See example usage at the end of this file.
|
|
8
|
+
#
|
|
9
|
+
|
|
10
|
+
module Hobix
module Stemmable

  # Cache of already-stemmed words: original word => frozen stem.
  STEMMED = {}

  # Step 2 suffix replacements (applied when the remaining stem has m > 0).
  STEP_2_LIST = {
    'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
    'izer'=>'ize', 'bli'=>'ble',
    'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
    'ization'=>'ize', 'ation'=>'ate',
    'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
    'ousness'=>'ous', 'aliti'=>'al',
    'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
  }

  # Step 3 suffix replacements (applied when the remaining stem has m > 0).
  STEP_3_LIST = {
    'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
    'ical'=>'ic', 'ful'=>'', 'ness'=>''
  }

  # Matches any Step 2 suffix at the end of the word.
  SUFFIX_1_REGEXP = /(
                    ational  |
                    tional   |
                    enci     |
                    anci     |
                    izer     |
                    bli      |
                    alli     |
                    entli    |
                    eli      |
                    ousli    |
                    ization  |
                    ation    |
                    ator     |
                    alism    |
                    iveness  |
                    fulness  |
                    ousness  |
                    aliti    |
                    iviti    |
                    biliti   |
                    logi)$/x

  # Matches any Step 4 suffix at the end of the word.
  SUFFIX_2_REGEXP = /(
                      al       |
                      ance     |
                      ence     |
                      er       |
                      ic       |
                      able     |
                      ible     |
                      ant      |
                      ement    |
                      ment     |
                      ent      |
                      ou       |
                      ism      |
                      ate      |
                      iti      |
                      ous      |
                      ive      |
                      ize)$/x

  C = "[^aeiou]"                 # consonant
  V = "[aeiouy]"                 # vowel
  CC = "#{C}(?>[^aeiouy]*)"      # consonant sequence
  VV = "#{V}(?>[aeiou]*)"        # vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/o                # [cc]vvcc... is m>0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o       # [cc]vvcc[vv] is m=1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o      # [cc]vvccvvcc... is m>1
  VOWEL_IN_STEM = /^(#{CC})?#{V}/o             # vowel in stem

  #
  # Porter stemmer in Ruby.
  #
  # This is the Porter stemming algorithm, ported to Ruby from the
  # version coded up in Perl.  It's easy to follow against the rules
  # in the original paper in:
  #
  #   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  #   no. 3, pp 130-137,
  #
  # See also http://www.tartarus.org/~martin/PorterStemmer
  #
  # Send comments to raypereda@hotmail.com
  #
  # +w+ is the word to stem; it defaults to a copy of +self.to_str+ so the
  # method can be mixed into string-like objects.  Words shorter than three
  # characters are returned unchanged (the very same object).  Otherwise a
  # new, unfrozen String holding the stem is returned.
  #
  # The argument is never modified, so frozen strings are accepted, and
  # results are memoized in STEMMED under the word exactly as it was passed.
  #
  def stem_porter(w = self.to_str.dup)
    return w if w.length < 3

    # Hand out a copy so callers cannot corrupt the cached stem.
    cached = STEMMED[w]
    return cached.dup if cached

    # Keep the untouched input as the cache key and work on a private copy;
    # several steps below edit the string in place (w[0]=, chop!, <<).
    cache_key = w
    w = w.dup

    # now map initial y to Y so that the patterns never treat it as vowel
    w[0] = 'Y' if w[0] == ?y

    # Step 1a
    if w =~ /(ss|i)es$/
      w = $` + $1
    elsif w =~ /([^s])s$/
      w = $` + $1
    end

    # Step 1b
    if w =~ /eed$/
      w.chop! if $` =~ MGR0
    elsif w =~ /(ed|ing)$/
      stem = $`
      if stem =~ VOWEL_IN_STEM
        w = stem
        case w
        when /(at|bl|iz)$/             then w << "e"
        when /([^aeiouylsz])\1$/       then w.chop!
        when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
        end
      end
    end

    # Step 1c: terminal y becomes i when the stem contains a vowel
    if w =~ /y$/
      stem = $`
      w = stem + "i" if stem =~ VOWEL_IN_STEM
    end

    # Step 2
    if w =~ SUFFIX_1_REGEXP
      stem = $`
      suffix = $1
      if stem =~ MGR0
        w = stem + STEP_2_LIST[suffix]
      end
    end

    # Step 3
    if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
      stem = $`
      suffix = $1
      if stem =~ MGR0
        w = stem + STEP_3_LIST[suffix]
      end
    end

    # Step 4
    if w =~ SUFFIX_2_REGEXP
      stem = $`
      if stem =~ MGR1
        w = stem
      end
    elsif w =~ /(s|t)(ion)$/
      stem = $` + $1
      if stem =~ MGR1
        w = stem
      end
    end

    # Step 5
    if w =~ /e$/
      stem = $`
      if (stem =~ MGR1) ||
          (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
        w = stem
      end
    end

    if w =~ /ll$/ && w =~ MGR1
      w.chop!
    end

    # and turn initial Y back to y
    w[0] = 'y' if w[0] == ?Y

    # Store a frozen copy so the returned string stays independent of the cache.
    STEMMED[cache_key] = w.dup.freeze

    w
  end


  module_function :stem_porter
  #
  # make the stem_porter the default stem method, just in case we
  # feel like having multiple stemmers available later.
  #
  alias stem stem_porter

end
end
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
require 'hobix/search/dictionary'
|
|
2
|
+
require 'hobix/search/vector'
|
|
3
|
+
|
|
4
|
+
module Hobix
|
|
5
|
+
module Search
|
|
6
|
+
module Simple
|
|
7
|
+
# An Array whose items each respond to +mtime+.
class Contents < Array
  # Returns the most recent +mtime+ found among the items, or Time.at(0)
  # when the collection is empty.
  def latest_mtime
    latest = Time.at(0)
    each do |item|
      latest = item.mtime if item.mtime > latest
    end
    # Return the computed maximum explicitly; +each+ on its own would
    # evaluate to the array itself.
    latest
  end
end
|
|
17
|
+
|
|
18
|
+
# Simple record bundling four read/write attributes: +content+,
# +identifier+, +mtime+ and +classifications+.
# NOTE(review): presumably this is the entry shape Searcher#catalog
# expects -- confirm against callers.
class Content
  attr_accessor :content
  attr_accessor :identifier
  attr_accessor :mtime
  attr_accessor :classifications

  # Stores the four values as given; +clsf+ becomes +classifications+.
  def initialize(content, identifier, mtime, clsf)
    @content, @identifier = content, identifier
    @mtime, @classifications = mtime, clsf
  end
end
|
|
27
|
+
|
|
28
|
+
# A single hit: the +name+ it was stored under and its +score+.
SearchResult = Struct.new(:name, :score)

class SearchResult
  # enable sort by score
  #
  # Returns -1, 0 or 1 from comparing the two scores, or nil when +other+
  # has no +score+ to compare against (the usual <=> answer for
  # incomparable operands) instead of raising NoMethodError.
  def <=>(other)
    return nil unless other.respond_to?(:score)
    score <=> other.score
  end
end
|
|
36
|
+
|
|
37
|
+
# Collects the outcome of one search: a list of warning strings and a
# hash of SearchResult objects keyed by name.
class SearchResults
  attr_reader :warnings, :results

  # Starts with no warnings and no results.
  def initialize
    @results = {}
    @warnings = []
  end

  # Appends +txt+ to the warning list.
  def add_warning(txt)
    @warnings.push(txt)
  end

  # Records a SearchResult for +name+, replacing any earlier one with the
  # same name.
  def add_result(name, score)
    @results.store(name, SearchResult.new(name, score))
  end

  # True once at least one result has been added.
  def contains_matches
    @results.size > 0
  end
end
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# Searches and maintains an index made of a word dictionary plus one
# word vector per document, persisted to a cache file via Marshal.
class Searcher

  # +dict+ is the word dictionary, +document_vectors+ a hash of
  # identifier => vector, and +cache_file+ the path used by #dump.
  def initialize(dict, document_vectors, cache_file)
    @dict = dict
    @document_vectors = document_vectors
    @cache_file = cache_file
  end

  # Return SearchResults based on trying to find the array of
  # +words+ in our document vectors
  #
  # A word beginning '+' _must_ appear in the target documents
  # A word beginning '-' <i>must not</i> appear
  # other words are scored. The documents with the highest
  # scores are returned first
  #
  # If any term is missing from the dictionary a warning is recorded and
  # no documents are scored.

  def find_words(words)
    search_results = SearchResults.new

    general = Vector.new
    must_match = Vector.new
    must_not_match = Vector.new
    not_found = false

    extract_words_for_searcher(words.join(' ')) do |word|
      case word[0]
      when ?+
        # Strip only the prefix; keep the rest of the term however long it is.
        word = word[1..-1]
        vector = must_match
      when ?-
        word = word[1..-1]
        vector = must_not_match
      else
        vector = general
      end

      index = @dict.find(word.downcase)
      if index
        vector.add_word_index(index)
      else
        not_found = true
        search_results.add_warning "'#{word}' does not occur in the documents"
      end
    end

    if (general.num_bits + must_match.num_bits).zero?
      search_results.add_warning "No valid search terms given"
    elsif !not_found
      res = []
      # The value is destructured so that either a bare vector or a
      # [vector, mtime] pair is accepted.
      # NOTE(review): #catalog only ever stores bare vectors -- confirm
      # whether the pair form is still needed.
      @document_vectors.each do |entry, (dvec, _mtime)|
        score = dvec.score_against(must_match, must_not_match, general)
        res << [ entry, score ] if score > 0
      end

      # Highest score first.
      res.sort { |a, b| b[1] <=> a[1] }.each do |name, score|
        search_results.add_result(name, score)
      end

      search_results.add_warning "No matches" unless search_results.contains_matches
    end
    search_results
  end


  # Serialization support. At some point we'll need to do incremental indexing.
  # For now, however, the following seems to work fairly effectively
  # on 1000 entry blogs, so I'll defer the change until later.
  #
  # Builds a Searcher from +cache_file+.  When +wash+ is true, or the
  # cache cannot be read, an empty index is created and written straight
  # back to +cache_file+.
  #
  # SECURITY: Marshal.load can instantiate arbitrary objects; the cache
  # file must only ever come from a trusted location.
  def self.load(cache_file, wash=false)
    dict = document_vectors = nil
    loaded = false

    unless wash
      begin
        # Marshal data is binary, so read it in binary mode.
        File.open(cache_file, "rb") do |f|
          dict = Marshal.load(f)
          document_vectors = Marshal.load(f)
          loaded = true
        end
      rescue StandardError
        # Deliberate best effort: a missing, truncated or incompatible
        # cache simply means the index is rebuilt from scratch below.
        loaded = false
      end
    end

    modified = false
    unless loaded
      dict = Dictionary.new
      document_vectors = {}
      modified = true
    end

    s = Searcher.new(dict, document_vectors, cache_file)
    s.dump if modified
    s
  end

  # Writes the dictionary followed by the document vectors to the cache
  # file (binary mode, matching ::load).
  def dump
    File.open(@cache_file, "wb") do |cache|
      Marshal.dump(@dict, cache)
      Marshal.dump(@document_vectors, cache)
    end
  end

  # Yields every searchable token found in +text+: an optional leading
  # '+' or '-', one word character, then at least two more characters
  # drawn from word characters, '-', ':' and '\'.  Requires a block.
  def extract_words_for_searcher(text)
    text.scan(/[-+]?\w[\-\w:\\]{2,}/) do |word|
      yield word
    end
  end

  # True when a vector is stored for +id+ and its +at+ timestamp is at
  # least as recent (to the second) as +mtime+; false otherwise.
  def has_entry? id, mtime
    dvec = @document_vectors[id]
    dvec ? dvec.at.to_i >= mtime.to_i : false
  end

  # Create a new dictionary and document vectors from
  # a blog archive
  #
  # Indexes +entry+ (which must respond to identifier, mtime, content and
  # classifications) unless an up-to-date vector is already stored.

  def catalog(entry)
    unless has_entry? entry.identifier, entry.mtime
      vector = Vector.new
      vector.at = entry.mtime
      extract_words_for_searcher(entry.content.downcase) do |word|
        word_index = @dict.add_word(word, entry.classifications)
        if word_index
          vector.add_word_index(word_index)
        end
      end
      @document_vectors[entry.identifier] = vector
    end
  end

  # Returns a hash of category => score for +text+.  For every category
  # in the dictionary's +clsf+ table the score is the sum, over the
  # tokens of +text+, of log(count / total), where an unseen token counts
  # as 0.1 and total is the sum of that category's word counts.
  def classifications(text)
    score = {}
    @dict.clsf.each do |category, category_words|
      score[category] = 0
      total = category_words.values.inject(0) { |sum, element| sum + element }
      extract_words_for_searcher(text) do |word|
        s = category_words.has_key?(word) ? category_words[word] : 0.1
        score[category] += Math.log(s / total.to_f)
      end
    end
    score
  end

  # Returns the category with the highest score for +text+, or nil when
  # the dictionary has no categories at all.
  def classify(text)
    best = classifications(text).sort_by { |pair| -pair[1] }.first
    best && best[0]
  end
end
|
|
207
|
+
end
|
|
208
|
+
end
|
|
209
|
+
end
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# Maintain a vector of words, where a word is represented by
|
|
2
|
+
# its index in our Dictionary
|
|
3
|
+
#
|
|
4
|
+
module Hobix
|
|
5
|
+
module Search
|
|
6
|
+
module Simple
|
|
7
|
+
# A set of dictionary word indexes stored as the bits of one Integer.
class Vector

  # Free-form timestamp slot; not set by this class.
  attr_accessor :at
  # num_bits: how many bits are set; max_bit: highest set index (-1 when
  # empty); bits: the underlying Integer bit field.
  attr_reader :num_bits, :max_bit, :bits

  def initialize
    @bits = 0
    @max_bit = -1
    @num_bits = 0
  end

  # Marks word +index+ as present.  Adding an index twice has no effect.
  def add_word_index(index)
    return unless @bits[index].zero?

    @bits |= (1 << index)
    @num_bits += 1
    @max_bit = index if @max_bit < index
  end

  # Cosine similarity with +vector+: shared set bits divided by the
  # product of the two magnitudes, where a vector's magnitude is
  # Sqrt(num set bits).  Returns 0.0 when either vector is empty, rather
  # than the NaN that Infinity * 0 would produce.
  def dot(vector)
    return 0.0 if @num_bits.zero? || vector.num_bits.zero?

    # We only need to look up to the end of the shortest vector
    limit = @max_bit
    limit = vector.max_bit if limit > vector.max_bit

    # because both vectors have just ones or zeros in them,
    # we can pre-calculate the AnBn component
    factor = Math.sqrt(1.0/@num_bits) * Math.sqrt(1.0/vector.num_bits)

    factor * count_bits(@bits & vector.bits, limit + 1)
  end

  # We're a document's vector, and we're being matched against
  # three other vectors:
  # 1. A list of <i>must match</i> words
  # 2. A list of <i>must not match</i> words
  # 3. A list of general words. The score we return
  #    is the number of these that we match
  #
  # Returns 0 when the document is excluded.  When must-match words are
  # given and all are present they add 1 to the score.

  def score_against(must_match, must_not_match, general)
    # Eliminate if any _must_not_match_ words found
    unless must_not_match.num_bits.zero?
      return 0 unless (@bits & must_not_match.bits).zero?
    end

    # If the match was entirely negative, then we know we're passed at
    # this point

    if must_match.num_bits.zero? && general.num_bits.zero?
      return 1
    end

    count = 0

    # Eliminate unless all _must_match_ words found

    unless must_match.num_bits.zero?
      return 0 unless (@bits & must_match.bits) == must_match.bits
      count = 1
    end

    # finally score on the rest
    common = general.bits & @bits
    count += count_bits(common, @max_bit+1) unless common.zero?
    count
  end

  private

  # Counts the set bits among the lowest +max_bit+ bits of +word+,
  # working through it in 30-bit slices with a shift-and-mask population
  # count.
  def count_bits(word, max_bit)
    res = 0
    ((max_bit+29)/30).times do |offset|
      x = (word >> (offset*30)) & 0x3fffffff
      next if x.zero?
      x = x - ((x >> 1) & 0x55555555)
      x = (x & 0x33333333) + ((x >> 2) & 0x33333333)
      x = (x + (x >> 4)) & 0x0f0f0f0f
      x = x + (x >> 8)
      x = x + (x >> 16)
      res += x & 0x3f
    end
    res
  end

end
|
|
98
|
+
end
|
|
99
|
+
end
|
|
100
|
+
end
|