hobix 0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +18 -0
- data/README +18 -0
- data/Rakefile +96 -0
- data/bin/hobix +94 -0
- data/contrib/blosxom-to-hobix.rb +253 -0
- data/contrib/txp-to-hobix.rb +56 -0
- data/contrib/webrick-all-mine.rb +20 -0
- data/doc/CHANGELOG +285 -0
- data/doc/rdoc/classes/Hobix/API.html +382 -0
- data/doc/rdoc/classes/Hobix/Article.html +111 -0
- data/doc/rdoc/classes/Hobix/BaseContent.html +692 -0
- data/doc/rdoc/classes/Hobix/BaseEntry.html +218 -0
- data/doc/rdoc/classes/Hobix/BaseFacet.html +205 -0
- data/doc/rdoc/classes/Hobix/BaseOutput.html +122 -0
- data/doc/rdoc/classes/Hobix/BasePlugin.html +201 -0
- data/doc/rdoc/classes/Hobix/BaseProperties/ClassMethods.html +243 -0
- data/doc/rdoc/classes/Hobix/BaseProperties.html +218 -0
- data/doc/rdoc/classes/Hobix/BasePublish.html +157 -0
- data/doc/rdoc/classes/Hobix/BaseStorage.html +417 -0
- data/doc/rdoc/classes/Hobix/BixWik/Entry.html +196 -0
- data/doc/rdoc/classes/Hobix/BixWik/IndexEntry.html +170 -0
- data/doc/rdoc/classes/Hobix/BixWik/WikiRedCloth.html +111 -0
- data/doc/rdoc/classes/Hobix/BixWik.html +418 -0
- data/doc/rdoc/classes/Hobix/BixWikPlugin.html +158 -0
- data/doc/rdoc/classes/Hobix/CommandLine.html +1970 -0
- data/doc/rdoc/classes/Hobix/Comment.html +113 -0
- data/doc/rdoc/classes/Hobix/Config.html +212 -0
- data/doc/rdoc/classes/Hobix/DataMarsh.html +667 -0
- data/doc/rdoc/classes/Hobix/Entry.html +178 -0
- data/doc/rdoc/classes/Hobix/EntryEnum.html +162 -0
- data/doc/rdoc/classes/Hobix/Enumerable.html +170 -0
- data/doc/rdoc/classes/Hobix/Facets/WikiEdit.html +180 -0
- data/doc/rdoc/classes/Hobix/Facets.html +111 -0
- data/doc/rdoc/classes/Hobix/LinkList.html +182 -0
- data/doc/rdoc/classes/Hobix/Out/Quick.html +412 -0
- data/doc/rdoc/classes/Hobix/Out.html +119 -0
- data/doc/rdoc/classes/Hobix/Page.html +381 -0
- data/doc/rdoc/classes/Hobix/Trackback.html +113 -0
- data/doc/rdoc/classes/Hobix/UriStr.html +198 -0
- data/doc/rdoc/classes/Hobix/WebApp/QueryString.html +207 -0
- data/doc/rdoc/classes/Hobix/WebApp/QueryValidationFailure.html +111 -0
- data/doc/rdoc/classes/Hobix/WebApp.html +1383 -0
- data/doc/rdoc/classes/Hobix/Weblog/AuthorNotFound.html +111 -0
- data/doc/rdoc/classes/Hobix/Weblog.html +2082 -0
- data/doc/rdoc/classes/Hobix.html +399 -0
- data/doc/rdoc/classes/Kernel.html +139 -0
- data/doc/rdoc/classes/Regexp.html +154 -0
- data/doc/rdoc/classes/YAML/Omap.html +144 -0
- data/doc/rdoc/classes/YAML.html +111 -0
- data/doc/rdoc/created.rid +1 -0
- data/doc/rdoc/files/COPYING.html +129 -0
- data/doc/rdoc/files/README.html +131 -0
- data/doc/rdoc/files/doc/CHANGELOG.html +101 -0
- data/doc/rdoc/files/lib/hobix/api_rb.html +119 -0
- data/doc/rdoc/files/lib/hobix/article_rb.html +126 -0
- data/doc/rdoc/files/lib/hobix/base_rb.html +128 -0
- data/doc/rdoc/files/lib/hobix/bixwik_rb.html +126 -0
- data/doc/rdoc/files/lib/hobix/commandline_rb.html +140 -0
- data/doc/rdoc/files/lib/hobix/comments_rb.html +126 -0
- data/doc/rdoc/files/lib/hobix/config_rb.html +125 -0
- data/doc/rdoc/files/lib/hobix/datamarsh_rb.html +108 -0
- data/doc/rdoc/files/lib/hobix/entry_rb.html +118 -0
- data/doc/rdoc/files/lib/hobix/linklist_rb.html +127 -0
- data/doc/rdoc/files/lib/hobix/publisher_rb.html +126 -0
- data/doc/rdoc/files/lib/hobix/trackbacks_rb.html +128 -0
- data/doc/rdoc/files/lib/hobix/webapp_rb.html +127 -0
- data/doc/rdoc/files/lib/hobix/weblog_rb.html +135 -0
- data/doc/rdoc/files/lib/hobix_rb.html +127 -0
- data/doc/rdoc/fr_class_index.html +67 -0
- data/doc/rdoc/fr_file_index.html +44 -0
- data/doc/rdoc/fr_method_index.html +307 -0
- data/doc/rdoc/index.html +24 -0
- data/doc/rdoc/rdoc-style.css +208 -0
- data/git_hobix_update.php +13 -0
- data/lib/hobix/api.rb +91 -0
- data/lib/hobix/article.rb +22 -0
- data/lib/hobix/base.rb +480 -0
- data/lib/hobix/bixwik.rb +200 -0
- data/lib/hobix/commandline.rb +677 -0
- data/lib/hobix/comments.rb +98 -0
- data/lib/hobix/config.rb +39 -0
- data/lib/hobix/datamarsh.rb +110 -0
- data/lib/hobix/entry.rb +84 -0
- data/lib/hobix/facets/comments.rb +99 -0
- data/lib/hobix/facets/publisher.rb +314 -0
- data/lib/hobix/facets/trackbacks.rb +80 -0
- data/lib/hobix/linklist.rb +81 -0
- data/lib/hobix/out/atom.rb +101 -0
- data/lib/hobix/out/erb.rb +64 -0
- data/lib/hobix/out/okaynews.rb +55 -0
- data/lib/hobix/out/quick.rb +314 -0
- data/lib/hobix/out/rdf.rb +97 -0
- data/lib/hobix/out/redrum.rb +26 -0
- data/lib/hobix/out/rss.rb +128 -0
- data/lib/hobix/plugin/akismet.rb +196 -0
- data/lib/hobix/plugin/bloglines.rb +73 -0
- data/lib/hobix/plugin/calendar.rb +212 -0
- data/lib/hobix/plugin/flickr.rb +110 -0
- data/lib/hobix/plugin/recent_comments.rb +84 -0
- data/lib/hobix/plugin/sections.rb +91 -0
- data/lib/hobix/plugin/tags.rb +60 -0
- data/lib/hobix/publish/ping.rb +53 -0
- data/lib/hobix/publish/replicate.rb +283 -0
- data/lib/hobix/publisher.rb +18 -0
- data/lib/hobix/search/dictionary.rb +141 -0
- data/lib/hobix/search/porter_stemmer.rb +203 -0
- data/lib/hobix/search/simple.rb +209 -0
- data/lib/hobix/search/vector.rb +100 -0
- data/lib/hobix/storage/filesys.rb +408 -0
- data/lib/hobix/trackbacks.rb +93 -0
- data/lib/hobix/util/objedit.rb +193 -0
- data/lib/hobix/util/patcher.rb +155 -0
- data/lib/hobix/webapp/cli.rb +195 -0
- data/lib/hobix/webapp/htmlform.rb +107 -0
- data/lib/hobix/webapp/message.rb +177 -0
- data/lib/hobix/webapp/urigen.rb +141 -0
- data/lib/hobix/webapp/webrick-servlet.rb +90 -0
- data/lib/hobix/webapp.rb +723 -0
- data/lib/hobix/weblog.rb +893 -0
- data/lib/hobix.rb +230 -0
- data/share/default-blog/hobix.yaml +16 -0
- data/share/default-blog/htdocs/site.css +174 -0
- data/share/default-blog/skel/entry.html.quick +0 -0
- data/share/default-blog/skel/index.atom.atom +0 -0
- data/share/default-blog/skel/index.html.quick-summary +0 -0
- data/share/default-blog/skel/index.xml.rss +0 -0
- data/share/default-blog/skel/index.yaml.okaynews +0 -0
- data/share/default-blog/skel/monthly.html.quick-archive +0 -0
- data/share/default-blog/skel/section.html.quick-archive +0 -0
- data/share/default-blog/skel/yearly.html.quick-archive +0 -0
- data/share/default-blog-modes.yaml +7 -0
- data/share/default-blog.apache-cgi.patch +8 -0
- data/share/default-blog.apache-ssi.patch +38 -0
- data/share/default-blog.apache2-ssi.patch +3 -0
- data/share/default-blog.cgi.patch +8 -0
- data/share/default-blog.comments.patch +5 -0
- data/share/default-blog.prototype.patch +766 -0
- data/share/default-blog.publisher.patch +5 -0
- data/share/default-blog.wiki.patch +29 -0
- data/share/publisher/css/control.css +90 -0
- data/share/publisher/css/form.css +238 -0
- data/share/publisher/css/form.import.css +72 -0
- data/share/publisher/css/main-menu.css +134 -0
- data/share/publisher/i/hobix-emblazen-1.png +0 -0
- data/share/publisher/i/hobix-emblazen-2.png +0 -0
- data/share/publisher/i/hobix-emblazen-3.png +0 -0
- data/share/publisher/i/hobix-emblazen-4.png +0 -0
- data/share/publisher/i/hobix-emblazen-5.png +0 -0
- data/share/publisher/i/hobix-emblazen-6.png +0 -0
- data/share/publisher/i/hobix-emblazen-7.png +0 -0
- data/share/publisher/index.erb +66 -0
- data/share/publisher/js/controls.js +261 -0
- data/share/publisher/js/dragdrop.js +476 -0
- data/share/publisher/js/effects.js +570 -0
- data/share/publisher/js/prototype.js +1011 -0
- metadata +230 -0
@@ -0,0 +1,203 @@
|
|
1
|
+
#! /local/ruby/bin/ruby
|
2
|
+
#
|
3
|
+
# $Id$
|
4
|
+
#
|
5
|
+
# Lifted from SimpleSearch by Chad Fowler / Dave Thomas / Allen Condit / perhaps other unseeable folks in the distance ...
|
6
|
+
#
|
7
|
+
# (The SimpleSearch original pointed to example usage at the end of the file; this copy does not include one.)
|
8
|
+
#
|
9
|
+
|
10
|
+
module Hobix
|
11
|
+
module Stemmable

  # Memo of words already stemmed: original word => stem.
  STEMMED = {}

  # Step 2 suffix rewrites, applied when the remaining stem matches MGR0.
  STEP_2_LIST = {
    'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
    'izer'=>'ize', 'bli'=>'ble',
    'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
    'ization'=>'ize', 'ation'=>'ate',
    'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
    'ousness'=>'ous', 'aliti'=>'al',
    'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
  }

  # Step 3 suffix rewrites, applied when the remaining stem matches MGR0.
  STEP_3_LIST = {
    'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
    'ical'=>'ic', 'ful'=>'', 'ness'=>''
  }

  # Suffixes handled by step 2 (the keys of STEP_2_LIST).
  SUFFIX_1_REGEXP = /(
                    ational  |
                    tional   |
                    enci     |
                    anci     |
                    izer     |
                    bli      |
                    alli     |
                    entli    |
                    eli      |
                    ousli    |
                    ization  |
                    ation    |
                    ator     |
                    alism    |
                    iveness  |
                    fulness  |
                    ousness  |
                    aliti    |
                    iviti    |
                    biliti   |
                    logi)$/x

  # Suffixes dropped by step 4 when the remaining stem matches MGR1.
  SUFFIX_2_REGEXP = /(
                      al       |
                      ance     |
                      ence     |
                      er       |
                      ic       |
                      able     |
                      ible     |
                      ant      |
                      ement    |
                      ment     |
                      ent      |
                      ou       |
                      ism      |
                      ate      |
                      iti      |
                      ous      |
                      ive      |
                      ize)$/x

  C = "[^aeiou]"                    # consonant
  V = "[aeiouy]"                    # vowel
  CC = "#{C}(?>[^aeiouy]*)"         # consonant sequence
  VV = "#{V}(?>[aeiou]*)"           # vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/o                # [cc]vvcc... is m>0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o       # [cc]vvcc[vv] is m=1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o      # [cc]vvccvvcc... is m>1
  VOWEL_IN_STEM = /^(#{CC})?#{V}/o             # vowel in stem
  # consonant sequence + vowel + one consonant other than w, x or y
  # (this pattern was written inline, twice, in stem_porter)
  CVC = /^#{CC}#{V}[^aeiouwxy]$/o

  #
  # Porter stemmer in Ruby.
  #
  # This is the Porter stemming algorithm, ported to Ruby from the
  # version coded up in Perl.  It's easy to follow against the rules
  # in the original paper in:
  #
  #   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  #   no. 3, pp 130-137,
  #
  # See also http://www.tartarus.org/~martin/PorterStemmer
  #
  # Send comments to raypereda@hotmail.com
  #
  # +w+ is the word to stem; it defaults to a copy of the receiver's
  # string value.  Words shorter than three characters are returned
  # as given.  Otherwise a new String holding the stem is returned.
  #
  # The argument is never modified: all in-place edits below happen on
  # a private copy.  Previously the working string and the memo key
  # were the same object, so in-place edits (initial y => Y, chop!)
  # changed both the caller's string and the key stored in STEMMED.
  # Results are memoized in STEMMED; copies are stored and returned so
  # a caller mutating a result cannot alter the memo.
  #
  def stem_porter(w = self.to_str.dup)
    return w if w.length < 3

    cached = STEMMED[w]
    return cached.dup if cached

    original_word = w
    w = w.dup   # private, unfrozen working copy

    # now map initial y to Y so that the patterns never treat it as vowel
    w[0] = 'Y' if w[0, 1] == 'y'

    # Step 1a
    if w =~ /(ss|i)es$/
      w = $` + $1
    elsif w =~ /([^s])s$/
      w = $` + $1
    end

    # Step 1b
    if w =~ /eed$/
      w.chop! if $` =~ MGR0
    elsif w =~ /(ed|ing)$/
      stem = $`
      if stem =~ VOWEL_IN_STEM
        w = stem
        case w
        when /(at|bl|iz)$/       then w << "e"
        when /([^aeiouylsz])\1$/ then w.chop!
        when CVC                 then w << "e"
        end
      end
    end

    # Trailing y becomes i when the rest of the word contains a vowel
    if w =~ /y$/
      stem = $`
      w = stem + "i" if stem =~ VOWEL_IN_STEM
    end

    # Step 2
    if w =~ SUFFIX_1_REGEXP
      stem = $`
      suffix = $1
      if stem =~ MGR0
        w = stem + STEP_2_LIST[suffix]
      end
    end

    # Step 3
    if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
      stem = $`
      suffix = $1
      if stem =~ MGR0
        w = stem + STEP_3_LIST[suffix]
      end
    end

    # Step 4
    if w =~ SUFFIX_2_REGEXP
      stem = $`
      if stem =~ MGR1
        w = stem
      end
    elsif w =~ /(s|t)(ion)$/
      stem = $` + $1
      if stem =~ MGR1
        w = stem
      end
    end

    # Step 5
    if w =~ /e$/
      stem = $`
      if (stem =~ MGR1) ||
          (stem =~ MEQ1 && stem !~ CVC)
        w = stem
      end
    end

    if w =~ /ll$/ && w =~ MGR1
      w.chop!
    end

    # and turn initial Y back to y
    w[0] = 'y' if w[0, 1] == 'Y'

    STEMMED[original_word] = w.dup

    w
  end


  module_function :stem_porter
  #
  # make the stem_porter the default stem method, just in case we
  # feel like having multiple stemmers available later.
  #
  alias stem stem_porter

end
|
203
|
+
end
|
@@ -0,0 +1,209 @@
|
|
1
|
+
require 'hobix/search/dictionary'
|
2
|
+
require 'hobix/search/vector'
|
3
|
+
|
4
|
+
module Hobix
|
5
|
+
module Search
|
6
|
+
module Simple
|
7
|
+
# An Array of items that each respond to +mtime+.
class Contents < Array
  # Returns the most recent +mtime+ among the items, or Time.at(0)
  # when the collection is empty.
  #
  # The earlier version tracked the maximum in a local variable but
  # ended on the +each+ call, so it returned the collection itself
  # rather than the time.
  def latest_mtime
    inject(Time.at(0)) do |latest, item|
      item.mtime > latest ? item.mtime : latest
    end
  end
end
|
17
|
+
|
18
|
+
# Plain record describing one indexable item: its text, its
# identifier, its modification time and its classifications.
class Content
  attr_accessor :content
  attr_accessor :identifier
  attr_accessor :mtime
  attr_accessor :classifications

  # Stores the four values as given; +clsf+ becomes +classifications+.
  def initialize(content, identifier, mtime, clsf)
    @content, @identifier = content, identifier
    @mtime, @classifications = mtime, clsf
  end
end
|
27
|
+
|
28
|
+
# A single hit: the matched entry's name and its score.
SearchResult = Struct.new(:name, :score) do
  # Order results by score so collections of them can be sorted.
  def <=>(other)
    score <=> other.score
  end
end
|
36
|
+
|
37
|
+
# Collects the outcome of one search: warning messages plus the
# matching entries, stored as SearchResult objects keyed by name.
class SearchResults
  attr_reader :warnings, :results

  def initialize
    @warnings, @results = [], {}
  end

  # Append a warning message.
  def add_warning(txt)
    @warnings.push(txt)
  end

  # Record (or replace) the result for +name+.
  def add_result(name, score)
    @results.store(name, SearchResult.new(name, score))
  end

  # True when at least one result has been recorded.
  def contains_matches
    @results.size > 0
  end
end
|
59
|
+
|
60
|
+
|
61
|
+
# Indexes entries into a dictionary plus one word vector per entry,
# answers word queries against those vectors, and persists both
# structures to a cache file with Marshal.
class Searcher

  # +dict+ is the word dictionary, +document_vectors+ maps entry
  # identifiers to their vectors, +cache_file+ is the path used by
  # #dump.
  def initialize(dict, document_vectors, cache_file)
    @dict = dict
    @document_vectors = document_vectors
    @cache_file = cache_file
  end

  # Return SearchResults based on trying to find the array of
  # +words+ in our document vectors
  #
  # A word beginning '+' _must_ appear in the target documents
  # A word beginning '-' <i>must not</i> appear
  # other words are scored. The documents with the highest
  # scores are returned first
  #
  # If any term is missing from the dictionary a warning is recorded
  # and no documents are scored.
  def find_words(words)
    search_results = SearchResults.new

    general        = Vector.new
    must_match     = Vector.new
    must_not_match = Vector.new
    not_found      = false

    extract_words_for_searcher(words.join(' ')) do |word|
      case word[0, 1]
      when '+'
        # Strip only the prefix.  This used to be word[1,99], which
        # also cut the term to 99 characters so that very long terms
        # could never match their (untruncated) dictionary entry.
        word   = word[1..-1]
        vector = must_match
      when '-'
        word   = word[1..-1]
        vector = must_not_match
      else
        vector = general
      end

      index = @dict.find(word.downcase)
      if index
        vector.add_word_index(index)
      else
        not_found = true
        search_results.add_warning "'#{word}' does not occur in the documents"
      end
    end

    if (general.num_bits + must_match.num_bits).zero?
      search_results.add_warning "No valid search terms given"
    elsif !not_found
      res = []
      @document_vectors.each do |entry, (dvec, _mtime)|
        score = dvec.score_against(must_match, must_not_match, general)
        res << [ entry, score ] if score > 0
      end

      # highest score first
      res.sort { |a, b| b[1] <=> a[1] }.each do |name, score|
        search_results.add_result(name, score)
      end

      search_results.add_warning "No matches" unless search_results.contains_matches
    end
    search_results
  end


  # Serialization support. At some point we'll need to do incremental indexing.
  # For now, however, the following seems to work fairly effectively
  # on 1000 entry blogs, so I'll defer the change until later.
  #
  # Builds a Searcher from +cache_file+.  When +wash+ is true, or the
  # cache cannot be read, an empty index is created and written to
  # +cache_file+ straight away.
  #
  # NOTE(review): Marshal.load will instantiate whatever the file
  # describes, so the cache file must only ever be written by this
  # class.
  def self.load(cache_file, wash=false)
    dict = document_vectors = nil
    loaded = false

    unless wash
      begin
        # Marshal output is binary, so read it in binary mode.
        File.open(cache_file, "rb") do |f|
          dict = Marshal.load(f)
          document_vectors = Marshal.load(f)
          loaded = true
        end
      rescue StandardError
        # Deliberate best effort: a missing or unreadable cache just
        # means we start again from an empty index.
      end
    end

    unless loaded
      dict = Dictionary.new
      document_vectors = {}
    end

    s = Searcher.new(dict, document_vectors, cache_file)
    s.dump unless loaded
    s
  end

  # Write the dictionary and the document vectors to the cache file
  # (binary mode, matching Searcher.load).
  def dump
    File.open(@cache_file, "wb") do |cache|
      Marshal.dump(@dict, cache)
      Marshal.dump(@document_vectors, cache)
    end
  end

  # Yield each search token found in +text+: an optional leading
  # '+' or '-', then a word character followed by at least two more
  # characters drawn from word characters, '-', ':' and '\'.
  def extract_words_for_searcher(text)
    text.scan(/[-+]?\w[\-\w:\\]{2,}/) do |word|
      yield word
    end
  end

  # True when +id+ is already indexed with a timestamp at least as
  # recent as +mtime+ (compared in whole seconds).
  def has_entry? id, mtime
    dvec = @document_vectors[id]
    !!(dvec && dvec.at.to_i >= mtime.to_i)
  end

  # Create a new dictionary and document vectors from
  # a blog archive
  #
  # Indexes +entry+ unless an up-to-date vector already exists for
  # its identifier.
  def catalog(entry)
    unless has_entry? entry.identifier, entry.mtime
      vector = Vector.new
      vector.at = entry.mtime
      extract_words_for_searcher(entry.content.downcase) do |word|
        word_index = @dict.add_word(word, entry.classifications)
        if word_index
          vector.add_word_index(word_index)
        end
      end
      @document_vectors[entry.identifier] = vector
    end
  end

  # Score +text+ against every category known to the dictionary.
  # Returns a Hash of category => sum over the text's words of
  # log(count / total), where a word absent from the category counts
  # as 0.1.
  def classifications(text)
    # Tokenise once rather than once per category.
    words = []
    extract_words_for_searcher(text) { |word| words << word }

    score = Hash.new
    @dict.clsf.each do |category, category_words|
      score[category] = 0
      total = category_words.values.inject(0) { |sum, element| sum + element }
      words.each do |word|
        s = category_words.has_key?(word) ? category_words[word] : 0.1
        score[category] += Math.log(s/total.to_f)
      end
    end
    score
  end

  # Name of the best-scoring category for +text+, or nil when the
  # dictionary holds no categories (this used to raise NoMethodError).
  def classify(text)
    best = classifications(text).sort_by { |_category, score| -score }.first
    best && best[0]
  end
end
|
207
|
+
end
|
208
|
+
end
|
209
|
+
end
|
@@ -0,0 +1,100 @@
|
|
1
|
+
# Maintain a vector of words, where a word is represented by
|
2
|
+
# its index in our Dictionary
|
3
|
+
#
|
4
|
+
module Hobix
|
5
|
+
module Search
|
6
|
+
module Simple
|
7
|
+
# A set of dictionary word indexes stored as the bits of one Integer
# (bit i set means word i is present).
class Vector

  # Timestamp attached by whoever builds the vector.
  attr_accessor :at
  # num_bits: how many bits are set; max_bit: highest set index
  # (-1 when empty); bits: the Integer holding the set.
  attr_reader :num_bits, :max_bit, :bits

  def initialize
    @bits = 0
    @max_bit = -1
    @num_bits = 0
  end

  # Mark word +index+ as present.  Adding an index twice has no effect.
  def add_word_index(index)
    if @bits[index].zero?
      @bits |= (1 << index)
      @num_bits += 1
      @max_bit = index if @max_bit < index
    end
  end

  # Normalised dot product of the two bit vectors: the number of
  # shared set bits divided by Sqrt(num set bits) of each vector.
  # Returns 0.0 when either vector is empty; that case used to
  # evaluate Infinity * 0 and return NaN.
  def dot(vector)
    return 0.0 if @num_bits.zero? || vector.num_bits.zero?

    # We only need to look as far as the end of the shortest vector
    limit = @max_bit
    limit = vector.max_bit if limit > vector.max_bit

    # because both vectors have just ones or zeros in them,
    # we can pre-calculate the AnBn component
    # The vector's magnitude is Sqrt(num set bits)
    factor = Math.sqrt(1.0/@num_bits) * Math.sqrt(1.0/vector.num_bits)

    # Shared words are exactly the set bits of the AND
    factor * count_bits(@bits & vector.bits, limit + 1)
  end

  # We're a document's vector, and we're being matched against
  # three other vectors:
  # 1. A list of <i>must match</i> words
  # 2. A list of <i>must not match</i> words
  # 3. A list of general words. The score we return
  #    is the number of these that we match
  #
  # Returns 0 when the document is eliminated.  A satisfied
  # non-empty must-match list adds 1 to the score.
  def score_against(must_match, must_not_match, general)
    # Eliminate if any _must_not_match_ words found
    unless must_not_match.num_bits.zero?
      return 0 unless (@bits & must_not_match.bits).zero?
    end

    # If the match was entirely negative, then we know we're passed at
    # this point
    if must_match.num_bits.zero? && general.num_bits.zero?
      return 1
    end

    count = 0

    # Eliminate unless all _must_match_ words found
    unless must_match.num_bits.zero?
      return 0 unless (@bits & must_match.bits) == must_match.bits
      count = 1
    end

    # finally score on the rest
    common = general.bits & @bits
    count += count_bits(common, @max_bit+1) unless common.zero?
    count
  end

  private

  # Count the set bits among the lowest +max_bit+ bits of +word+,
  # working through it in 30-bit slices with a parallel bit count.
  def count_bits(word, max_bit)
    res = 0
    ((max_bit+29)/30).times do |offset|
      x = (word >> (offset*30)) & 0x3fffffff
      next if x.zero?
      x = x - ((x >> 1) & 0x55555555)
      x = (x & 0x33333333) + ((x >> 2) & 0x33333333)
      x = (x + (x >> 4)) & 0x0f0f0f0f
      x = x + (x >> 8)
      x = x + (x >> 16)
      res += x & 0x3f
    end
    res
  end

end
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|