hobix 0.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (156) hide show
  1. data/COPYING +18 -0
  2. data/README +18 -0
  3. data/Rakefile +96 -0
  4. data/bin/hobix +94 -0
  5. data/contrib/blosxom-to-hobix.rb +253 -0
  6. data/contrib/txp-to-hobix.rb +56 -0
  7. data/contrib/webrick-all-mine.rb +20 -0
  8. data/doc/CHANGELOG +285 -0
  9. data/doc/rdoc/classes/Hobix/API.html +382 -0
  10. data/doc/rdoc/classes/Hobix/Article.html +111 -0
  11. data/doc/rdoc/classes/Hobix/BaseContent.html +692 -0
  12. data/doc/rdoc/classes/Hobix/BaseEntry.html +218 -0
  13. data/doc/rdoc/classes/Hobix/BaseFacet.html +205 -0
  14. data/doc/rdoc/classes/Hobix/BaseOutput.html +122 -0
  15. data/doc/rdoc/classes/Hobix/BasePlugin.html +201 -0
  16. data/doc/rdoc/classes/Hobix/BaseProperties/ClassMethods.html +243 -0
  17. data/doc/rdoc/classes/Hobix/BaseProperties.html +218 -0
  18. data/doc/rdoc/classes/Hobix/BasePublish.html +157 -0
  19. data/doc/rdoc/classes/Hobix/BaseStorage.html +417 -0
  20. data/doc/rdoc/classes/Hobix/BixWik/Entry.html +196 -0
  21. data/doc/rdoc/classes/Hobix/BixWik/IndexEntry.html +170 -0
  22. data/doc/rdoc/classes/Hobix/BixWik/WikiRedCloth.html +111 -0
  23. data/doc/rdoc/classes/Hobix/BixWik.html +418 -0
  24. data/doc/rdoc/classes/Hobix/BixWikPlugin.html +158 -0
  25. data/doc/rdoc/classes/Hobix/CommandLine.html +1970 -0
  26. data/doc/rdoc/classes/Hobix/Comment.html +113 -0
  27. data/doc/rdoc/classes/Hobix/Config.html +212 -0
  28. data/doc/rdoc/classes/Hobix/DataMarsh.html +667 -0
  29. data/doc/rdoc/classes/Hobix/Entry.html +178 -0
  30. data/doc/rdoc/classes/Hobix/EntryEnum.html +162 -0
  31. data/doc/rdoc/classes/Hobix/Enumerable.html +170 -0
  32. data/doc/rdoc/classes/Hobix/Facets/WikiEdit.html +180 -0
  33. data/doc/rdoc/classes/Hobix/Facets.html +111 -0
  34. data/doc/rdoc/classes/Hobix/LinkList.html +182 -0
  35. data/doc/rdoc/classes/Hobix/Out/Quick.html +412 -0
  36. data/doc/rdoc/classes/Hobix/Out.html +119 -0
  37. data/doc/rdoc/classes/Hobix/Page.html +381 -0
  38. data/doc/rdoc/classes/Hobix/Trackback.html +113 -0
  39. data/doc/rdoc/classes/Hobix/UriStr.html +198 -0
  40. data/doc/rdoc/classes/Hobix/WebApp/QueryString.html +207 -0
  41. data/doc/rdoc/classes/Hobix/WebApp/QueryValidationFailure.html +111 -0
  42. data/doc/rdoc/classes/Hobix/WebApp.html +1383 -0
  43. data/doc/rdoc/classes/Hobix/Weblog/AuthorNotFound.html +111 -0
  44. data/doc/rdoc/classes/Hobix/Weblog.html +2082 -0
  45. data/doc/rdoc/classes/Hobix.html +399 -0
  46. data/doc/rdoc/classes/Kernel.html +139 -0
  47. data/doc/rdoc/classes/Regexp.html +154 -0
  48. data/doc/rdoc/classes/YAML/Omap.html +144 -0
  49. data/doc/rdoc/classes/YAML.html +111 -0
  50. data/doc/rdoc/created.rid +1 -0
  51. data/doc/rdoc/files/COPYING.html +129 -0
  52. data/doc/rdoc/files/README.html +131 -0
  53. data/doc/rdoc/files/doc/CHANGELOG.html +101 -0
  54. data/doc/rdoc/files/lib/hobix/api_rb.html +119 -0
  55. data/doc/rdoc/files/lib/hobix/article_rb.html +126 -0
  56. data/doc/rdoc/files/lib/hobix/base_rb.html +128 -0
  57. data/doc/rdoc/files/lib/hobix/bixwik_rb.html +126 -0
  58. data/doc/rdoc/files/lib/hobix/commandline_rb.html +140 -0
  59. data/doc/rdoc/files/lib/hobix/comments_rb.html +126 -0
  60. data/doc/rdoc/files/lib/hobix/config_rb.html +125 -0
  61. data/doc/rdoc/files/lib/hobix/datamarsh_rb.html +108 -0
  62. data/doc/rdoc/files/lib/hobix/entry_rb.html +118 -0
  63. data/doc/rdoc/files/lib/hobix/linklist_rb.html +127 -0
  64. data/doc/rdoc/files/lib/hobix/publisher_rb.html +126 -0
  65. data/doc/rdoc/files/lib/hobix/trackbacks_rb.html +128 -0
  66. data/doc/rdoc/files/lib/hobix/webapp_rb.html +127 -0
  67. data/doc/rdoc/files/lib/hobix/weblog_rb.html +135 -0
  68. data/doc/rdoc/files/lib/hobix_rb.html +127 -0
  69. data/doc/rdoc/fr_class_index.html +67 -0
  70. data/doc/rdoc/fr_file_index.html +44 -0
  71. data/doc/rdoc/fr_method_index.html +307 -0
  72. data/doc/rdoc/index.html +24 -0
  73. data/doc/rdoc/rdoc-style.css +208 -0
  74. data/git_hobix_update.php +13 -0
  75. data/lib/hobix/api.rb +91 -0
  76. data/lib/hobix/article.rb +22 -0
  77. data/lib/hobix/base.rb +480 -0
  78. data/lib/hobix/bixwik.rb +200 -0
  79. data/lib/hobix/commandline.rb +677 -0
  80. data/lib/hobix/comments.rb +98 -0
  81. data/lib/hobix/config.rb +39 -0
  82. data/lib/hobix/datamarsh.rb +110 -0
  83. data/lib/hobix/entry.rb +84 -0
  84. data/lib/hobix/facets/comments.rb +99 -0
  85. data/lib/hobix/facets/publisher.rb +314 -0
  86. data/lib/hobix/facets/trackbacks.rb +80 -0
  87. data/lib/hobix/linklist.rb +81 -0
  88. data/lib/hobix/out/atom.rb +101 -0
  89. data/lib/hobix/out/erb.rb +64 -0
  90. data/lib/hobix/out/okaynews.rb +55 -0
  91. data/lib/hobix/out/quick.rb +314 -0
  92. data/lib/hobix/out/rdf.rb +97 -0
  93. data/lib/hobix/out/redrum.rb +26 -0
  94. data/lib/hobix/out/rss.rb +128 -0
  95. data/lib/hobix/plugin/akismet.rb +196 -0
  96. data/lib/hobix/plugin/bloglines.rb +73 -0
  97. data/lib/hobix/plugin/calendar.rb +212 -0
  98. data/lib/hobix/plugin/flickr.rb +110 -0
  99. data/lib/hobix/plugin/recent_comments.rb +84 -0
  100. data/lib/hobix/plugin/sections.rb +91 -0
  101. data/lib/hobix/plugin/tags.rb +60 -0
  102. data/lib/hobix/publish/ping.rb +53 -0
  103. data/lib/hobix/publish/replicate.rb +283 -0
  104. data/lib/hobix/publisher.rb +18 -0
  105. data/lib/hobix/search/dictionary.rb +141 -0
  106. data/lib/hobix/search/porter_stemmer.rb +203 -0
  107. data/lib/hobix/search/simple.rb +209 -0
  108. data/lib/hobix/search/vector.rb +100 -0
  109. data/lib/hobix/storage/filesys.rb +408 -0
  110. data/lib/hobix/trackbacks.rb +93 -0
  111. data/lib/hobix/util/objedit.rb +193 -0
  112. data/lib/hobix/util/patcher.rb +155 -0
  113. data/lib/hobix/webapp/cli.rb +195 -0
  114. data/lib/hobix/webapp/htmlform.rb +107 -0
  115. data/lib/hobix/webapp/message.rb +177 -0
  116. data/lib/hobix/webapp/urigen.rb +141 -0
  117. data/lib/hobix/webapp/webrick-servlet.rb +90 -0
  118. data/lib/hobix/webapp.rb +723 -0
  119. data/lib/hobix/weblog.rb +893 -0
  120. data/lib/hobix.rb +230 -0
  121. data/share/default-blog/hobix.yaml +16 -0
  122. data/share/default-blog/htdocs/site.css +174 -0
  123. data/share/default-blog/skel/entry.html.quick +0 -0
  124. data/share/default-blog/skel/index.atom.atom +0 -0
  125. data/share/default-blog/skel/index.html.quick-summary +0 -0
  126. data/share/default-blog/skel/index.xml.rss +0 -0
  127. data/share/default-blog/skel/index.yaml.okaynews +0 -0
  128. data/share/default-blog/skel/monthly.html.quick-archive +0 -0
  129. data/share/default-blog/skel/section.html.quick-archive +0 -0
  130. data/share/default-blog/skel/yearly.html.quick-archive +0 -0
  131. data/share/default-blog-modes.yaml +7 -0
  132. data/share/default-blog.apache-cgi.patch +8 -0
  133. data/share/default-blog.apache-ssi.patch +38 -0
  134. data/share/default-blog.apache2-ssi.patch +3 -0
  135. data/share/default-blog.cgi.patch +8 -0
  136. data/share/default-blog.comments.patch +5 -0
  137. data/share/default-blog.prototype.patch +766 -0
  138. data/share/default-blog.publisher.patch +5 -0
  139. data/share/default-blog.wiki.patch +29 -0
  140. data/share/publisher/css/control.css +90 -0
  141. data/share/publisher/css/form.css +238 -0
  142. data/share/publisher/css/form.import.css +72 -0
  143. data/share/publisher/css/main-menu.css +134 -0
  144. data/share/publisher/i/hobix-emblazen-1.png +0 -0
  145. data/share/publisher/i/hobix-emblazen-2.png +0 -0
  146. data/share/publisher/i/hobix-emblazen-3.png +0 -0
  147. data/share/publisher/i/hobix-emblazen-4.png +0 -0
  148. data/share/publisher/i/hobix-emblazen-5.png +0 -0
  149. data/share/publisher/i/hobix-emblazen-6.png +0 -0
  150. data/share/publisher/i/hobix-emblazen-7.png +0 -0
  151. data/share/publisher/index.erb +66 -0
  152. data/share/publisher/js/controls.js +261 -0
  153. data/share/publisher/js/dragdrop.js +476 -0
  154. data/share/publisher/js/effects.js +570 -0
  155. data/share/publisher/js/prototype.js +1011 -0
  156. metadata +230 -0
@@ -0,0 +1,203 @@
1
+ #! /local/ruby/bin/ruby
2
+ #
3
+ # $Id$
4
+ #
5
+ # Lifted from SimpleSearch by Chad Fowler / Dave Thomas / Allen Condit / perhaps other unseeable folks in the distance ...
6
+ #
7
+ # See example usage at the end of this file.
8
+ #
9
+
10
module Hobix
  # Porter stemmer, usable either as a mixin (the receiver supplies the
  # word through +to_str+) or as a module function taking the word as
  # an argument.
  module Stemmable

    # Memo of words already stemmed (word => stem), shared by every caller.
    STEMMED = {}

    # Step 2 suffix rewrites.
    STEP_2_LIST = {
      'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
      'izer'=>'ize', 'bli'=>'ble',
      'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
      'ization'=>'ize', 'ation'=>'ate',
      'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
      'ousness'=>'ous', 'aliti'=>'al',
      'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
    }

    # Step 3 suffix rewrites.
    STEP_3_LIST = {
      'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
      'ical'=>'ic', 'ful'=>'', 'ness'=>''
    }


    # Matches any suffix that is a key of STEP_2_LIST.
    SUFFIX_1_REGEXP = /(
                      ational  |
                      tional   |
                      enci     |
                      anci     |
                      izer     |
                      bli      |
                      alli     |
                      entli    |
                      eli      |
                      ousli    |
                      ization  |
                      ation    |
                      ator     |
                      alism    |
                      iveness  |
                      fulness  |
                      ousness  |
                      aliti    |
                      iviti    |
                      biliti   |
                      logi)$/x


    # Suffixes removed outright in step 4.
    SUFFIX_2_REGEXP = /(
                        al       |
                        ance     |
                        ence     |
                        er       |
                        ic       |
                        able     |
                        ible     |
                        ant      |
                        ement    |
                        ment     |
                        ent      |
                        ou       |
                        ism      |
                        ate      |
                        iti      |
                        ous      |
                        ive      |
                        ize)$/x


    C = "[^aeiou]"         # consonant
    V = "[aeiouy]"         # vowel
    CC = "#{C}(?>[^aeiouy]*)"  # consonant sequence
    VV = "#{V}(?>[aeiou]*)"    # vowel sequence

    MGR0 = /^(#{CC})?#{VV}#{CC}/o                # [cc]vvcc... is m>0
    MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o       # [cc]vvcc[vv] is m=1
    MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o      # [cc]vvccvvcc... is m>1
    VOWEL_IN_STEM   = /^(#{CC})?#{V}/o           # vowel in stem

    #
    # Porter stemmer in Ruby.
    #
    # This is the Porter stemming algorithm, ported to Ruby from the
    # version coded up in Perl.  It's easy to follow against the rules
    # in the original paper in:
    #
    #   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
    #   no. 3, pp 130-137,
    #
    # See also http://www.tartarus.org/~martin/PorterStemmer
    #
    # Send comments to raypereda@hotmail.com
    #
    # +w+ is the word to stem; it defaults to a copy of the receiver's
    # +to_str+.  Words shorter than three characters are returned
    # untouched.  The argument is never modified, so frozen strings are
    # accepted.  Results are memoized in STEMMED under the word exactly
    # as it was passed in.
    #
    def stem_porter(w = self.to_str.dup)

      original_word = w

      return w if w.length < 3

      # Hand out a copy so callers cannot corrupt the memo by mutating
      # the string they get back.
      cached = STEMMED[original_word]
      return cached.dup if cached

      # Several steps below edit the word in place (w[0]=, chop!, <<),
      # so work on a private copy and leave the caller's string alone.
      w = w.dup

      # now map initial y to Y so that the patterns never treat it as vowel
      w[0] = 'Y' if w[0] == ?y

      # Step 1a
      if w =~ /(ss|i)es$/
        w = $` + $1
      elsif w =~ /([^s])s$/
        w = $` + $1
      end

      # Step 1b
      if w =~ /eed$/
        w.chop! if $` =~ MGR0
      elsif w =~ /(ed|ing)$/
        stem = $`
        if stem =~ VOWEL_IN_STEM
          w = stem
          case w
          when /(at|bl|iz)$/             then w << "e"
          when /([^aeiouylsz])\1$/       then w.chop!
          when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
          end
        end
      end

      if w =~ /y$/
        stem = $`
        w = stem + "i" if stem =~ VOWEL_IN_STEM
      end

      # Step 2
      if w =~ SUFFIX_1_REGEXP
        stem = $`
        suffix = $1
        if stem =~ MGR0
          w = stem + STEP_2_LIST[suffix]
        end
      end

      # Step 3
      if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
        stem = $`
        suffix = $1
        if stem =~ MGR0
          w = stem + STEP_3_LIST[suffix]
        end
      end

      # Step 4
      if w =~ SUFFIX_2_REGEXP
        stem = $`
        if stem =~ MGR1
          w = stem
        end
      elsif w =~ /(s|t)(ion)$/
        stem = $` + $1
        if stem =~ MGR1
          w = stem
        end
      end

      # Step 5
      if w =~ /e$/
        stem = $`
        if (stem =~ MGR1) ||
            (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
          w = stem
        end
      end

      if w =~ /ll$/ && w =~ MGR1
        w.chop!
      end

      # and turn initial Y back to y
      w[0] = 'y' if w[0] == ?Y

      # Keep a frozen private copy in the memo; the caller gets +w+.
      STEMMED[original_word] = w.dup.freeze

      w
    end


    module_function :stem_porter
    #
    # make the stem_porter the default stem method, just in case we
    # feel like having multiple stemmers available later.
    #
    alias stem stem_porter

  end
end
@@ -0,0 +1,209 @@
1
+ require 'hobix/search/dictionary'
2
+ require 'hobix/search/vector'
3
+
4
+ module Hobix
5
+ module Search
6
+ module Simple
7
# An Array of indexable items; each item is expected to respond to
# +mtime+.
class Contents < Array
  # Returns the most recent +mtime+ found among the items.  An empty
  # collection (or one whose items are all at or before the epoch)
  # yields Time.at(0).
  def latest_mtime
    inject(Time.at(0)) do |latest, item|
      item.mtime > latest ? item.mtime : latest
    end
  end
end
17
+
18
# Plain record describing one piece of indexable text: the text
# itself, an identifier for it, its modification time and its
# classifications.
class Content
  attr_accessor :content, :identifier, :mtime, :classifications

  # Stores the four values as given; +clsf+ becomes +classifications+.
  def initialize(content, identifier, mtime, clsf)
    @content, @identifier, @mtime, @classifications =
      content, identifier, mtime, clsf
  end
end
27
+
28
# A single hit: the matched entry's name and its score.
SearchResult = Struct.new(:name, :score) do
  # Order results by score so that collections of them can be sorted.
  def <=>(other)
    score <=> other.score
  end
end
36
+
37
# Outcome of a search: the warnings raised while interpreting the
# query plus the hits, keyed by name.
class SearchResults
  attr_reader :warnings, :results

  def initialize
    @warnings = []
    @results  = {}
  end

  # Record a human-readable warning about the query.
  def add_warning(txt)
    @warnings.push(txt)
  end

  # Record (or replace) the hit for +name+ with the given +score+.
  def add_result(name, score)
    @results.store(name, SearchResult.new(name, score))
  end

  # True once at least one result has been recorded.
  def contains_matches
    @results.any?
  end
end
59
+
60
+
61
# Answers word queries against a dictionary and a table of per-document
# vectors, and persists both to a Marshal cache file.
class Searcher

  # +dict+ maps words to indexes, +document_vectors+ maps entry
  # identifiers to their vectors and +cache_file+ is the path used
  # by #dump.
  def initialize(dict, document_vectors, cache_file)
    @dict = dict
    @document_vectors = document_vectors
    @cache_file = cache_file
  end

  # Return SearchResults based on trying to find the array of
  # +words+ in our document vectors
  #
  # A word beginning '+' _must_ appear in the target documents
  # A word beginning '-' <i>must not</i> appear
  # other words are scored. The documents with the highest
  # scores are returned first
  #
  # If any word is missing from the dictionary a warning is recorded
  # for it and no documents are scored at all.

  def find_words(words)
    search_results = SearchResults.new

    general        = Vector.new
    must_match     = Vector.new
    must_not_match = Vector.new
    not_found      = false

    extract_words_for_searcher(words.join(' ')) do |word|
      case word[0]
      when ?+
        # Drop only the prefix; keep the whole word however long it is.
        word = word[1..-1]
        vector = must_match
      when ?-
        word = word[1..-1]
        vector = must_not_match
      else
        vector = general
      end

      index = @dict.find(word.downcase)
      if index
        vector.add_word_index(index)
      else
        not_found = true
        search_results.add_warning "'#{word}' does not occur in the documents"
      end
    end

    if (general.num_bits + must_match.num_bits).zero?
      search_results.add_warning "No valid search terms given"
    elsif not not_found
      res = []
      # The value is destructured so that both a bare vector and a
      # [vector, mtime] pair are accepted.
      @document_vectors.each do |entry, (dvec, _mtime)|
        score = dvec.score_against(must_match, must_not_match, general)
        res << [ entry, score ] if score > 0
      end

      res.sort {|a,b| b[1] <=> a[1] }.each {|name, score|
        search_results.add_result(name, score)
      }

      search_results.add_warning "No matches" unless search_results.contains_matches
    end
    search_results
  end


  # Serialization support. At some point we'll need to do incremental indexing.
  # For now, however, the following seems to work fairly effectively
  # on 1000 entry blogs, so I'll defer the change until later.
  #
  # Builds a Searcher from +cache_file+.  When +wash+ is true, or the
  # cache is missing, unreadable or corrupt, an empty index is created
  # and written back to +cache_file+ straight away.
  #
  # NOTE(review): Marshal.load executes whatever the file describes, so
  # the cache file must only ever come from a trusted location.
  def Searcher.load(cache_file, wash=false)
    dict = document_vectors = nil
    loaded = false

    unless wash
      begin
        # Marshal data is binary; open the file accordingly.
        File.open(cache_file, "rb") do |f|
          dict = Marshal.load(f)
          document_vectors = Marshal.load(f)
          loaded = true
        end
      rescue StandardError
        # Best effort: any failure to read the cache means "rebuild".
        loaded = false
      end
    end

    unless loaded
      dict = Dictionary.new
      document_vectors = {}
    end

    s = Searcher.new(dict, document_vectors, cache_file)
    s.dump unless loaded
    s
  end

  # Write the dictionary followed by the document vectors to the
  # cache file.
  def dump
    File.open(@cache_file, "wb") do |fileInstance|
      Marshal.dump(@dict, fileInstance)
      Marshal.dump(@document_vectors, fileInstance)
    end
  end

  # Yield each searchable token of +text+: an optional leading '+' or
  # '-', a word character, then at least two more characters drawn
  # from word characters, '-', ':' and '\'.
  def extract_words_for_searcher(text)
    text.scan(/[-+]?\w[\-\w:\\]{2,}/) do |word|
      yield word
    end
  end

  # True when a vector for +id+ is stored whose +at+ time is not older
  # than +mtime+ (compared in whole seconds).
  def has_entry?(id, mtime)
    dvec = @document_vectors[id]
    !!(dvec and dvec.at.to_i >= mtime.to_i)
  end

  # Index +entry+ unless an up-to-date vector for it already exists:
  # each word of its downcased content is added to the dictionary and
  # the resulting vector is stored under the entry's identifier.

  def catalog(entry)
    unless has_entry? entry.identifier, entry.mtime
      vector = Vector.new
      vector.at = entry.mtime
      extract_words_for_searcher(entry.content.downcase) do |word|
        word_index = @dict.add_word(word, entry.classifications)
        if word_index
          vector.add_word_index(word_index)
        end
      end
      @document_vectors[entry.identifier] = vector
    end
  end

  # Score +text+ against every category in the dictionary.  Returns a
  # Hash of category => sum of log(count / total) over the words of
  # +text+, where a word unknown to the category counts as 0.1.
  def classifications(text)
    score = {}
    @dict.clsf.each do |category, category_words|
      score[category] = 0
      total = category_words.values.inject(0) {|sum, element| sum+element}
      extract_words_for_searcher(text) do |word|
        s = category_words.has_key?(word) ? category_words[word] : 0.1
        score[category] += Math.log(s/total.to_f)
      end
    end
    score
  end

  # The best-scoring category for +text+, or nil when the dictionary
  # has no categories.
  def classify(text)
    best = classifications(text).max_by { |_category, value| value }
    best && best[0]
  end
end
207
+ end
208
+ end
209
+ end
@@ -0,0 +1,100 @@
1
+ # Maintain a vector of words, where a word is represented by
2
+ # its index in our Dictionary
3
+ #
4
+ module Hobix
5
+ module Search
6
+ module Simple
7
# A set of word indexes stored as the bits of a single Integer: bit
# +i+ is set when the word with dictionary index +i+ is present.
class Vector

  # Free-form timestamp slot for the owner of the vector.
  attr_accessor :at
  # +num_bits+ is the number of set bits, +max_bit+ the highest set
  # index (-1 when empty) and +bits+ the raw Integer.
  attr_reader :num_bits, :max_bit, :bits

  def initialize
    @bits = 0
    @max_bit = -1
    @num_bits = 0
  end

  # Mark the word at +index+ as present; adding the same index twice
  # has no further effect.
  def add_word_index(index)
    if @bits[index].zero?
      @bits |= (1 << index)
      @num_bits += 1
      @max_bit = index if @max_bit < index
    end
  end

  # Cosine similarity between this vector and +vector+: the number of
  # shared bits divided by the product of the magnitudes.  Returns 0.0
  # when either vector is empty.
  def dot(vector)
    # An empty vector shares nothing with anything; bail out before
    # dividing by a zero bit count.
    return 0.0 if @num_bits.zero? || vector.num_bits.zero?

    # We only need to calculate up to the end of the shortest vector
    limit = @max_bit
    limit = vector.max_bit if limit > vector.max_bit

    # because both vectors have just ones or zeros in them,
    # we can pre-calculate the AnBn component
    # The vector's magnitude is Sqrt(num set bits)
    factor = Math.sqrt(1.0/@num_bits) * Math.sqrt(1.0/vector.num_bits)

    # Bits set in both vectors can only lie at or below +limit+.
    factor * count_bits(@bits & vector.bits, limit + 1)
  end

  # We're a document's vector, and we're being matched against
  # three other vectors:
  # 1. A list of <i>must match</i> words
  # 2. A list of <i>must not match</i> words
  # 3. A list of general words. The score we return
  #    is the number of these that we match
  #    (plus one when a non-empty must-match list is fully satisfied)

  def score_against(must_match, must_not_match, general)
    # Eliminate if any _must_not_match_ words found
    unless must_not_match.num_bits.zero?
      return 0 unless (@bits & must_not_match.bits).zero?
    end

    # If the match was entirely negative, then we know we're passed at
    # this point

    if must_match.num_bits.zero? and general.num_bits.zero?
      return 1
    end

    count = 0

    # Eliminate unless all _must_match_ words found

    unless must_match.num_bits.zero?
      return 0 unless (@bits & must_match.bits) == must_match.bits
      count = 1
    end

    # finally score on the rest
    common = general.bits & @bits
    count += count_bits(common, @max_bit+1) unless common.zero?
    count
  end

  private

  # Number of set bits among the lowest +max_bit+ positions of +word+,
  # counted 30 bits at a time with the usual parallel bit-count masks.
  def count_bits(word, max_bit)
    res = 0
    ((max_bit+29)/30).times do |offset|
      x = (word >> (offset*30)) & 0x3fffffff
      next if x.zero?
      x = x - ((x >> 1) & 0x55555555)
      x = (x & 0x33333333) + ((x >> 2) & 0x33333333)
      x = (x + (x >> 4)) & 0x0f0f0f0f
      x = x + (x >> 8)
      x = x + (x >> 16)
      res += x & 0x3f
    end
    res
  end

end
98
+ end
99
+ end
100
+ end