hobix 0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156) hide show
  1. data/COPYING +18 -0
  2. data/README +18 -0
  3. data/Rakefile +96 -0
  4. data/bin/hobix +94 -0
  5. data/contrib/blosxom-to-hobix.rb +253 -0
  6. data/contrib/txp-to-hobix.rb +56 -0
  7. data/contrib/webrick-all-mine.rb +20 -0
  8. data/doc/CHANGELOG +285 -0
  9. data/doc/rdoc/classes/Hobix/API.html +382 -0
  10. data/doc/rdoc/classes/Hobix/Article.html +111 -0
  11. data/doc/rdoc/classes/Hobix/BaseContent.html +692 -0
  12. data/doc/rdoc/classes/Hobix/BaseEntry.html +218 -0
  13. data/doc/rdoc/classes/Hobix/BaseFacet.html +205 -0
  14. data/doc/rdoc/classes/Hobix/BaseOutput.html +122 -0
  15. data/doc/rdoc/classes/Hobix/BasePlugin.html +201 -0
  16. data/doc/rdoc/classes/Hobix/BaseProperties/ClassMethods.html +243 -0
  17. data/doc/rdoc/classes/Hobix/BaseProperties.html +218 -0
  18. data/doc/rdoc/classes/Hobix/BasePublish.html +157 -0
  19. data/doc/rdoc/classes/Hobix/BaseStorage.html +417 -0
  20. data/doc/rdoc/classes/Hobix/BixWik/Entry.html +196 -0
  21. data/doc/rdoc/classes/Hobix/BixWik/IndexEntry.html +170 -0
  22. data/doc/rdoc/classes/Hobix/BixWik/WikiRedCloth.html +111 -0
  23. data/doc/rdoc/classes/Hobix/BixWik.html +418 -0
  24. data/doc/rdoc/classes/Hobix/BixWikPlugin.html +158 -0
  25. data/doc/rdoc/classes/Hobix/CommandLine.html +1970 -0
  26. data/doc/rdoc/classes/Hobix/Comment.html +113 -0
  27. data/doc/rdoc/classes/Hobix/Config.html +212 -0
  28. data/doc/rdoc/classes/Hobix/DataMarsh.html +667 -0
  29. data/doc/rdoc/classes/Hobix/Entry.html +178 -0
  30. data/doc/rdoc/classes/Hobix/EntryEnum.html +162 -0
  31. data/doc/rdoc/classes/Hobix/Enumerable.html +170 -0
  32. data/doc/rdoc/classes/Hobix/Facets/WikiEdit.html +180 -0
  33. data/doc/rdoc/classes/Hobix/Facets.html +111 -0
  34. data/doc/rdoc/classes/Hobix/LinkList.html +182 -0
  35. data/doc/rdoc/classes/Hobix/Out/Quick.html +412 -0
  36. data/doc/rdoc/classes/Hobix/Out.html +119 -0
  37. data/doc/rdoc/classes/Hobix/Page.html +381 -0
  38. data/doc/rdoc/classes/Hobix/Trackback.html +113 -0
  39. data/doc/rdoc/classes/Hobix/UriStr.html +198 -0
  40. data/doc/rdoc/classes/Hobix/WebApp/QueryString.html +207 -0
  41. data/doc/rdoc/classes/Hobix/WebApp/QueryValidationFailure.html +111 -0
  42. data/doc/rdoc/classes/Hobix/WebApp.html +1383 -0
  43. data/doc/rdoc/classes/Hobix/Weblog/AuthorNotFound.html +111 -0
  44. data/doc/rdoc/classes/Hobix/Weblog.html +2082 -0
  45. data/doc/rdoc/classes/Hobix.html +399 -0
  46. data/doc/rdoc/classes/Kernel.html +139 -0
  47. data/doc/rdoc/classes/Regexp.html +154 -0
  48. data/doc/rdoc/classes/YAML/Omap.html +144 -0
  49. data/doc/rdoc/classes/YAML.html +111 -0
  50. data/doc/rdoc/created.rid +1 -0
  51. data/doc/rdoc/files/COPYING.html +129 -0
  52. data/doc/rdoc/files/README.html +131 -0
  53. data/doc/rdoc/files/doc/CHANGELOG.html +101 -0
  54. data/doc/rdoc/files/lib/hobix/api_rb.html +119 -0
  55. data/doc/rdoc/files/lib/hobix/article_rb.html +126 -0
  56. data/doc/rdoc/files/lib/hobix/base_rb.html +128 -0
  57. data/doc/rdoc/files/lib/hobix/bixwik_rb.html +126 -0
  58. data/doc/rdoc/files/lib/hobix/commandline_rb.html +140 -0
  59. data/doc/rdoc/files/lib/hobix/comments_rb.html +126 -0
  60. data/doc/rdoc/files/lib/hobix/config_rb.html +125 -0
  61. data/doc/rdoc/files/lib/hobix/datamarsh_rb.html +108 -0
  62. data/doc/rdoc/files/lib/hobix/entry_rb.html +118 -0
  63. data/doc/rdoc/files/lib/hobix/linklist_rb.html +127 -0
  64. data/doc/rdoc/files/lib/hobix/publisher_rb.html +126 -0
  65. data/doc/rdoc/files/lib/hobix/trackbacks_rb.html +128 -0
  66. data/doc/rdoc/files/lib/hobix/webapp_rb.html +127 -0
  67. data/doc/rdoc/files/lib/hobix/weblog_rb.html +135 -0
  68. data/doc/rdoc/files/lib/hobix_rb.html +127 -0
  69. data/doc/rdoc/fr_class_index.html +67 -0
  70. data/doc/rdoc/fr_file_index.html +44 -0
  71. data/doc/rdoc/fr_method_index.html +307 -0
  72. data/doc/rdoc/index.html +24 -0
  73. data/doc/rdoc/rdoc-style.css +208 -0
  74. data/git_hobix_update.php +13 -0
  75. data/lib/hobix/api.rb +91 -0
  76. data/lib/hobix/article.rb +22 -0
  77. data/lib/hobix/base.rb +480 -0
  78. data/lib/hobix/bixwik.rb +200 -0
  79. data/lib/hobix/commandline.rb +677 -0
  80. data/lib/hobix/comments.rb +98 -0
  81. data/lib/hobix/config.rb +39 -0
  82. data/lib/hobix/datamarsh.rb +110 -0
  83. data/lib/hobix/entry.rb +84 -0
  84. data/lib/hobix/facets/comments.rb +99 -0
  85. data/lib/hobix/facets/publisher.rb +314 -0
  86. data/lib/hobix/facets/trackbacks.rb +80 -0
  87. data/lib/hobix/linklist.rb +81 -0
  88. data/lib/hobix/out/atom.rb +101 -0
  89. data/lib/hobix/out/erb.rb +64 -0
  90. data/lib/hobix/out/okaynews.rb +55 -0
  91. data/lib/hobix/out/quick.rb +314 -0
  92. data/lib/hobix/out/rdf.rb +97 -0
  93. data/lib/hobix/out/redrum.rb +26 -0
  94. data/lib/hobix/out/rss.rb +128 -0
  95. data/lib/hobix/plugin/akismet.rb +196 -0
  96. data/lib/hobix/plugin/bloglines.rb +73 -0
  97. data/lib/hobix/plugin/calendar.rb +212 -0
  98. data/lib/hobix/plugin/flickr.rb +110 -0
  99. data/lib/hobix/plugin/recent_comments.rb +84 -0
  100. data/lib/hobix/plugin/sections.rb +91 -0
  101. data/lib/hobix/plugin/tags.rb +60 -0
  102. data/lib/hobix/publish/ping.rb +53 -0
  103. data/lib/hobix/publish/replicate.rb +283 -0
  104. data/lib/hobix/publisher.rb +18 -0
  105. data/lib/hobix/search/dictionary.rb +141 -0
  106. data/lib/hobix/search/porter_stemmer.rb +203 -0
  107. data/lib/hobix/search/simple.rb +209 -0
  108. data/lib/hobix/search/vector.rb +100 -0
  109. data/lib/hobix/storage/filesys.rb +408 -0
  110. data/lib/hobix/trackbacks.rb +93 -0
  111. data/lib/hobix/util/objedit.rb +193 -0
  112. data/lib/hobix/util/patcher.rb +155 -0
  113. data/lib/hobix/webapp/cli.rb +195 -0
  114. data/lib/hobix/webapp/htmlform.rb +107 -0
  115. data/lib/hobix/webapp/message.rb +177 -0
  116. data/lib/hobix/webapp/urigen.rb +141 -0
  117. data/lib/hobix/webapp/webrick-servlet.rb +90 -0
  118. data/lib/hobix/webapp.rb +723 -0
  119. data/lib/hobix/weblog.rb +893 -0
  120. data/lib/hobix.rb +230 -0
  121. data/share/default-blog/hobix.yaml +16 -0
  122. data/share/default-blog/htdocs/site.css +174 -0
  123. data/share/default-blog/skel/entry.html.quick +0 -0
  124. data/share/default-blog/skel/index.atom.atom +0 -0
  125. data/share/default-blog/skel/index.html.quick-summary +0 -0
  126. data/share/default-blog/skel/index.xml.rss +0 -0
  127. data/share/default-blog/skel/index.yaml.okaynews +0 -0
  128. data/share/default-blog/skel/monthly.html.quick-archive +0 -0
  129. data/share/default-blog/skel/section.html.quick-archive +0 -0
  130. data/share/default-blog/skel/yearly.html.quick-archive +0 -0
  131. data/share/default-blog-modes.yaml +7 -0
  132. data/share/default-blog.apache-cgi.patch +8 -0
  133. data/share/default-blog.apache-ssi.patch +38 -0
  134. data/share/default-blog.apache2-ssi.patch +3 -0
  135. data/share/default-blog.cgi.patch +8 -0
  136. data/share/default-blog.comments.patch +5 -0
  137. data/share/default-blog.prototype.patch +766 -0
  138. data/share/default-blog.publisher.patch +5 -0
  139. data/share/default-blog.wiki.patch +29 -0
  140. data/share/publisher/css/control.css +90 -0
  141. data/share/publisher/css/form.css +238 -0
  142. data/share/publisher/css/form.import.css +72 -0
  143. data/share/publisher/css/main-menu.css +134 -0
  144. data/share/publisher/i/hobix-emblazen-1.png +0 -0
  145. data/share/publisher/i/hobix-emblazen-2.png +0 -0
  146. data/share/publisher/i/hobix-emblazen-3.png +0 -0
  147. data/share/publisher/i/hobix-emblazen-4.png +0 -0
  148. data/share/publisher/i/hobix-emblazen-5.png +0 -0
  149. data/share/publisher/i/hobix-emblazen-6.png +0 -0
  150. data/share/publisher/i/hobix-emblazen-7.png +0 -0
  151. data/share/publisher/index.erb +66 -0
  152. data/share/publisher/js/controls.js +261 -0
  153. data/share/publisher/js/dragdrop.js +476 -0
  154. data/share/publisher/js/effects.js +570 -0
  155. data/share/publisher/js/prototype.js +1011 -0
  156. metadata +230 -0
@@ -0,0 +1,203 @@
1
+ #! /local/ruby/bin/ruby
2
+ #
3
+ # $Id$
4
+ #
5
+ # Lifted from SimpleSearch by Chad Fowler / Dave Thomas / Allen Condit / perhaps other unseeable folks in the distance ...
6
+ #
7
+ # See example usage at the end of this file.
8
+ #
9
+
10
+ module Hobix
11
# Porter stemmer in Ruby.
#
# This is the Porter stemming algorithm, ported to Ruby from the
# version coded up in Perl.  It's easy to follow against the rules
# in the original paper in:
#
#   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
#   no. 3, pp 130-137,
#
# See also http://www.tartarus.org/~martin/PorterStemmer
#
# Send comments to raypereda@hotmail.com
#
module Stemmable

  # Memo of words already stemmed: original word => (frozen) stem.
  STEMMED = {}

  # Step 2 rewrites: suffix => replacement, applied when the stem has m > 0.
  STEP_2_LIST = {
    'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
    'izer'=>'ize', 'bli'=>'ble',
    'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
    'ization'=>'ize', 'ation'=>'ate',
    'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
    'ousness'=>'ous', 'aliti'=>'al',
    'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
  }

  # Step 3 rewrites: suffix => replacement, applied when the stem has m > 0.
  STEP_3_LIST = {
    'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
    'ical'=>'ic', 'ful'=>'', 'ness'=>''
  }

  # Suffixes handled by step 2 (the keys of STEP_2_LIST).
  SUFFIX_1_REGEXP = /(
                    ational  |
                    tional   |
                    enci     |
                    anci     |
                    izer     |
                    bli      |
                    alli     |
                    entli    |
                    eli      |
                    ousli    |
                    ization  |
                    ation    |
                    ator     |
                    alism    |
                    iveness  |
                    fulness  |
                    ousness  |
                    aliti    |
                    iviti    |
                    biliti   |
                    logi)$/x

  # Suffixes stripped outright by step 4 when the stem has m > 1.
  SUFFIX_2_REGEXP = /(
                      al       |
                      ance     |
                      ence     |
                      er       |
                      ic       |
                      able     |
                      ible     |
                      ant      |
                      ement    |
                      ment     |
                      ent      |
                      ou       |
                      ism      |
                      ate      |
                      iti      |
                      ous      |
                      ive      |
                      ize)$/x

  C = "[^aeiou]"                 # consonant
  V = "[aeiouy]"                 # vowel
  CC = "#{C}(?>[^aeiouy]*)"      # consonant sequence
  VV = "#{V}(?>[aeiou]*)"        # vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/o                # [cc]vvcc... is m>0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o       # [cc]vvcc[vv] is m=1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o      # [cc]vvccvvcc... is m>1
  VOWEL_IN_STEM = /^(#{CC})?#{V}/o             # vowel in stem

  # Return the Porter stem of +w+ (by default, of the receiver's string
  # value).
  #
  # Words shorter than three characters are returned as given.  Results are
  # memoised in STEMMED, keyed by the word exactly as it was passed in.
  # The argument is never modified, and the returned string is always safe
  # for the caller to mutate: the cache keeps its own frozen copy.
  def stem_porter(w = self.to_str.dup)
    return w if w.length < 3

    cached = STEMMED[w]
    return cached.dup if cached

    # Several steps below edit the word in place (w[0]=, chop!, <<), so
    # work on a private copy and remember the untouched word as cache key.
    original_word = w
    w = w.dup

    # now map initial y to Y so that the patterns never treat it as vowel
    w[0] = 'Y' if w[0] == ?y

    # Step 1a
    if w =~ /(ss|i)es$/
      w = $` + $1
    elsif w =~ /([^s])s$/
      w = $` + $1
    end

    # Step 1b
    if w =~ /eed$/
      w.chop! if $` =~ MGR0
    elsif w =~ /(ed|ing)$/
      stem = $`
      if stem =~ VOWEL_IN_STEM
        w = stem
        case w
        when /(at|bl|iz)$/                  then w << "e"
        when /([^aeiouylsz])\1$/            then w.chop!
        when /^#{CC}#{V}[^aeiouwxy]$/o      then w << "e"
        end
      end
    end

    # Step 1c: terminal y becomes i when the stem contains a vowel
    if w =~ /y$/
      stem = $`
      w = stem + "i" if stem =~ VOWEL_IN_STEM
    end

    # Step 2
    if w =~ SUFFIX_1_REGEXP
      stem = $`
      suffix = $1
      if stem =~ MGR0
        w = stem + STEP_2_LIST[suffix]
      end
    end

    # Step 3
    if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
      stem = $`
      suffix = $1
      if stem =~ MGR0
        w = stem + STEP_3_LIST[suffix]
      end
    end

    # Step 4
    if w =~ SUFFIX_2_REGEXP
      stem = $`
      if stem =~ MGR1
        w = stem
      end
    elsif w =~ /(s|t)(ion)$/
      stem = $` + $1
      if stem =~ MGR1
        w = stem
      end
    end

    # Step 5
    if w =~ /e$/
      stem = $`
      if (stem =~ MGR1) ||
          (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
        w = stem
      end
    end

    if w =~ /ll$/ && w =~ MGR1
      w.chop!
    end

    # and turn initial Y back to y
    w[0] = 'y' if w[0] == ?Y

    STEMMED[original_word] = w.dup.freeze

    w
  end

  # Expose Stemmable.stem_porter(word) as a module-level function.
  # (module_function also makes the mixed-in instance copy private.)
  module_function :stem_porter

  #
  # make the stem_porter the default stem method, just in case we
  # feel like having multiple stemmers available later.
  #
  alias stem stem_porter

end
203
+ end
@@ -0,0 +1,209 @@
1
+ require 'hobix/search/dictionary'
2
+ require 'hobix/search/vector'
3
+
4
+ module Hobix
5
+ module Search
6
+ module Simple
7
# An Array of indexable items, each of which responds to +mtime+.
class Contents < Array
  # Return the most recent +mtime+ among the items, or Time.at(0) when
  # the collection is empty.
  #
  # (The loop-based version finished on the +each+ call and therefore
  # returned the array itself instead of the time it had computed.)
  def latest_mtime
    inject(Time.at(0)) do |latest, item|
      item.mtime > latest ? item.mtime : latest
    end
  end
end
17
+
18
# One indexable document: its text, the identifier it is stored under,
# its modification time and the classifications attached to it.
class Content
  attr_accessor :content, :identifier, :mtime, :classifications

  # content::    text to be indexed
  # identifier:: key the document is filed under
  # mtime::      modification time of the document
  # clsf::       classifications, exposed as +classifications+
  def initialize(content, identifier, mtime, clsf)
    @content, @identifier, @mtime, @classifications =
      content, identifier, mtime, clsf
  end
end
27
+
28
# A single hit: the name of the matching entry and its score.
SearchResult = Struct.new(:name, :score) do
  # Order results by score so that collections of them can be sorted.
  def <=>(other)
    score <=> other.score
  end
end
36
+
37
# Accumulates the outcome of one search: warning messages plus the
# matching entries, stored as SearchResult objects keyed by entry name.
class SearchResults
  attr_reader :warnings, :results

  def initialize
    @results  = {}
    @warnings = []
  end

  # Record a warning message for the caller to display.
  def add_warning(txt)
    warnings.push(txt)
  end

  # Record (or overwrite) the hit for +name+ with the given +score+.
  def add_result(name, score)
    results.store(name, SearchResult.new(name, score))
  end

  # True once at least one result has been added.
  def contains_matches
    results.any?
  end
end
59
+
60
+
61
# Indexes entries into a dictionary plus one bit vector per document,
# answers word queries against those vectors, and persists the index
# to a cache file with Marshal.
class Searcher

  # dict::             word dictionary (responds to +find+, +add_word+, +clsf+)
  # document_vectors:: Hash of entry identifier => document vector
  # cache_file::       path the index is dumped to / loaded from
  def initialize(dict, document_vectors, cache_file)
    @dict = dict
    @document_vectors = document_vectors
    @cache_file = cache_file
  end

  # Return SearchResults based on trying to find the array of
  # +words+ in our document vectors
  #
  # A word beginning '+' _must_ appear in the target documents
  # A word beginning '-' <i>must not</i> appear
  # other words are scored. The documents with the highest
  # scores are returned first
  def find_words(words)
    search_results = SearchResults.new

    general        = Vector.new
    must_match     = Vector.new
    must_not_match = Vector.new
    not_found      = false

    extract_words_for_searcher(words.join(' ')) do |word|
      # word[0, 1] is a one-character String on every Ruby version, and
      # word[1..-1] drops only the prefix (no 99-character cut-off).
      case word[0, 1]
      when '+'
        word   = word[1..-1]
        vector = must_match
      when '-'
        word   = word[1..-1]
        vector = must_not_match
      else
        vector = general
      end

      index = @dict.find(word.downcase)
      if index
        vector.add_word_index(index)
      else
        not_found = true
        search_results.add_warning "'#{word}' does not occur in the documents"
      end
    end

    if (general.num_bits + must_match.num_bits).zero?
      search_results.add_warning "No valid search terms given"
    elsif not not_found
      res = []
      @document_vectors.each do |entry, (dvec, _mtime)|
        score = dvec.score_against(must_match, must_not_match, general)
        res << [ entry, score ] if score > 0
      end

      res.sort {|a,b| b[1] <=> a[1] }.each {|name, score|
        search_results.add_result(name, score)
      }

      search_results.add_warning "No matches" unless search_results.contains_matches
    end
    search_results
  end

  # Serialization support. At some point we'll need to do incremental indexing.
  # For now, however, the following seems to work fairly effectively
  # on 1000 entry blogs, so I'll defer the change until later.
  #
  # Load a Searcher from +cache_file+.  When +wash+ is true, or the cache
  # cannot be read, start from an empty index and write it out at once.
  #
  # NOTE(review): Marshal.load executes whatever the file describes; the
  # cache file must only ever be one written by #dump.
  def self.load(cache_file, wash=false)
    dict = document_vectors = nil
    loaded = false

    unless wash
      begin
        # Marshal data is binary, so read it in binary mode.
        File.open(cache_file, "rb") do |f|
          dict = Marshal.load(f)
          document_vectors = Marshal.load(f)
          loaded = true
        end
      rescue
        # Deliberately best-effort: a missing or unreadable cache simply
        # means the index is rebuilt from scratch below.
        loaded = false
      end
    end

    unless loaded
      dict = Dictionary.new
      document_vectors = {}
    end

    s = Searcher.new(dict, document_vectors, cache_file)
    s.dump unless loaded
    s
  end

  # Write the dictionary and the document vectors to the cache file.
  def dump
    File.open(@cache_file, "wb") do |cache|
      Marshal.dump(@dict, cache)
      Marshal.dump(@document_vectors, cache)
    end
  end

  # Yield each searchable token of +text+: an optional leading '+' or '-',
  # one word character, then at least two more characters out of word
  # characters, '-', ':' and '\'.
  def extract_words_for_searcher(text)
    text.scan(/[-+]?\w[\-\w:\\]{2,}/) do |word|
      yield word
    end
  end

  # True when +id+ is already indexed with a vector stamped at or after
  # +mtime+ (compared as whole seconds).
  def has_entry? id, mtime
    dvec = @document_vectors[id]
    dvec && dvec.at.to_i >= mtime.to_i ? true : false
  end

  # Add +entry+ to the dictionary and document vectors unless an
  # up-to-date vector for it is already present.
  def catalog(entry)
    return if has_entry?(entry.identifier, entry.mtime)

    vector = Vector.new
    vector.at = entry.mtime
    extract_words_for_searcher(entry.content.downcase) do |word|
      word_index = @dict.add_word(word, entry.classifications)
      if word_index
        vector.add_word_index(word_index)
      end
    end
    @document_vectors[entry.identifier] = vector
  end

  # Score +text+ against every category in the dictionary: for each
  # category, the sum over the text's words of log(count / total), where
  # a word unknown to the category counts as 0.1.
  # Returns a Hash of category => score.
  def classifications(text)
    # The text does not change between categories, so tokenize it once.
    words = []
    extract_words_for_searcher(text) { |word| words << word }

    score = {}
    @dict.clsf.each do |category, category_words|
      score[category] = 0
      total = category_words.values.inject(0) {|sum, element| sum+element}
      words.each do |word|
        s = category_words.has_key?(word) ? category_words[word] : 0.1
        score[category] += Math.log(s/total.to_f)
      end
    end
    score
  end

  # Return the best-scoring category for +text+, or nil when the
  # dictionary knows no categories at all.
  def classify(text)
    ranked = classifications(text).sort_by { |a| -a[1] }
    ranked.empty? ? nil : ranked[0][0]
  end
end
207
+ end
208
+ end
209
+ end
@@ -0,0 +1,100 @@
1
+ # Maintain a vector of words, where a word is represented by
2
+ # its index in our Dictionary
3
+ #
4
+ module Hobix
5
+ module Search
6
+ module Simple
7
# A set of dictionary word indexes stored as the bits of one Integer:
# bit +i+ is set when the word with index +i+ is present.
class Vector

  # Timestamp attached by the indexer (set from the entry's mtime).
  attr_accessor :at
  # num_bits:: how many bits are set
  # max_bit::  highest bit index set, or -1 when empty
  # bits::     the underlying Integer bitset
  attr_reader :num_bits, :max_bit, :bits

  def initialize
    @bits = 0
    @max_bit = -1
    @num_bits = 0
  end

  # Mark the word with the given dictionary +index+ as present.
  # Adding an index that is already set changes nothing.
  def add_word_index(index)
    return unless @bits[index].zero?

    @bits |= (1 << index)
    @num_bits += 1
    @max_bit = index if @max_bit < index
  end

  # Cosine similarity with another +vector+.
  #
  # because both vectors have just ones or zeros in them, the dot product
  # is the number of shared bits and each magnitude is Sqrt(num set bits).
  # An empty vector has no direction, so the similarity is 0.0 (the plain
  # formula would divide by zero and yield NaN).
  def dot(vector)
    return 0.0 if @num_bits.zero? || vector.num_bits.zero?

    # We only need to look up to the end of the shortest vector
    limit = [@max_bit, vector.max_bit].min

    factor = Math.sqrt(1.0/@num_bits) * Math.sqrt(1.0/vector.num_bits)

    factor * count_bits(@bits & vector.bits, limit + 1)
  end

  # We're a document's vector, and we're being matched against
  # three other vectors:
  # 1. A list of <i>must match</i> words
  # 2. A list of <i>must not match</i> words
  # 3. A list of general words. The score we return
  #    is the number of these that we match
  #    (plus one when a non-empty must-match list is fully satisfied).
  # A score of 0 means the document is rejected.
  def score_against(must_match, must_not_match, general)
    # Eliminate if any _must_not_match_ words found
    unless must_not_match.num_bits.zero?
      return 0 unless (@bits & must_not_match.bits).zero?
    end

    # If the match was entirely negative, then we know we're passed at
    # this point
    if must_match.num_bits.zero? and general.num_bits.zero?
      return 1
    end

    count = 0

    # Eliminate unless all _must_match_ words found
    unless must_match.num_bits.zero?
      return 0 unless (@bits & must_match.bits) == must_match.bits
      count = 1
    end

    # finally score on the rest
    common = general.bits & @bits
    count += count_bits(common, @max_bit+1) unless common.zero?
    count
  end

  private

  # Count the set bits among the lowest +max_bit+ bits of +word+.
  # The value is processed in 30-bit slices, each counted with the usual
  # parallel (mask-and-add) population count.
  def count_bits(word, max_bit)
    res = 0
    ((max_bit+29)/30).times do |offset|
      x = (word >> (offset*30)) & 0x3fffffff
      next if x.zero?
      x = x - ((x >> 1) & 0x55555555)
      x = (x & 0x33333333) + ((x >> 2) & 0x33333333)
      x = (x + (x >> 4)) & 0x0f0f0f0f
      x = x + (x >> 8)
      x = x + (x >> 16)
      res += x & 0x3f
    end
    res
  end

end
98
+ end
99
+ end
100
+ end