fastri 0.1.1.1 → 0.2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,100 @@
1
+ # Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org>
2
+ #
3
+
4
+ require 'fastri/version'
5
+
6
+ module FastRI
7
+
8
+ class FullTextIndexer
9
# A run of ASCII letters, digits and underscores; passed to
# find_suffixes_simple by find_suffixes as the "word" pattern.
+ WORD_RE = /[A-Za-z0-9_]+/
10
# The exact complement of WORD_RE: a run of anything that is not an ASCII
# letter, digit or underscore.
+ NONWORD_RE = /[^A-Za-z0-9_]+/
11
# NUL-terminated header that build_index writes before any document data.
# It embeds FASTRI_FT_INDEX_FORMAT (required from fastri/version).
+ MAGIC = "FastRI full-text index #{FASTRI_FT_INDEX_FORMAT}\0"
12
+
13
# Create an indexer with no documents registered.
#
# max_querysize:: kept in @max_wordsize; build_index uses it as the length of
#                 the slice of full text that serves as sort key for each
#                 suffix.
def initialize(max_querysize)
  @documents, @doc_hash = [], {}
  @max_wordsize = max_querysize
end
18
+
19
# Register (or replace) the document called +name+.
#
# name::     key under which the document is stored and later listed.
# data::     the document text.
# metadata:: arbitrary object stored next to the text (defaults to a fresh
#            empty Hash on every call).
#
# Adding a name that is already known replaces its text and metadata but
# keeps its original position; the name is not appended a second time, so
# the list no longer grows with every re-registration.
#
# Returns the internal list of document names.
def add_document(name, data, metadata = {})
  already_known = @doc_hash.key?(name)
  @doc_hash[name] = [data, metadata]
  @documents << name unless already_known
  @documents
end
23
+
24
# Return the text stored for the document called +name+.
# Raises NoMethodError when no such document has been added.
def data(name)
  entry = @doc_hash[name]
  entry.first
end
27
+
28
# Return the names of the registered documents without repetitions, in the
# order in which each name was first added. The de-duplicated list also
# replaces the internal one.
def documents
  unique_names = @documents.uniq
  @documents = unique_names
end
31
+
32
# Return a copy of +str+ with every NUL character removed. build_index uses
# NUL as the separator between a document's text and its footer, so text
# handed to the indexer must not contain it. +str+ itself is left untouched.
def preprocess(str)
  str.delete("\0")
end
35
+
36
+ require 'strscan'
37
# Return the offsets of the indexable suffixes of +text+, each shifted by
# +offset+, using the default WORD_RE / NONWORD_RE patterns.
def find_suffixes(text, offset)
  word_pattern, separator_pattern = WORD_RE, NONWORD_RE
  find_suffixes_simple(text, word_pattern, separator_pattern, offset)
end
40
+
41
# Return the positions at which a word starts in +string+, each increased by
# +offset+.
#
# string::     text to scan.
# word_re::    pattern consumed as one word; every successful match
#              contributes the position where it started.
# nonword_re:: pattern for the separators skipped between words.
# offset::     value added to every position.
#
# Positions are the byte positions reported by StringScanner#pos, which is
# what build_index packs into the suffix array.
#
# End of input is detected with StringScanner#eos? instead of comparing the
# byte position with String#size (a character count), which never matched on
# multibyte text and made the scan loop forever or emit a bogus final suffix.
# A character matched by neither pattern (or only by an empty match) is
# stepped over, so the scan always makes progress.
def find_suffixes_simple(string, word_re, nonword_re, offset)
  suffixes = []
  sc = StringScanner.new(string)
  until sc.eos?
    # skip returns nil on failure and 0 for an empty match; neither advances.
    next if sc.skip(nonword_re).to_i > 0
    word_start = sc.pos
    if sc.skip(word_re).to_i > 0
      suffixes << offset + word_start
    else
      sc.getch
    end
  end
  suffixes
end
60
+
61
require 'enumerator'

# Write the full-text file and its suffix array for all registered documents.
#
# full_text_IO::    receives MAGIC followed by one record per document:
#                   the document text, a NUL, a 32-bit little-endian length,
#                   and a trailer of that many bytes holding the document
#                   name, a NUL, the marshalled metadata and a final NUL.
# suffix_array_IO:: receives the offsets (into the full text, MAGIC included)
#                   of every suffix reported by find_suffixes, sorted by the
#                   first @max_wordsize bytes found at each offset and packed
#                   as 32-bit little-endian integers.
#
# Returns nil.
#
# Everything is assembled as binary data and measured in bytes. Suffix
# offsets are computed while each record is written instead of re-parsing the
# text afterwards; the old re-parse read the length field with /..../, which
# cannot match a field containing a newline byte (e.g. a four-character
# document name with empty metadata) and measured the trailer in characters.
#
# NOTE(review): document text is written as given; text containing NUL would
# still confuse readers of this format. Callers are presumably expected to
# run it through preprocess first.
def build_index(full_text_IO, suffix_array_IO)
  fulltext = MAGIC.b
  full_text_IO.write MAGIC

  suffixes = []
  documents.each do |doc|
    data, metadata = @doc_hash[doc]
    body = data.to_s.b
    # The record starts where the text written so far ends.
    suffixes.concat find_suffixes(body, fulltext.bytesize)

    trailer = doc.to_s.b << "\0" << Marshal.dump(metadata) << "\0"
    record = body << "\0" << [trailer.bytesize].pack("V") << trailer
    fulltext << record
    full_text_IO.write(record)
  end

  sorted = suffixes.sort_by { |pos| fulltext.byteslice(pos, @max_wordsize) }
  sorted.each_slice(10000) { |slice| suffix_array_IO.write slice.pack("V*") }
  nil
end
98
+ end # class FullTextIndexer
99
+
100
+ end # module FastRI
@@ -3,9 +3,37 @@
3
3
 
4
4
  require 'rdoc/ri/ri_cache'
5
5
  require 'rdoc/ri/ri_reader'
6
+ require 'rdoc/ri/ri_descriptions'
6
7
  require 'fastri/version'
7
8
 
8
9
 
10
+ # This is taken straight from 1.8.5's rdoc/ri/ri_descriptions.rb.
11
+ # Older releases have a buggy #merge_in that crashes when old.comment is nil.
12
if RUBY_RELEASE_DATE < "2006-06-15"
  module ::RI # :nodoc:
    class ModuleDescription # :nodoc:
      remove_method :merge_in

      # Merge another class description into this one: methods, attributes,
      # constants and includes are merged list by list, and the other
      # description's comment is adopted or appended after a rule, without
      # failing when either comment is nil or empty.
      def merge_in(old)
        merge(@class_methods, old.class_methods)
        merge(@instance_methods, old.instance_methods)
        merge(@attributes, old.attributes)
        merge(@constants, old.constants)
        merge(@includes, old.includes)

        if @comment.nil? || @comment.empty?
          @comment = old.comment
        elsif !(old.comment.nil? || old.comment.empty?)
          @comment << SM::Flow::RULE.new
          @comment.concat old.comment
        end
      end
    end
  end
end
35
+
36
+
9
37
  module FastRI
10
38
 
11
39
  # This class provides the same functionality as RiReader, with some
@@ -466,6 +494,8 @@ class RiIndex
466
494
  when /[#.]\S+/
467
495
  method_entry = get_entry(@method_array, entry_or_name, MethodEntry, nil)
468
496
  source_paths_for(method_entry)
497
+ when ""
498
+ []
469
499
  else
470
500
  class_entry = get_entry(@namespace_array, entry_or_name, ClassEntry, nil)
471
501
  source_paths_for(class_entry)
@@ -273,6 +273,12 @@ class RiService
273
273
  m.add_matcher(:partial_ci) do
274
274
  m.yield @ri_reader.methods_under_matching("", /#{sep_re}#{name}/i, true)
275
275
  end
276
+ m.add_matcher(:anywhere) do
277
+ m.yield @ri_reader.methods_under_matching("", /#{sep_re}.*#{name}/, true)
278
+ end
279
+ m.add_matcher(:anywhere_ci) do
280
+ m.yield @ri_reader.methods_under_matching("", /#{sep_re}.*#{name}/i, true)
281
+ end
276
282
  end
277
283
  matcher.get_matches(order)
278
284
  end
@@ -0,0 +1,83 @@
1
+ # Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org>
2
+
3
+ require 'rdoc/ri/ri_paths'
4
+ begin
5
+ require 'rubygems'
6
+ rescue LoadError
7
+ end
8
+
9
+ require 'rdoc/ri/ri_writer'
10
+
11
+ module FastRI
12
+ module Util
13
# Return an array of <tt>[name, version, path]</tt> arrays, sorted by name,
# with one entry for the newest installed version of each gem that has RI
# documentation. +path+ is the expanded base path of the gem's RI data
# (<tt><gem repository>/doc/<name>-<version>/ri</tt>).
#
# The directory name is split at its last hyphen. When it has no hyphen the
# version cannot be determined: the whole directory name is used as +name+
# and +version+ is +nil+.
#
# Every directory listed in Gem.path is searched. Versions are compared as
# Gem::Version objects when they parse as such (so 0.10.0 is newer than
# 0.9.0); unparseable or missing versions rank below any parseable one.
#
# Returns [] when RubyGems is not loaded.
def gem_directories_unique
  return [] unless defined? Gem

  # Gem.path is an Array of repositories: glob each of them on its own.
  gemdirs = Array(Gem.path).map { |repo| Dir["#{repo}/doc/*/ri"] }.flatten

  gems = Hash.new { |h, k| h[k] = [] }
  gemdirs.each do |path|
    # Only look at the directory holding ri/, so hyphens in parent
    # directories cannot be mistaken for the name/version separator.
    dirname = File.basename(File.dirname(path))
    if (m = /\A(.+)-(.*)\z/.match(dirname))
      gems[m[1]] << [m[2], path]
    else # doesn't follow any conventions :(
      gems[dirname] << [nil, path]
    end
  end

  rank = lambda do |version|
    if version && Gem::Version.correct?(version)
      [1, Gem::Version.new(version), version]
    else
      [0, Gem::Version.new("0"), version.to_s]
    end
  end

  gems.sort_by { |name, _| name }.map do |name, versions|
    version, path = versions.max_by { |ver, dir| [rank.call(ver), dir] }
    [name, version, File.expand_path(path)]
  end
end
35
+ module_function :gem_directories_unique
36
+
37
# Return the <tt>[name, version, path]</tt> array for the gem owning the RI
# information stored in +path+, or +nil+ when no known gem directory
# contains it.
#
# path::         file or directory to look up; it is expanded first.
# gem_dir_info:: candidate <tt>[name, version, path]</tt> arrays (defaults
#                to the result of gem_directories_unique).
#
# A gem owns +path+ when its directory is +path+ itself or one of its
# ancestors; a directory that merely shares a textual prefix
# (<tt>.../ri</tt> vs. <tt>.../rival</tt>) does not. When several candidates
# match, the one with the longest directory wins; ties are broken on version
# and name, with a +nil+ version compared as an empty string.
def gem_info_for_path(path, gem_dir_info = FastRI::Util.gem_directories_unique)
  path = File.expand_path(path)
  matches = gem_dir_info.select do |_name, _version, gem_path|
    base = gem_path.chomp("/")
    path == base || path.start_with?(base + "/")
  end
  matches.max_by { |name, version, gem_path| [gem_path.size, version.to_s, name.to_s] }
end
44
+ module_function :gem_info_for_path
45
+
46
# Return the +full_name+ (in ClassEntry or MethodEntry's sense) given a path
# to a .yaml file relative to a "base RI DB path".
#
# * <tt>A/B/cdesc-B.yaml</tt> yields the namespace name <tt>A::B</tt>;
# * <tt>A/B/name-i.yaml</tt> yields the instance method <tt>A::B#name</tt>;
# * <tt>A/B/name-c.yaml</tt> yields the class method <tt>A::B.name</tt>,
#
# where the method name is unescaped with RI::RiWriter.external_to_internal.
# Returns +nil+ for any other path.
def gem_relpath_to_full_name(relpath)
  if (class_match = %r{^(.*)/cdesc-([^/]*)\.yaml$}.match(relpath))
    dir, class_name = class_match.captures
    # The last directory repeats the class name, so it is replaced by it.
    (dir.split(%r{/})[0..-2] << class_name).join("::")
  elsif (method_match = %r{^(.*)/([^/]*)-(i|c)\.yaml$}.match(relpath))
    dir, escaped_name, kind = method_match.captures
    method_name = RI::RiWriter.external_to_internal(escaped_name)
    separator = kind == 'c' ? "." : "#"
    dir.gsub("/", "::") + separator + method_name
  end
end
60
+ module_function :gem_relpath_to_full_name
61
+
62
# Returns the home directory (win32-aware).
#
# Tried in order: the HOME and USERPROFILE environment variables, then
# HOMEDRIVE + HOMEPATH, then <tt>File.expand_path("~")</tt>. If that
# expansion fails, the file-system root is returned ("C:/" where
# File::ALT_SEPARATOR is set, "/" elsewhere).
#
# HOMEDRIVE normally already carries its colon ("C:"), so it is joined to
# HOMEPATH with exactly one colon; blindly inserting another one produced
# "C::\\Users\\...".
def find_home
  # adapted from RubyGems
  %w[HOME USERPROFILE].each do |homekey|
    return ENV[homekey] if ENV[homekey]
  end

  drive, dir = ENV['HOMEDRIVE'], ENV['HOMEPATH']
  return "#{drive.chomp(':')}:#{dir}" if drive && dir

  begin
    File.expand_path("~")
  rescue StandardError
    File::ALT_SEPARATOR ? "C:/" : "/"
  end
end
81
+ module_function :find_home
82
+ end # module Util
83
+ end # module FastRI
@@ -2,7 +2,12 @@
2
2
  #
3
3
 
4
4
module FastRI
  # Release identification.
  FASTRI_VERSION      = "0.2.0"
  FASTRI_RELEASE_DATE = "2006-11-15"

  # Format version of the RI index.
  FASTRI_INDEX_FORMAT = "0.1.0"

  # Format version of the full-text index, as separate components and as the
  # dotted string assembled from them.
  FASTRI_FT_INDEX_FORMAT_MAJOR = "0"
  FASTRI_FT_INDEX_FORMAT_MINOR = "0"
  FASTRI_FT_INDEX_FORMAT_TEENY = "0"
  FASTRI_FT_INDEX_FORMAT = [
    FASTRI_FT_INDEX_FORMAT_MAJOR,
    FASTRI_FT_INDEX_FORMAT_MINOR,
    FASTRI_FT_INDEX_FORMAT_TEENY,
  ].join(".")
end
8
13
  # vi: set sw=2 expandtab:
@@ -0,0 +1,182 @@
1
+ require 'test/unit'
2
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
3
+ $:.unshift "lib"
4
+ require 'fastri/full_text_index'
5
+
6
# Exercises FullTextIndex lookups and match iteration against three small,
# hand-assembled indexes (full text plus packed suffix array).
class TestFullTextIndex < Test::Unit::TestCase
  require 'stringio'
  include FastRI

  magic = FullTextIndexer::MAGIC

  # Joins fixture rows with NUL and terminates the result with a NUL.
  nul_join = lambda { |rows| (rows << "").join("\0") }

  # Packs the offset of the first occurrence of each needle in +text+ as a
  # 32-bit little-endian integer; the needles are listed in sorted order.
  suffix_array = lambda do |text, needles|
    needles.map { |needle| [text.index(needle)].pack("V") }.join("")
  end

  # Two documents with one entry each and empty metadata.
  data = <<EOF
#{magic}this is a test
\r\000\000\000foo.txt\000\004\b{\000
zzzz
\r\000\000\000bar.txt\000\004\b{\000
EOF
  DATA = nul_join.call(data.split(/\n/))
  SUFFIXES = suffix_array.call(DATA, ["a test", "is a", "test", "this", "zzzz"])

  # Same as above, but "this" also occurs at the end of the second document.
  data = <<EOF
#{magic}this is a test
\r\000\000\000foo.txt\000\004\b{\000
zzzz this
\r\000\000\000bar.txt\000\004\b{\000
EOF
  DATA2 = nul_join.call(data.split(/\n/))
  SUFFIXES2 = suffix_array.call(DATA2, ["a test", "is a", "test", "this\0", "this", "zzzz"])

  # Documents carrying real metadata; the SIZn placeholders are replaced by
  # the packed footer lengths once the rows have been measured.
  data = <<EOF
#{magic}this is a test
SIZ1foo.txt\000#{Marshal.dump({:foo => :bar, :bar => 1})}
zzzz this
SIZ2bar.txt\000#{Marshal.dump({:foo => :baz, :bar => 42})}
EOF
  lines = data.split(/\n/)
  [[1, /SIZ1/], [3, /SIZ2/]].each do |row, placeholder|
    footer_len = lines[row].size - 4 + 1
    lines[row].sub!(placeholder, [footer_len].pack("V"))
  end
  DATA3 = nul_join.call(lines)
  SUFFIXES3 = suffix_array.call(DATA3, ["a test", "is a", "test", "this\0", "this", "zzzz"])

  def setup
    fixtures = [[DATA, SUFFIXES], [DATA2, SUFFIXES2], [DATA3, SUFFIXES3]]
    @index, @index2, @index3 = fixtures.map do |text, suffixes|
      FullTextIndex.new_from_ios(StringIO.new(text), StringIO.new(suffixes))
    end
  end

  def test_new_from_ios
    index = nil
    assert_nothing_raised do
      index = FullTextIndex.new_from_ios(StringIO.new(DATA), StringIO.new(SUFFIXES))
    end
    assert_equal(FullTextIndex::DEFAULT_OPTIONS[:max_query_size], index.max_query_size)
  end

  def test_lookup_basic
    %w[this is a test].each do |term|
      hit = @index.lookup(term)
      assert_kind_of(FullTextIndex::Result, hit)
      assert_equal(term, hit.query)
      assert_equal("foo.txt", hit.path)
    end
    [[0, "a"], [2, "t"], [3, "th"]].each do |position, query|
      assert_equal(position, @index.lookup(query).index)
    end

    assert_equal(4, @index.lookup("z").index)
    assert_equal("bar.txt", @index.lookup("z").path)
  end

  def test_lookup_metadata
    [
      [{}, @index, "test"],
      [{}, @index, "zzzz"],
      [{:foo => :bar, :bar => 1}, @index3, "test"],
      [{:foo => :baz, :bar => 42}, @index3, "zzz"],
    ].each do |metadata, index, query|
      assert_equal(metadata, index.lookup(query).metadata)
    end
  end

  def test_Result_text
    [
      ["t", "this", 1],
      ["this", "this", 4],
      ["this is a ", "this", 10],
      ["this is a test ", "th", 100],
      ["test ", "t", 10],
      ["test ", "t", 20],
      ["z", "z", 1],
      ["zzzz", "z", 10],
    ].each do |expected, query, size|
      assert_equal(expected, @index.lookup(query).text(size))
    end
  end

  def test_Result_context
    [
      [" a ", 1],
      ["s a t", 2],
      ["is a te", 3],
      ["s is a test", 5],
      ["this is a test ", 10],
    ].each do |expected, size|
      assert_equal(expected, @index.lookup("a").context(size))
    end
  end

  def test_Result_context_non_initial_entry
    [["zz", 1], ["zzz", 2], ["zzzz", 3], ["zzzz", 4], ["zzzz", 10]].each do |expected, size|
      assert_equal(expected, @index.lookup("z").context(size))
    end
  end

  def test_lookup_nonexistent
    assert_nil(@index.lookup("bogus"))
  end

  def test_next_match_basic
    hit1 = @index2.lookup("t")
    assert_equal("foo.txt", hit1.path)
    assert_equal(2, hit1.index)
    assert_equal("test ", hit1.text(10))

    hit2 = @index2.next_match(hit1)
    assert_equal("bar.txt", hit2.path)
    assert_equal(3, hit2.index)
    assert_equal("this", hit2.text(10))

    hit3 = @index2.next_match(hit2)
    assert_kind_of(FullTextIndex::Result, hit3)
    assert_equal(4, hit3.index)
    assert_equal("this is a ", hit3.text(10))

    assert_nil(@index2.next_match(hit3))
  end

  def test_next_match_restricted
    start = @index2.lookup("t")
    assert_equal("foo.txt", start.path)
    assert_equal(2, start.index)
    assert_equal("test ", start.text(10))

    restricted = @index2.next_match(start, "this is")
    assert_equal("foo.txt", restricted.path)
    assert_equal(4, restricted.index)
    assert_equal("this is a ", restricted.text(10))

    assert_nil(@index2.next_match(start, "foo"))
  end

  def test_next_match_regexp
    start = @index2.lookup("t")
    assert_equal("foo.txt", start.path)
    assert_equal(2, start.index)
    assert_equal("test ", start.text(10))

    matched = @index2.next_match(start, /.*test/)
    assert_equal("foo.txt", matched.path)
    assert_equal(4, matched.index)
    assert_equal("this is a test ", matched.text(20))
  end

  def test_next_matches
    start = @index2.lookup("t")
    hits = [start] + @index2.next_matches(start)
    assert_equal([2, 3, 4], hits.map { |hit| hit.index })
    assert_equal(["foo.txt", "bar.txt", "foo.txt"], hits.map { |hit| hit.path })
    assert_equal(["test ", "this", "this is a test "], hits.map { |hit| hit.text(20) })
  end

  def test_next_matches_restricted
    start = @index2.lookup("t")
    assert_equal([], @index2.next_matches(start, "this is not"))
    hits = @index2.next_matches(start, "this is")
    assert_equal(["foo.txt"], hits.map { |hit| hit.path })
    assert_equal([4], hits.map { |hit| hit.index })
    assert_equal(["this is a test "], hits.map { |hit| hit.text(20) })
  end

  def test_next_matches_regexp
    start = @index2.lookup("t")
    hits = @index2.next_matches(start, /.*test/)
    assert_equal(["foo.txt"], hits.map { |hit| hit.path })
    assert_equal([4], hits.map { |hit| hit.index })
    assert_equal(["this is a test "], hits.map { |hit| hit.text(20) })
  end
end
@@ -0,0 +1,84 @@
1
+ require 'test/unit'
2
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
3
+ $:.unshift "lib"
4
+ require 'fastri/full_text_indexer'
5
+
6
# Exercises FullTextIndexer: document bookkeeping, NUL stripping, suffix
# discovery and the layout of the generated index.
class TestFullTextIndexer < Test::Unit::TestCase
  require 'stringio'
  include FastRI

  DATA1 = "this is a test " * 1000
  DATA2 = "this is another test " * 1000

  def setup
    @indexer = FullTextIndexer.new(20)
  end

  # Re-adding a name replaces its data without listing the name twice.
  def test_add_document
    [
      ["foo.txt", DATA1, ["foo.txt"]],
      ["foo.txt", DATA2, ["foo.txt"]],
      ["bar.txt", DATA2, ["foo.txt", "bar.txt"]],
    ].each do |name, text, listed|
      @indexer.add_document(name, text)
      assert_equal(listed, @indexer.documents)
      assert_equal(text, @indexer.data(name))
    end
  end

  def test_preprocess
    with_nuls = "this is a \0foo bar\0 bla"
    assert_equal("this is a foo bar bla", @indexer.preprocess(with_nuls))
  end

  def test_find_suffixes_simple
    data = <<EOF
this is a simple test with these words: Aaaa01 0.1 _asdA1
EOF
    letters_only = [0, 5, 8, 10, 17, 22, 27, 33, 40, 52]

    assert_equal([0, 5, 8, 10, 17, 22, 27, 33, 40, 47, 49, 51],
                 @indexer.find_suffixes_simple(data, /[A-Za-z0-9_]+/, /[^A-Za-z0-9_]+/,0))
    assert_equal(letters_only,
                 @indexer.find_suffixes_simple(data, /[A-Za-z]+/, /[^A-Za-z]+/, 0))
    assert_equal(letters_only.map { |pos| pos + 10 },
                 @indexer.find_suffixes_simple(data, /[A-Za-z]+/, /[^A-Za-z]+/, 10))
    # A one-character word pattern makes every letter start a suffix.
    assert_equal([0, 1, 2, 3, 5, 6, 8, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20,
                  22, 23, 24, 25, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37,
                  40, 41, 42, 43, 52, 53, 54, 55],
                 @indexer.find_suffixes_simple(data, /[A-Za-z]/, /[^A-Za-z]+/, 0))
    [[[0, 5], 0], [[1, 6], 1]].each do |expected, offset|
      assert_equal(expected, @indexer.find_suffixes_simple("abcd\ndefg", /\S+/, /\s+/, offset))
    end
  end

  def test_build_index_trivial
    @indexer.add_document("foo.txt", DATA1)
    fulltext, suffixarray = StringIO.new(""), StringIO.new("")
    @indexer.build_index(fulltext, suffixarray)
    tail = fulltext.string[-200..-1]
    assert_equal(["\000\r\000\000\000foo.txt\000\004\b{\000\000"],
                 tail.scan(/\0.*$/))
    assert_equal(4000 * 4, suffixarray.string.size)
  end

  # Indexes +data+ as "foo.txt" and checks that the suffix array lists the
  # given suffix positions (relative to the start of +data+) ordered by the
  # character found at each of them.
  def build_index_test_helper(data, suffixes)
    @indexer.add_document("foo.txt", data)
    offset = FullTextIndexer::MAGIC.size
    absolute = suffixes.map { |pos| pos + offset }
    expected = absolute.sort_by { |pos| data[pos - offset] }
    f_io, sa_io = StringIO.new(""), StringIO.new("")
    @indexer.build_index(f_io, sa_io)
    written = sa_io.string.scan(/..../m).map { |packed| packed.unpack("V")[0] }
    assert_equal(expected, written)
  end

  def test_build_index_harder
    word_starts = [0, 2, 6, 12, 18]

    data = <<EOF
a bcd efghi jklmn opqrst
EOF
    build_index_test_helper(data, word_starts)

    data = <<EOF
e xcd afghi zklmn bpqrst
EOF
    build_index_test_helper(data, word_starts)
  end
end