pms 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +676 -0
- data/ChangeLog +5 -0
- data/README +71 -0
- data/Rakefile +26 -0
- data/lib/pms.rb +51 -0
- data/lib/pms/ext.rb +49 -0
- data/lib/pms/index.rb +112 -0
- data/lib/pms/proxy.rb +133 -0
- data/lib/pms/version.rb +27 -0
- data/spec/pms/ext_spec.rb +70 -0
- data/spec/pms/index_spec.rb +30 -0
- data/spec/pms/proxy_spec.rb +13 -0
- data/spec/pms_spec.rb +65 -0
- data/spec/spec_helper.rb +8 -0
- data/test_data/fox.txt +7 -0
- metadata +77 -0
data/ChangeLog
ADDED
data/README
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
= pms - Poor Man's Search
|
2
|
+
|
3
|
+
== VERSION
|
4
|
+
|
5
|
+
This documentation refers to pms version 0.0.1
|
6
|
+
|
7
|
+
|
8
|
+
== DESCRIPTION
|
9
|
+
|
10
|
+
Provides a simple searching facility for (nearly) arbitrary input. It allows
|
11
|
+
searching by Strings (whole-token match, ignoring case) or Regexps (full control, but slower).
|
12
|
+
Boolean operators AND, OR, and NOT are supported, as well as proximity
|
13
|
+
operators NEAR (with configurable distance) and ADJACENT (taking order into
|
14
|
+
account). You can chain operators and also group them into sub-queries.
|
15
|
+
|
16
|
+
Inspiration came from a discussion on ruby-talk, starting with message
|
17
|
+
ruby-talk:322014[http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/322014],
|
18
|
+
but PMS has a more document-centric attitude (everything that <tt>input#each</tt>
|
19
|
+
yields is considered a document of its own).
|
20
|
+
|
21
|
+
Example:
|
22
|
+
|
23
|
+
require 'pms/ext'
|
24
|
+
|
25
|
+
text = <<EOT
|
26
|
+
Hello world, how are you today? I said "Hello"
|
27
|
+
to the other guy but he would not answer although
|
28
|
+
all the world could hear me.
|
29
|
+
EOT
|
30
|
+
|
31
|
+
search = text.search('hello').near('world', 3)
|
32
|
+
|
33
|
+
p search.results
|
34
|
+
#=> [0]
|
35
|
+
|
36
|
+
p search.results_with_positions
|
37
|
+
#=> {0=>[0, 8]}
|
38
|
+
|
39
|
+
p search.matches
|
40
|
+
#=> ["Hello world, how are you today? I said \"Hello\"\n"]
|
41
|
+
|
42
|
+
You can find more examples in the <tt>spec/</tt> directory.
|
43
|
+
|
44
|
+
|
45
|
+
== LINKS
|
46
|
+
|
47
|
+
* <http://pms.rubyforge.org/>
|
48
|
+
* <http://rubyforge.org/projects/pms>
|
49
|
+
* <http://github.com/blackwinter/pms>
|
50
|
+
|
51
|
+
|
52
|
+
== AUTHORS
|
53
|
+
|
54
|
+
* Jens Wille <mailto:jens.wille@uni-koeln.de>
|
55
|
+
|
56
|
+
|
57
|
+
== LICENSE AND COPYRIGHT
|
58
|
+
|
59
|
+
Copyright (C) 2008 Jens Wille
|
60
|
+
|
61
|
+
pms is free software: you can redistribute it and/or modify it under the
|
62
|
+
terms of the GNU General Public License as published by the Free Software
|
63
|
+
Foundation, either version 3 of the License, or (at your option) any later
|
64
|
+
version.
|
65
|
+
|
66
|
+
pms is distributed in the hope that it will be useful, but WITHOUT ANY
|
67
|
+
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
68
|
+
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
69
|
+
|
70
|
+
You should have received a copy of the GNU General Public License along with
|
71
|
+
pms. If not, see <http://www.gnu.org/licenses/>.
|
data/Rakefile
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# Load the version constant relative to this Rakefile instead of relying on
# the current directory being part of $LOAD_PATH (it is not from Ruby 1.9.2 on).
require File.expand_path('lib/pms/version', File.dirname(__FILE__))

begin
  require 'hen'

  # The block handed to Hen.lay! returns the project configuration hash
  # (hence the doubled braces: outer = block, inner = hash literal).
  Hen.lay! {{
    :rubyforge => {
      :project  => %q{pms},
      :package  => %q{pms},
      :rdoc_dir => nil
    },

    :gem => {
      :version      => PMS::VERSION,
      :summary      => %q{Poor Man's Search},
      :homepage     => %q{http://pms.rubyforge.org/},
      :files        => FileList['lib/**/*.rb'].to_a,
      :extra_files  => FileList['[A-Z]*', 'spec/**/*.rb', 'test_data/**/*'].to_a,
      :dependencies => %w[]
    }
  }}
rescue LoadError
  # NOTE(review): this also catches a LoadError raised while Hen.lay! runs,
  # not only a missing 'hen' gem -- confirm that is intended.
  abort "Please install the 'hen' gem first."
end
|
25
|
+
|
26
|
+
### Place your custom Rake tasks here.
|
data/lib/pms.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# pms - Poor Man's Search. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2008 Jens Wille #
|
7
|
+
# #
|
8
|
+
# Authors: #
|
9
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
10
|
+
# #
|
11
|
+
# pms is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU General Public License as published by the Free Software #
|
13
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
14
|
+
# version. #
|
15
|
+
# #
|
16
|
+
# pms is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
19
|
+
# details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU General Public License along #
|
22
|
+
# with pms. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
require 'pms/index'
|
28
|
+
require 'pms/proxy'
|
29
|
+
|
30
|
+
# Entry point of Poor Man's Search: wraps an input together with the
# PMS::Index built from it and hands out proxy objects for querying.
class PMS

  # The object that was passed to ::new, and the index built over it.
  attr_reader :input, :index

  # Stores +input+ and builds a PMS::Index for it.
  def initialize(input)
    @input = input
    @index = Index.new(input)
  end

  # Starts a query.
  #
  # With a +token+, returns a TokenProxy for that token. Without one, the
  # given block is run as a sub-query (it is yielded whatever Proxy#and
  # yields) and its result is AND-ed into a fresh Proxy.
  def search(token = nil)
    return TokenProxy.new(self, token) if token

    Proxy.new(self).and { |*args| yield(*args) }
  end

  # All entries known to the index.
  def results
    index.entries
  end

  # Delegates to the index' +matches+ without arguments.
  def matches
    index.matches
  end

end
|
data/lib/pms/ext.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of pms, Poor Man's Search. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2008 Jens Wille #
|
7
|
+
# #
|
8
|
+
# Authors: #
|
9
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
10
|
+
# #
|
11
|
+
# pms is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU General Public License as published by the Free Software #
|
13
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
14
|
+
# version. #
|
15
|
+
# #
|
16
|
+
# pms is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
19
|
+
# details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU General Public License along #
|
22
|
+
# with pms. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
require 'pms'
|
28
|
+
|
29
|
+
# Convenience mixin: gives every class listed in RECEIVERS a +search+
# method that wraps the receiver in a PMS instance.
module PMS::Ext

  # Classes this module is included into when the file is loaded.
  RECEIVERS = [String, IO, Array].freeze

  # Builds a PMS over +self+ and starts a search on it.
  #
  # All arguments *and the block* are forwarded to PMS#search, so both the
  # token form (<tt>obj.search('foo')</tt>) and the sub-query block form
  # (<tt>obj.search { |pms| ... }</tt>) work. Without forwarding the block,
  # the block form could never reach PMS#search's +yield+.
  def search(*args, &block)
    PMS.new(self).search(*args, &block)
  end

  RECEIVERS.each { |klass|
    klass.send(:include, self)
  }

end
|
42
|
+
|
43
|
+
class File

  # Opens +file+ and runs +search+ on the open handle, returning its result.
  #
  # +file+ may be a path or any object responding to +path+ (its path is
  # re-opened). Remaining arguments and the block are forwarded to the
  # handle's +search+ method, so the sub-query block form works here too.
  #
  # NOTE(review): the handle is closed as soon as the block returns, while
  # PMS::Index#get_documents re-reads its input lazily; calling +matches+ on
  # the returned proxy afterwards presumably hits a closed stream -- verify.
  def self.search(file, *args, &block)
    path = file.respond_to?(:path) ? file.path : file
    File.open(path) { |f| f.search(*args, &block) }
  end

end
|
data/lib/pms/index.rb
ADDED
@@ -0,0 +1,112 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of pms, Poor Man's Search. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2008 Jens Wille #
|
7
|
+
# #
|
8
|
+
# Authors: #
|
9
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
10
|
+
# #
|
11
|
+
# pms is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU General Public License as published by the Free Software #
|
13
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
14
|
+
# version. #
|
15
|
+
# #
|
16
|
+
# pms is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
19
|
+
# details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU General Public License along #
|
22
|
+
# with pms. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
class PMS

  # Positional inverted index over the documents of an input.
  #
  # Every object yielded by the input is one document; documents are numbered
  # from 0 in iteration order. Each document is split into tokens with
  # TOKEN_RE, and for every (downcased) token the index records the document
  # numbers it occurs in and its token positions (0-based) in each of them.
  class Index

    # What counts as a token inside a document.
    TOKEN_RE = %r{\w+}

    # +input+::   the object passed to ::new
    # +index+::   Hash: mangled token => { doc number => [positions] }
    # +entries+:: Array of all document numbers
    attr_reader :input, :index, :entries

    # Builds the index for +input+ right away.
    #
    # +input+ must respond to +each+; objects that only respond to
    # +each_line+ (e.g. String on Ruby >= 1.9) are accepted as well and are
    # iterated line by line.
    #
    # Raises ArgumentError if +input+ offers neither method.
    def initialize(input)
      unless input.respond_to?(:each) || input.respond_to?(:each_line)
        raise ArgumentError, "input must implement #each"
      end

      @input = input
      @index = Hash.new { |h, k| h[k] = Hash.new { |i, j| i[j] = [] } }

      build_index
    end

    # Returns a Hash <tt>{ doc number => [positions] }</tt> for +token+.
    #
    # A String is looked up as a single token (case-insensitively, see
    # #mangle_token); a Regexp is matched against every indexed token and the
    # positions of all matching tokens are merged per document.
    #
    # Raises TypeError for any other kind of +token+.
    def doc_nums_with_positions(token)
      case token
        when String
          key = mangle_token(token)

          # Check for the key first: a plain index[key] would run the default
          # proc and leave an empty entry behind for every unknown token.
          index.key?(key) ? index[key] : {}
        when Regexp
          res = {}

          index.each { |key, value|
            res.update(value) { |_, old, new| old | new } if key =~ token
          }

          res
        else
          raise TypeError, "expected String or Regexp, got #{token.class}"
      end
    end

    alias_method :results_with_positions, :doc_nums_with_positions

    # Returns the numbers of all documents containing +token+.
    def doc_nums(token)
      doc_nums_with_positions(token).keys
    end

    alias_method :results, :doc_nums

    # Returns the documents themselves: all of them when called without
    # argument, otherwise those for the given +doc_nums+ (in that order).
    #
    # Documents are read from the input on first use and cached afterwards.
    def documents(doc_nums = nil)
      @documents ||= get_documents
      doc_nums ? doc_nums.map { |doc_num| @documents[doc_num] } : @documents
    end

    alias_method :matches, :documents

    # Returns the single document with number +doc_num+.
    def doc(doc_num)
      documents([doc_num]).first
    end

    alias_method :[], :doc

    private

    # Walks the input once, numbering documents and recording token positions.
    def build_index
      @documents, @entries = nil, []
      doc_num = -1

      each_document { |doc|
        @entries << (doc_num += 1)
        pos = -1

        doc.scan(TOKEN_RE) { |token|
          index[mangle_token(token)][doc_num] << (pos += 1)
        }
      }
    end

    # Reads all documents from the input again (rewinding it first if it
    # supports that, since #build_index has already consumed it).
    def get_documents
      input.rewind if input.respond_to?(:rewind)

      docs = []
      each_document { |doc| docs << doc }
      docs
    end

    # Yields each document of the input, preferring +each+ and falling back
    # to +each_line+ for inputs that lack it.
    def each_document(&block)
      if input.respond_to?(:each)
        input.each(&block)
      else
        input.each_line(&block)
      end
    end

    # Normalizes a token for storage and lookup.
    def mangle_token(token)
      token.downcase
    end

  end

end
|
data/lib/pms/proxy.rb
ADDED
@@ -0,0 +1,133 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of pms, Poor Man's Search. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2008 Jens Wille #
|
7
|
+
# #
|
8
|
+
# Authors: #
|
9
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
10
|
+
# #
|
11
|
+
# pms is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU General Public License as published by the Free Software #
|
13
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
14
|
+
# version. #
|
15
|
+
# #
|
16
|
+
# pms is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
19
|
+
# details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU General Public License along #
|
22
|
+
# with pms. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
class PMS

  # Chainable query object: holds the current result set (document numbers)
  # and narrows or widens it with the boolean operators #and, #or and #not.
  class Proxy

    # +pms+::     the PMS instance this query runs against
    # +index+::   that instance's index
    # +results+:: document numbers currently matching the query
    attr_reader :pms, :index, :results

    # Starts out with every document of +pms+ as result.
    def initialize(pms)
      @pms     = pms
      @index   = pms.index
      @results = pms.results
    end

    # Keeps only results that also match +token+, or -- when no token is
    # given -- the sub-query built by the block (which is yielded +pms+ and
    # must return a Proxy). Returns +self+ for chaining.
    def and(token = nil, &block)
      token ? apply_operator_with_token('and', token) :
              apply_operator_with_block('and', &block)
    end

    # Adds the documents matching +token+ (or the block's sub-query) to the
    # results. Returns +self+ for chaining.
    def or(token = nil, &block)
      token ? apply_operator_with_token('or', token) :
              apply_operator_with_block('or', &block)
    end

    # Removes the documents matching +token+ (or the block's sub-query) from
    # the results. Returns +self+ for chaining.
    def not(token = nil, &block)
      token ? apply_operator_with_token('not', token) :
              apply_operator_with_block('not', &block)
    end

    # The documents belonging to the current results.
    def matches
      index.matches(results)
    end

    private

    # Applies +op+ with the documents the index reports for +token+.
    def apply_operator_with_token(op, token)
      apply_operator(op, index.results(token))
    end

    # Applies +op+ with the results of the sub-query returned by the block.
    #
    # Raises RuntimeError if the block returns anything but a Proxy.
    def apply_operator_with_block(op)
      case sub = yield(pms)
        when Proxy
          apply_operator(op, sub.results)
        else
          raise "sub-query must return a PMS::Proxy object (got #{sub.class})"
      end
    end

    # Combines the current results with +doc_nums+ according to +op+
    # ('and' = intersection, 'or' = union, 'not' = difference).
    #
    # Raises ArgumentError for an unknown operator.
    def apply_operator(op, doc_nums)
      case op = op.to_s.downcase
        when 'and'
          @results &= doc_nums
        when 'or'
          @results |= doc_nums
        when 'not'
          @results -= doc_nums
        else
          raise ArgumentError, "invalid operator '#{op}'"
      end

      self  # allow chaining!
    end

  end

  # Proxy for a query that starts from a single token; in addition to the
  # boolean operators it supports the proximity operators #near and #adjacent.
  class TokenProxy < Proxy

    # +token+::                  the token this proxy was created for
    # +results_with_positions+:: <tt>{ doc number => [positions] }</tt> of
    #                            that token, as reported by the index
    attr_reader :token, :results_with_positions

    # Looks +token+ up in the index of +pms+; the initial results are the
    # documents containing it.
    def initialize(pms, token)
      super(pms)

      @token = token

      @results_with_positions = index.results_with_positions(token)
      @results = @results_with_positions.keys
    end

    # Keeps only results in which +token+ occurs at most +distance+ token
    # positions away from an occurrence of this proxy's own token. If +order+
    # is true, +token+ must not precede it. Returns +self+ for chaining.
    def near(token, distance = 1, order = false)
      results1 = results_with_positions
      results2 = index.results_with_positions(token)

      doc_nums = (results1.keys & results2.keys).select { |doc_num|
        positions = results2[doc_num]

        results1[doc_num].any? { |pos1|
          positions.any? { |pos2|
            diff = pos2 - pos1

            # An occurrence in front of pos1 only disqualifies itself; later
            # occurrences still have to be checked, so skip instead of
            # abandoning the whole scan.
            next false if order && diff < 0

            diff.abs <= distance
          }
        }
      }

      apply_operator('and', doc_nums)
    end

    # Like #near, but +token+ has to follow this proxy's token.
    def adjacent(token, distance = 1)
      near(token, distance, true)
    end

  end

end
|