pms 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +676 -0
- data/ChangeLog +5 -0
- data/README +71 -0
- data/Rakefile +26 -0
- data/lib/pms.rb +51 -0
- data/lib/pms/ext.rb +49 -0
- data/lib/pms/index.rb +112 -0
- data/lib/pms/proxy.rb +133 -0
- data/lib/pms/version.rb +27 -0
- data/spec/pms/ext_spec.rb +70 -0
- data/spec/pms/index_spec.rb +30 -0
- data/spec/pms/proxy_spec.rb +13 -0
- data/spec/pms_spec.rb +65 -0
- data/spec/spec_helper.rb +8 -0
- data/test_data/fox.txt +7 -0
- metadata +77 -0
data/ChangeLog
ADDED
data/README
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
= pms - Poor Man's Search
|
2
|
+
|
3
|
+
== VERSION
|
4
|
+
|
5
|
+
This documentation refers to pms version 0.0.1
|
6
|
+
|
7
|
+
|
8
|
+
== DESCRIPTION
|
9
|
+
|
10
|
+
Provides a simple searching facility for (nearly) arbitrary input. It allows
|
11
|
+
searching by Strings (exact, case-insensitive token match) or Regexps (full control, but slower).
|
12
|
+
Boolean operators AND, OR, and NOT are supported, as well as proximity
|
13
|
+
operators NEAR (with configurable distance) and ADJACENT (taking order into
|
14
|
+
account). You can chain operators and also group them into sub-queries.
|
15
|
+
|
16
|
+
Inspiration came from a discussion on ruby-talk, starting with message
|
17
|
+
ruby-talk:322014[http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/322014],
|
18
|
+
but PMS has a more document-centric attitude (everything that <tt>input#each</tt>
|
19
|
+
yields is considered a document of its own).
|
20
|
+
|
21
|
+
Example:
|
22
|
+
|
23
|
+
require 'pms/ext'
|
24
|
+
|
25
|
+
text = <<EOT
|
26
|
+
Hello world, how are you today? I said "Hello"
|
27
|
+
to the other guy but he would not answer although
|
28
|
+
all the world could hear me.
|
29
|
+
EOT
|
30
|
+
|
31
|
+
search = text.search('hello').near('world', 3)
|
32
|
+
|
33
|
+
p search.results
|
34
|
+
#=> [0]
|
35
|
+
|
36
|
+
p search.results_with_positions
|
37
|
+
#=> {0=>[0, 8]}
|
38
|
+
|
39
|
+
p search.matches
|
40
|
+
#=> ["Hello world, how are you today? I said \"Hello\"\n"]
|
41
|
+
|
42
|
+
You can find more examples in the <tt>spec/</tt> directory.
|
43
|
+
|
44
|
+
|
45
|
+
== LINKS
|
46
|
+
|
47
|
+
* <http://pms.rubyforge.org/>
|
48
|
+
* <http://rubyforge.org/projects/pms>
|
49
|
+
* <http://github.com/blackwinter/pms>
|
50
|
+
|
51
|
+
|
52
|
+
== AUTHORS
|
53
|
+
|
54
|
+
* Jens Wille <mailto:jens.wille@uni-koeln.de>
|
55
|
+
|
56
|
+
|
57
|
+
== LICENSE AND COPYRIGHT
|
58
|
+
|
59
|
+
Copyright (C) 2008 Jens Wille
|
60
|
+
|
61
|
+
pms is free software: you can redistribute it and/or modify it under the
|
62
|
+
terms of the GNU General Public License as published by the Free Software
|
63
|
+
Foundation, either version 3 of the License, or (at your option) any later
|
64
|
+
version.
|
65
|
+
|
66
|
+
pms is distributed in the hope that it will be useful, but WITHOUT ANY
|
67
|
+
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
68
|
+
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
69
|
+
|
70
|
+
You should have received a copy of the GNU General Public License along with
|
71
|
+
pms. If not, see <http://www.gnu.org/licenses/>.
|
data/Rakefile
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# Load the version constant relative to this Rakefile. A bare
# <tt>require 'lib/pms/version'</tt> only works while the current directory
# is part of the load path, which is no longer the case on Ruby >= 1.9.2.
require File.expand_path('../lib/pms/version', __FILE__)

begin
  require 'hen'

  # Project description handed to Hen; the block returns the configuration
  # hash (hence the doubled braces).
  Hen.lay! {{
    :rubyforge => {
      :project  => %q{pms},
      :package  => %q{pms},
      :rdoc_dir => nil
    },

    :gem => {
      :version      => PMS::VERSION,
      :summary      => %q{Poor Man's Search},
      :homepage     => %q{http://pms.rubyforge.org/},
      :files        => FileList['lib/**/*.rb'].to_a,
      :extra_files  => FileList['[A-Z]*', 'spec/**/*.rb', 'test_data/**/*'].to_a,
      :dependencies => %w[]
    }
  }}
rescue LoadError
  # Any LoadError raised inside the begin block ends up here.
  abort "Please install the 'hen' gem first."
end
|
25
|
+
|
26
|
+
### Place your custom Rake tasks here.
|
data/lib/pms.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# pms - Poor Man's Search. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2008 Jens Wille #
|
7
|
+
# #
|
8
|
+
# Authors: #
|
9
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
10
|
+
# #
|
11
|
+
# pms is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU General Public License as published by the Free Software #
|
13
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
14
|
+
# version. #
|
15
|
+
# #
|
16
|
+
# pms is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
19
|
+
# details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU General Public License along #
|
22
|
+
# with pms. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
require 'pms/index'
|
28
|
+
require 'pms/proxy'
|
29
|
+
|
30
|
+
# Entry point of the library: wraps an input collection together with the
# PMS::Index built from it and hands out query proxies.
class PMS

  # input:: the object that was passed to PMS.new
  # index:: the PMS::Index built from +input+
  attr_reader :input, :index

  # Stores +input+ and builds a PMS::Index over it.
  def initialize(input)
    @input = input
    @index = Index.new(input)
  end

  # Starts a query.
  #
  # With a +token+, returns a PMS::TokenProxy for that token. Without one,
  # the given block is yielded to by way of PMS::Proxy#and (sub-query form)
  # and the resulting proxy is returned.
  def search(token = nil)
    return TokenProxy.new(self, token) if token

    Proxy.new(self).and { |*args| yield(*args) }
  end

  # The index's +entries+ (one document number per indexed document).
  def results
    index.entries
  end

  # The index's +matches+ called without arguments.
  def matches
    index.matches
  end

end
|
data/lib/pms/ext.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of pms, Poor Man's Search. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2008 Jens Wille #
|
7
|
+
# #
|
8
|
+
# Authors: #
|
9
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
10
|
+
# #
|
11
|
+
# pms is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU General Public License as published by the Free Software #
|
13
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
14
|
+
# version. #
|
15
|
+
# #
|
16
|
+
# pms is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
19
|
+
# details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU General Public License along #
|
22
|
+
# with pms. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
require 'pms'
|
28
|
+
|
29
|
+
# Convenience mixin that adds a +search+ method to the classes listed in
# RECEIVERS, so that e.g. <tt>"some text".search('token')</tt> works.
module PMS::Ext

  # Classes that get PMS::Ext included when this file is loaded.
  RECEIVERS = [String, IO, Array].freeze

  # Builds a PMS over the receiver and starts a search on it.
  #
  # All arguments *and the block* are passed on to PMS#search; the block is
  # required for the token-less sub-query form
  # (<tt>obj.search { |q| ... }</tt>), which previously failed because the
  # block was not forwarded.
  def search(*args, &block)
    PMS.new(self).search(*args, &block)
  end

  RECEIVERS.each { |klass| klass.send(:include, self) }

end
|
42
|
+
|
43
|
+
class File

  # Opens +file+ and runs +search+ on the opened handle.
  #
  # file:: a path, or any object responding to +path+ (e.g. a File)
  # args:: passed on to the handle's +search+ method
  #
  # The block, if any, is forwarded as well so that the sub-query form
  # (<tt>File.search(path) { |q| ... }</tt>) works; it used to be dropped.
  #
  # Returns whatever the handle's +search+ returns.
  #
  # NOTE(review): the handle is closed once this method returns. If the
  # returned object reads from the handle lazily later on, that read will
  # happen on a closed stream -- confirm against the index implementation.
  def self.search(file, *args, &block)
    path = file.respond_to?(:path) ? file.path : file
    File.open(path) { |handle| handle.search(*args, &block) }
  end

end
|
data/lib/pms/index.rb
ADDED
@@ -0,0 +1,112 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of pms, Poor Man's Search. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2008 Jens Wille #
|
7
|
+
# #
|
8
|
+
# Authors: #
|
9
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
10
|
+
# #
|
11
|
+
# pms is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU General Public License as published by the Free Software #
|
13
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
14
|
+
# version. #
|
15
|
+
# #
|
16
|
+
# pms is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
19
|
+
# details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU General Public License along #
|
22
|
+
# with pms. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
class PMS

  # Inverted index over the documents yielded by an input object.
  #
  # Every object yielded by the input is one document; documents are numbered
  # from 0 in iteration order. Each document is split into tokens (TOKEN_RE),
  # tokens are down-cased, and for every token the index records, per
  # document number, the zero-based token positions at which it occurs:
  #
  #   index  #=> { 'hello' => { 0 => [0, 8] }, ... }
  class Index

    # What counts as a token inside a document.
    TOKEN_RE = %r{\w+}

    # input::   the object the index was built from
    # index::   token => { doc_num => [positions] }
    # entries:: all document numbers, in iteration order
    attr_reader :input, :index, :entries

    # Builds the index from +input+.
    #
    # +input+ must respond to +each+; objects that only respond to
    # +each_line+ (such as Strings on Ruby >= 1.9) are iterated line by line.
    #
    # Raises ArgumentError if +input+ supports neither.
    def initialize(input)
      iterable = input.respond_to?(:each) || input.respond_to?(:each_line)
      raise ArgumentError, "input must implement #each" unless iterable

      @input = input
      @index = Hash.new { |h, k| h[k] = Hash.new { |i, j| i[j] = [] } }

      build_index
    end

    # Returns a Hash mapping document numbers to the positions of +token+.
    #
    # String:: looked up as a single (down-cased) token; an unknown token
    #          yields an empty Hash and does not add anything to the index
    # Regexp:: matched against every indexed token; the positions of all
    #          matching tokens are merged per document, in ascending order,
    #          into arrays that are independent of the index
    #
    # Raises TypeError for any other kind of +token+.
    def doc_nums_with_positions(token)
      case token
      when String
        key = mangle_token(token)
        # Avoid index[key] for unknown keys: the default proc would store an
        # empty entry for every token that was ever searched for.
        index.key?(key) ? index[key] : {}
      when Regexp
        merged = {}

        index.each { |key, postings|
          next unless key =~ token

          postings.each { |doc_num, positions|
            merged[doc_num] = ((merged[doc_num] || []) | positions).sort
          }
        }

        merged
      else
        raise TypeError, "expected String or Regexp, got #{token.class}"
      end
    end

    alias_method :results_with_positions, :doc_nums_with_positions

    # Returns the numbers of all documents containing +token+.
    def doc_nums(token)
      doc_nums_with_positions(token).keys
    end

    alias_method :results, :doc_nums

    # Without an argument, returns all documents; otherwise the documents
    # for the given document numbers (in the given order).
    #
    # The documents are collected from the input on first use and cached.
    def documents(doc_nums = default = Object.new)
      @documents ||= get_documents

      # +default+ is only assigned (and thus truthy) when the caller passed
      # no argument at all.
      default ? @documents : doc_nums.map { |doc_num| @documents[doc_num] }
    end

    alias_method :matches, :documents

    # Returns the single document with number +doc_num+.
    def doc(doc_num)
      documents([doc_num]).first
    end

    alias_method :[], :doc

    private

    # Walks the input once, numbering documents and recording the position
    # of every token.
    def build_index
      @documents, @entries = nil, []
      doc_num = -1

      each_document { |doc|
        @entries << (doc_num += 1)
        pos = -1

        doc.scan(TOKEN_RE) { |token|
          index[mangle_token(token)][doc_num] << (pos += 1)
        }
      }
    end

    # Re-reads the input (rewinding it first where supported) and returns
    # its documents as an Array.
    def get_documents
      input.rewind if input.respond_to?(:rewind)

      docs = []
      each_document { |doc| docs << doc }
      docs
    end

    # Yields every document of the input, preferring +each+ and falling
    # back to +each_line+.
    def each_document(&block)
      if input.respond_to?(:each)
        input.each(&block)
      else
        input.each_line(&block)
      end
    end

    # Normalizes a token for storage and lookup.
    def mangle_token(token)
      token.downcase
    end

  end

end
|
data/lib/pms/proxy.rb
ADDED
@@ -0,0 +1,133 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of pms, Poor Man's Search. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2008 Jens Wille #
|
7
|
+
# #
|
8
|
+
# Authors: #
|
9
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
10
|
+
# #
|
11
|
+
# pms is free software; you can redistribute it and/or modify it under the #
|
12
|
+
# terms of the GNU General Public License as published by the Free Software #
|
13
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
14
|
+
# version. #
|
15
|
+
# #
|
16
|
+
# pms is distributed in the hope that it will be useful, but WITHOUT ANY #
|
17
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
18
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
19
|
+
# details. #
|
20
|
+
# #
|
21
|
+
# You should have received a copy of the GNU General Public License along #
|
22
|
+
# with pms. If not, see <http://www.gnu.org/licenses/>. #
|
23
|
+
# #
|
24
|
+
###############################################################################
|
25
|
+
#++
|
26
|
+
|
27
|
+
class PMS

  # A chainable query over a PMS instance. It starts out with all of the
  # PMS's results and narrows or widens them with boolean operators; every
  # operator returns the proxy itself.
  class Proxy

    # pms::     the PMS instance being queried
    # index::   that instance's index
    # results:: the current list of matching document numbers
    attr_reader :pms, :index, :results

    def initialize(pms)
      @pms     = pms
      @index   = pms.index
      @results = pms.results
    end

    # Keeps only results that also match +token+ or, without a token, the
    # sub-query built by the block (which receives the PMS instance and
    # must return a Proxy).
    def and(token = nil)
      token ? apply_operator_with_token('and', token) :
              apply_operator_with_block('and') { |*a| yield(*a) }
    end

    # Adds the results for +token+ (or for the block's sub-query).
    def or(token = nil)
      token ? apply_operator_with_token('or', token) :
              apply_operator_with_block('or') { |*a| yield(*a) }
    end

    # Removes the results for +token+ (or for the block's sub-query).
    def not(token = nil)
      token ? apply_operator_with_token('not', token) :
              apply_operator_with_block('not') { |*a| yield(*a) }
    end

    # The documents belonging to the current results.
    def matches
      index.matches(results)
    end

    private

    def apply_operator_with_token(op, token)
      apply_operator(op, index.results(token))
    end

    # Runs the sub-query block and applies its results. Raises a
    # RuntimeError if the block does not return a Proxy.
    def apply_operator_with_block(op)
      case sub = yield(pms)
      when Proxy
        apply_operator(op, sub.results)
      else
        raise "sub-query must return a PMS::Proxy object (got #{sub.class})"
      end
    end

    # Combines the current results with +doc_nums+ according to +op+
    # ('and', 'or' or 'not', case-insensitive). Raises ArgumentError for
    # any other operator.
    def apply_operator(op, doc_nums)
      case op = op.to_s.downcase
      when 'and'
        @results &= doc_nums
      when 'or'
        @results |= doc_nums
      when 'not'
        @results -= doc_nums
      else
        raise ArgumentError, "invalid operator '#{op}'"
      end

      self  # allow chaining!
    end

  end

  # A Proxy that starts from the results of a single token and therefore
  # knows that token's positions, which enables the proximity operators.
  class TokenProxy < Proxy

    # token::                  the token this proxy was created for
    # results_with_positions:: doc_num => positions of +token+
    attr_reader :token, :results_with_positions

    def initialize(pms, token)
      super(pms)

      @token = token

      @results_with_positions = index.results_with_positions(token)
      @results = @results_with_positions.keys
    end

    # Keeps only documents in which +token+ occurs at most +distance+
    # positions away from this proxy's own token.
    #
    # With +order+ set, +token+ must not occur before this proxy's token.
    # Occurrences that come too early are skipped individually; they no
    # longer abort the scan, so a later occurrence in the same document can
    # still satisfy the condition.
    def near(token, distance = 1, order = false)
      results1 = results_with_positions
      results2 = index.results_with_positions(token)

      doc_nums = (results1.keys & results2.keys).select { |doc_num|
        positions = results2[doc_num]

        results1[doc_num].any? { |pos1|
          positions.any? { |pos2|
            diff = pos2 - pos1
            diff.abs <= distance && !(order && diff < 0)
          }
        }
      }

      apply_operator('and', doc_nums)
    end

    # Like #near, but taking order into account.
    def adjacent(token, distance = 1)
      near(token, distance, true)
    end

  end

end
|