pms 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/ChangeLog ADDED
@@ -0,0 +1,5 @@
1
+ = Revision history for pms
2
+
3
+ == 0.0.1 [2008-12-06]
4
+
5
+ * Birthday :-)
data/README ADDED
@@ -0,0 +1,71 @@
1
+ = pms - Poor Man's Search
2
+
3
+ == VERSION
4
+
5
+ This documentation refers to pms version 0.0.1
6
+
7
+
8
+ == DESCRIPTION
9
+
10
+ Provides a simple searching facility for (nearly) arbitrary input. It allows
11
+ searching by Strings (exact match) or Regexps (full control, but slower).
12
+ Boolean operators AND, OR, and NOT are supported, as well as proximity
13
+ operators NEAR (with configurable distance) and ADJACENT (taking order into
14
+ account). You can chain operators and also group them into sub-queries.
15
+
16
+ Inspiration came from a discussion on ruby-talk, starting with message
17
+ ruby-talk:322014[http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/322014],
18
+ but PMS has a more document-centric attitude (everything that <tt>input#each</tt>
19
+ yields is considered a document of its own).
20
+
21
+ Example:
22
+
23
+ require 'pms/ext'
24
+
25
+ text = <<EOT
26
+ Hello world, how are you today? I said "Hello"
27
+ to the other guy but he would not answer although
28
+ all the world could hear me.
29
+ EOT
30
+
31
+ search = text.search('hello').near('world', 3)
32
+
33
+ p search.results
34
+ #=> [0]
35
+
36
+ p search.results_with_positions
37
+ #=> {0=>[0, 8]}
38
+
39
+ p search.matches
40
+ #=> ["Hello world, how are you today? I said \"Hello\"\n"]
41
+
42
+ You can find more examples in the <tt>spec/</tt> directory.
43
+
44
+
45
+ == LINKS
46
+
47
+ * <http://pms.rubyforge.org/>
48
+ * <http://rubyforge.org/projects/pms>
49
+ * <http://github.com/blackwinter/pms>
50
+
51
+
52
+ == AUTHORS
53
+
54
+ * Jens Wille <mailto:jens.wille@uni-koeln.de>
55
+
56
+
57
+ == LICENSE AND COPYRIGHT
58
+
59
+ Copyright (C) 2008 Jens Wille
60
+
61
+ pms is free software: you can redistribute it and/or modify it under the
62
+ terms of the GNU General Public License as published by the Free Software
63
+ Foundation, either version 3 of the License, or (at your option) any later
64
+ version.
65
+
66
+ pms is distributed in the hope that it will be useful, but WITHOUT ANY
67
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
68
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
69
+
70
+ You should have received a copy of the GNU General Public License along with
71
+ pms. If not, see <http://www.gnu.org/licenses/>.
data/Rakefile ADDED
@@ -0,0 +1,26 @@
1
# Load PMS::VERSION relative to this Rakefile rather than through the load
# path: the current directory is not on $LOAD_PATH in Ruby >= 1.9.2, and rake
# may be invoked from another working directory.
require File.expand_path('lib/pms/version', File.dirname(__FILE__))

begin
  require 'hen'

  # Project description handed to Hen (note the doubled braces: the block
  # returns a Hash literal).
  Hen.lay! {{
    :rubyforge => {
      :project  => %q{pms},
      :package  => %q{pms},
      :rdoc_dir => nil
    },

    :gem => {
      :version      => PMS::VERSION,
      :summary      => %q{Poor Man's Search},
      :homepage     => %q{http://pms.rubyforge.org/},
      :files        => FileList['lib/**/*.rb'].to_a,
      :extra_files  => FileList['[A-Z]*', 'spec/**/*.rb', 'test_data/**/*'].to_a,
      :dependencies => %w[]
    }
  }}
rescue LoadError
  abort "Please install the 'hen' gem first."
end
25
+
26
+ ### Place your custom Rake tasks here.
data/lib/pms.rb ADDED
@@ -0,0 +1,51 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # pms - Poor Man's Search. #
5
+ # #
6
+ # Copyright (C) 2008 Jens Wille #
7
+ # #
8
+ # Authors: #
9
+ # Jens Wille <jens.wille@uni-koeln.de> #
10
+ # #
11
+ # pms is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU General Public License as published by the Free Software #
13
+ # Foundation; either version 3 of the License, or (at your option) any later #
14
+ # version. #
15
+ # #
16
+ # pms is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
19
+ # details. #
20
+ # #
21
+ # You should have received a copy of the GNU General Public License along #
22
+ # with pms. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ require 'pms/index'
28
+ require 'pms/proxy'
29
+
30
# Front end of pms: holds an input together with the Index built from it and
# hands out query proxies.
class PMS

  # The object passed to ::new and the PMS::Index created for it.
  attr_reader :input, :index

  # Indexes +input+ immediately (see PMS::Index for its requirements).
  def initialize(input)
    @input, @index = input, Index.new(input)
  end

  # Starts a query.
  #
  # With a +token+, returns a TokenProxy for that token. Without one, the
  # given block is treated as a sub-query: it is AND-ed onto a fresh Proxy
  # (which starts out with all document numbers) and that Proxy is returned.
  def search(token = nil, &block)
    return TokenProxy.new(self, token) if token

    Proxy.new(self).and(&block)
  end

  # All document numbers known to the index.
  def results
    index.entries
  end

  # All documents of the input, as returned by Index#matches.
  def matches
    index.matches
  end

end
data/lib/pms/ext.rb ADDED
@@ -0,0 +1,49 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of pms, Poor Man's Search. #
5
+ # #
6
+ # Copyright (C) 2008 Jens Wille #
7
+ # #
8
+ # Authors: #
9
+ # Jens Wille <jens.wille@uni-koeln.de> #
10
+ # #
11
+ # pms is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU General Public License as published by the Free Software #
13
+ # Foundation; either version 3 of the License, or (at your option) any later #
14
+ # version. #
15
+ # #
16
+ # pms is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
19
+ # details. #
20
+ # #
21
+ # You should have received a copy of the GNU General Public License along #
22
+ # with pms. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ require 'pms'
28
+
29
# Convenience mixin that adds #search to the core classes listed in
# RECEIVERS, so that e.g. <tt>"some text".search('token')</tt> works.
module PMS::Ext

  # Classes that get #search mixed in when this file is loaded.
  RECEIVERS = [String, IO, Array].freeze

  # Wraps the receiver in a new PMS and starts a query on it. All arguments
  # and the block (used for sub-queries when no token is given) are passed on
  # to PMS#search; dropping the block here would make block-style queries
  # fail inside PMS#search.
  def search(*args, &block)
    PMS.new(self).search(*args, &block)
  end

  RECEIVERS.each { |klass| klass.send(:include, self) }

end
42
+
43
class File

  # Searches the contents of +file+, which is either a path or an object
  # responding to #path. Remaining arguments and the block are handed to
  # PMS#search; the resulting query proxy is returned.
  #
  # The file is only open for the duration of this call, whereas the index
  # loads its documents lazily (rewinding the input). The documents are
  # therefore loaded here, before the file is closed, so that #matches still
  # works on the returned proxy.
  def self.search(file, *args, &block)
    path = file.respond_to?(:path) ? file.path : file

    File.open(path) { |f|
      query = PMS.new(f).search(*args, &block)
      query.index.documents  # force loading while the file is still open
      query
    }
  end

end
data/lib/pms/index.rb ADDED
@@ -0,0 +1,112 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of pms, Poor Man's Search. #
5
+ # #
6
+ # Copyright (C) 2008 Jens Wille #
7
+ # #
8
+ # Authors: #
9
+ # Jens Wille <jens.wille@uni-koeln.de> #
10
+ # #
11
+ # pms is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU General Public License as published by the Free Software #
13
+ # Foundation; either version 3 of the License, or (at your option) any later #
14
+ # version. #
15
+ # #
16
+ # pms is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
19
+ # details. #
20
+ # #
21
+ # You should have received a copy of the GNU General Public License along #
22
+ # with pms. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
class PMS

  # Inverted index over the documents yielded by an input object.
  #
  # Every item the input yields is one document; documents are numbered from
  # 0 in the order they are yielded. For each (down-cased) token the index
  # records, per document number, the list of token positions (0-based count
  # of tokens within that document).
  class Index

    # A token is any run of word characters.
    TOKEN_RE = %r{\w+}

    # +input+:: the object being indexed
    # +index+::  Hash of token => { doc_num => [positions] }
    # +entries+:: Array of all document numbers
    attr_reader :input, :index, :entries

    # Builds the index for +input+.
    #
    # +input+ must respond to #each; as a fallback, objects that only respond
    # to #each_line are accepted too (a String no longer has #each in
    # Ruby >= 1.9). Raises ArgumentError otherwise.
    def initialize(input)
      unless input.respond_to?(:each) || input.respond_to?(:each_line)
        raise ArgumentError, "input must implement #each"
      end

      @input = input
      @index = Hash.new { |h, k| h[k] = Hash.new { |i, j| i[j] = [] } }

      build_index
    end

    # Returns a Hash of doc_num => [positions] for +token+.
    #
    # A String is matched exactly (after down-casing) against the indexed
    # tokens; a Regexp is matched against every indexed token and the
    # positions of all matching tokens are merged per document. Raises
    # TypeError for any other kind of +token+.
    def doc_nums_with_positions(token)
      case token
      when String
        key = mangle_token(token)

        # Check for the key first: a plain index[key] would trigger the
        # default block and store an empty entry for every unknown token
        # that is ever queried.
        index.key?(key) ? index[key] : {}
      when Regexp
        res = {}

        index.each { |key, value|
          res.update(value) { |_, old, new| old | new } if key =~ token
        }

        res
      else
        raise TypeError, "expected String or Regexp, got #{token.class}"
      end
    end

    alias_method :results_with_positions, :doc_nums_with_positions

    # Returns the numbers of all documents containing +token+.
    def doc_nums(token)
      doc_nums_with_positions(token).keys
    end

    alias_method :results, :doc_nums

    # Returns the documents with the given numbers, or all documents when
    # called without an argument (or with +nil+). Documents are read from the
    # input on first use and cached afterwards.
    def documents(doc_nums = nil)
      @documents ||= get_documents
      doc_nums.nil? ? @documents : doc_nums.map { |doc_num| @documents[doc_num] }
    end

    alias_method :matches, :documents

    # Returns the single document numbered +doc_num+.
    def doc(doc_num)
      documents([doc_num]).first
    end

    alias_method :[], :doc

    private

    # Walks the input once, assigning document numbers and recording the
    # position of every token.
    def build_index
      @documents, @entries = nil, []

      each_document { |doc|
        doc_num = @entries.size
        @entries << doc_num

        pos = -1

        doc.scan(TOKEN_RE) { |token|
          index[mangle_token(token)][doc_num] << (pos += 1)
        }
      }
    end

    # Collects all documents, rewinding the input first where supported
    # (it has already been consumed once by build_index).
    def get_documents
      input.rewind if input.respond_to?(:rewind)

      docs = []
      each_document { |doc| docs << doc }
      docs
    end

    # Iterates over the input's documents via #each, or via #each_line for
    # inputs that do not provide #each.
    def each_document(&block)
      input.respond_to?(:each) ? input.each(&block) : input.each_line(&block)
    end

    # Normalizes a token for indexing and lookup (case-insensitive matching).
    def mangle_token(token)
      token.downcase
    end

  end

end
data/lib/pms/proxy.rb ADDED
@@ -0,0 +1,133 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of pms, Poor Man's Search. #
5
+ # #
6
+ # Copyright (C) 2008 Jens Wille #
7
+ # #
8
+ # Authors: #
9
+ # Jens Wille <jens.wille@uni-koeln.de> #
10
+ # #
11
+ # pms is free software; you can redistribute it and/or modify it under the #
12
+ # terms of the GNU General Public License as published by the Free Software #
13
+ # Foundation; either version 3 of the License, or (at your option) any later #
14
+ # version. #
15
+ # #
16
+ # pms is distributed in the hope that it will be useful, but WITHOUT ANY #
17
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
18
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
19
+ # details. #
20
+ # #
21
+ # You should have received a copy of the GNU General Public License along #
22
+ # with pms. If not, see <http://www.gnu.org/licenses/>. #
23
+ # #
24
+ ###############################################################################
25
+ #++
26
+
27
+ class PMS
28
+
29
+ class Proxy
30
+
31
+ attr_reader :pms, :index, :results
32
+
33
+ def initialize(pms)
34
+ @pms = pms
35
+ @index = pms.index
36
+ @results = pms.results
37
+ end
38
+
39
+ def and(token = nil)
40
+ token ? apply_operator_with_token('and', token) :
41
+ apply_operator_with_block('and') { |*a| yield(*a) }
42
+ end
43
+
44
+ def or(token = nil)
45
+ token ? apply_operator_with_token('or', token) :
46
+ apply_operator_with_block('or') { |*a| yield(*a) }
47
+ end
48
+
49
+ def not(token = nil)
50
+ token ? apply_operator_with_token('not', token) :
51
+ apply_operator_with_block('not') { |*a| yield(*a) }
52
+ end
53
+
54
+ def matches
55
+ index.matches(results)
56
+ end
57
+
58
+ private
59
+
60
+ def apply_operator_with_token(op, token)
61
+ apply_operator(op, index.results(token))
62
+ end
63
+
64
+ def apply_operator_with_block(op)
65
+ case sub = yield(pms)
66
+ when Proxy
67
+ apply_operator(op, sub.results)
68
+ else
69
+ raise "sub-query must return a PMS::Proxy object (got #{sub.class})"
70
+ end
71
+ end
72
+
73
+ def apply_operator(op, doc_nums)
74
+ case op = op.to_s.downcase
75
+ when 'and'
76
+ @results &= doc_nums
77
+ when 'or'
78
+ @results |= doc_nums
79
+ when 'not'
80
+ @results -= doc_nums
81
+ else
82
+ raise ArgumentError, "invalid operator '#{op}'"
83
+ end
84
+
85
+ self # allow chaining!
86
+ end
87
+
88
+ end
89
+
90
+ class TokenProxy < Proxy
91
+
92
+ attr_reader :token, :results_with_positions
93
+
94
+ def initialize(pms, token)
95
+ super(pms)
96
+
97
+ @token = token
98
+
99
+ @results_with_positions = index.results_with_positions(token)
100
+ @results = @results_with_positions.keys
101
+ end
102
+
103
+ def near(token, distance = 1, order = false)
104
+ results1 = results_with_positions
105
+ results2 = index.results_with_positions(token)
106
+
107
+ doc_nums = results1.keys & results2.keys
108
+
109
+ # TODO: i'm sure this can be simplified...
110
+ doc_nums.delete_if { |doc_num|
111
+ positions = results2[doc_num]
112
+
113
+ !results1[doc_num].any? { |pos1|
114
+ positions.find { |pos2|
115
+ diff = pos2 - pos1
116
+
117
+ break if order && diff < 0
118
+
119
+ diff.abs <= distance
120
+ }
121
+ }
122
+ }
123
+
124
+ apply_operator('and', doc_nums)
125
+ end
126
+
127
+ def adjacent(token, distance = 1)
128
+ near(token, distance, true)
129
+ end
130
+
131
+ end
132
+
133
+ end