keyword_prospector 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,69 @@
1
+ #
2
+ # (C) 2008 Los Angeles Times
3
+ #
4
+ require File.dirname(__FILE__) + '/spec_helper'
5
+ require 'hyperlink_strategy'
6
+
7
+ describe HyperlinkStrategy do
8
+ before :each do
9
+ @strategy = HyperlinkStrategy.new
10
+ end
11
+
12
+ it "Should create hyperlinks to the provided URL" do
13
+ @strategy.url="http://travel.latimes.com"
14
+
15
+ @strategy.decorate("Foo").should == "<a href=\"#{@strategy.url}\">Foo</a>"
16
+ end
17
+
18
+ it "should accept url and options in the constructor" do
19
+ tmp = HyperlinkStrategy.new(:url, :foo => :bar)
20
+ tmp.url.should == :url
21
+ tmp.options[:foo].should == :bar
22
+ end
23
+
24
+ it "should accept options to specify html attributes" do
25
+ @strategy.options = {:title => "foo title", :style => "hidden;"}
26
+ @strategy.url = 'foourl'
27
+
28
+ linked_text = @strategy.decorate("Foo")
29
+ linked_text.should match(%r{<a .*>Foo</a>})
30
+ linked_text.should match(%r{href="foourl"})
31
+ linked_text.should match(%r{title="foo title"})
32
+ linked_text.should match(%r{style="hidden;"})
33
+ end
34
+
35
+ describe "keywords" do
36
+ it "should allow setting and retrieving keywords" do
37
+ keywords = %w{a b c d e}
38
+ @strategy.keywords = keywords
39
+
40
+ @strategy.keywords.should == Set.new(keywords)
41
+ end
42
+
43
+ it "should allow comma-separated strings for setting" do
44
+ @strategy.keywords = "foo", "bar", "baz"
45
+ @strategy.keywords.should == Set.new(["foo", "bar", "baz"])
46
+ end
47
+
48
+ it "should allow a single string for setting" do
49
+ @strategy.keywords = "xyzzy"
50
+ @strategy.keywords.should == Set.new(["xyzzy"])
51
+ end
52
+ end
53
+
54
+ describe "add_keyword" do
55
+ it "should add keywords to empty set" do
56
+ @strategy.keywords.should == Set.new
57
+
58
+ @strategy.add_keyword("foo").keywords.should == Set.new("foo")
59
+ end
60
+
61
+ it "should add keywords to existing set" do
62
+ keywords = %w{foo, bar, baz}
63
+ @strategy.keywords = keywords
64
+
65
+ @strategy.add_keyword("xyzzy").keywords.should ==
66
+ Set.new(keywords + ["xyzzy"])
67
+ end
68
+ end
69
+ end
@@ -0,0 +1,226 @@
1
+ #
2
+ # (C) 2008 Los Angeles Times
3
+ #
4
+ require File.dirname(__FILE__) + '/spec_helper'
5
+ require 'keyword_linker'
6
+
7
+ describe KeywordLinker do
8
+ before(:each) do
9
+ @kl = KeywordLinker.new
10
+ end
11
+
12
+ describe :add_url do
13
+ it "should accept a string (url) and a single keyword" do
14
+ @kl.add_url("url", "keyword")
15
+ end
16
+
17
+ it "should accept a string (url) and an array of keywords" do
18
+ @kl.add_url("url", ["keyword1", "keyword2"])
19
+ end
20
+
21
+ it "should accept options for html attributes" do
22
+ @kl.add_url("url", "keyword", :class => "awesome")
23
+
24
+ linked_text = @kl.link_text("keyword")
25
+
26
+ linked_text.should match(%r{^<a .*>keyword</a>$})
27
+ linked_text.should match(%r{href="url"})
28
+ linked_text.should match(%r{class="awesome"})
29
+ end
30
+ end
31
+
32
+ describe :link_text do
33
+ it "should init_tree if I forget to" do
34
+ @kl.add_url("url", "foo")
35
+
36
+ @kl.link_text("Is there a foo in the house?").should ==
37
+ "Is there a <a href=\"url\">foo</a> in the house?"
38
+ end
39
+
40
+ it "should return original text when there are no matches" do
41
+ @kl.add_url("url", "foo")
42
+ @kl.init_tree
43
+
44
+ orig_text = "Is there a bar in the house?"
45
+ @kl.link_text(orig_text).should == orig_text
46
+ end
47
+
48
+ it "should return linked text when URL's are provided with keyword" do
49
+ @kl.add_url("url", "foo")
50
+ @kl.init_tree
51
+
52
+ @kl.link_text("Is there a foo in the house?").should ==
53
+ "Is there a <a href=\"url\">foo</a> in the house?"
54
+ end
55
+
56
+ it "should return linked text when URL's are provided with keyword array" do
57
+ @kl.add_url("url", %w{foo bar baz})
58
+ @kl.init_tree
59
+
60
+ @kl.link_text("pool bar party").should == "pool <a href=\"url\">bar</a> party"
61
+ end
62
+
63
+ it "should link correctly at the beginning of the text" do
64
+ @kl.add_url("url", "foo")
65
+
66
+ @kl.link_text("foo is the word").should == '<a href="url">foo</a> is the word'
67
+ end
68
+
69
+ it "should link correctly at the end of the text" do
70
+ @kl.add_url("url", "foo")
71
+
72
+ @kl.link_text("the word is foo").should == 'the word is <a href="url">foo</a>'
73
+ end
74
+
75
+ it "should perform multiple links in the text" do
76
+ @kl.add_url("url1", "foo")
77
+ @kl.add_url("url2", "bar")
78
+
79
+ @kl.link_text("the foo and the bar are awesome").should ==
80
+ 'the <a href="url1">foo</a> and the <a href="url2">bar</a> are awesome'
81
+ end
82
+
83
+ it "should link only the first instance of each keyword" do
84
+ @kl.add_url("url", "foo")
85
+
86
+ @kl.link_text("foo, foo, or foo?").should == '<a href="url">foo</a>, foo, or foo?'
87
+ end
88
+
89
+ it "should link only the first instance of each keyword in separate text elements" do
90
+ @kl.add_url("url", "foo")
91
+
92
+ @kl.link_text("<i>foo</i>, <b>foo</b>, or <u>foo</u>?").should == '<i><a href="url">foo</a></i>, <b>foo</b>, or <u>foo</u>?'
93
+ end
94
+
95
+ it "should link only the first instance of each url" do
96
+ @kl.add_url("url", %w[foo bar baz])
97
+
98
+ @kl.link_text("bar, baz, or foo?").should == '<a href="url">bar</a>, baz, or foo?'
99
+ end
100
+
101
+ it "should link longest match in overlapping text" do
102
+ @kl.add_url("url", ["foo bar", "bar baz xyzzy"])
103
+
104
+ @kl.link_text("foo bar baz xyzzy").should == 'foo <a href="url">bar baz xyzzy</a>'
105
+ end
106
+
107
+ describe "with another KeywordLinker in the constructor" do
108
+ before(:each) do
109
+ @combo_linker = KeywordLinker.new(@kl)
110
+ end
111
+
112
+ it "should link keywords from both linkers" do
113
+ @kl.add_url("foourl", "foo")
114
+ @combo_linker.add_url("barurl", "bar")
115
+
116
+ @combo_linker.link_text("foo bar").should == '<a href="foourl">foo</a> <a href="barurl">bar</a>'
117
+ end
118
+
119
+ it "should prioritize its own keywords over the parent's keywords" do
120
+ @kl.add_url("foourl1", "foo")
121
+ @combo_linker.add_url("foourl2", "foo")
122
+
123
+ @combo_linker.link_text("foo").should == '<a href="foourl2">foo</a>'
124
+ end
125
+ end
126
+
127
+ describe "with an array of KeywordLinkers as parents in the constructor" do
128
+ before(:each) do
129
+ @kl2 = KeywordLinker.new
130
+ @combo_linker = KeywordLinker.new([@kl, @kl2])
131
+ end
132
+
133
+ it "should link keywords from all linkers" do
134
+ @kl.add_url("foourl", "foo")
135
+ @kl2.add_url("barurl", "bar")
136
+ @combo_linker.add_url("bazurl", "baz")
137
+
138
+ @combo_linker.link_text("foo bar baz").should == '<a href="foourl">foo</a> <a href="barurl">bar</a> <a href="bazurl">baz</a>'
139
+ end
140
+
141
+ it "should prioritize its own keywords over the parents' keywords" do
142
+ @kl.add_url("foourl1", "foo")
143
+ @kl2.add_url("foourl2", "foo")
144
+ @combo_linker.add_url("foourl3", "foo")
145
+
146
+ @combo_linker.link_text("foo").should == '<a href="foourl3">foo</a>'
147
+ end
148
+ end
149
+
150
+ describe "with an array of KeywordLinkers as lookups in the constructor" do
151
+ before(:each) do
152
+ @kl2 = KeywordLinker.new
153
+ @combo_linker = KeywordLinker.new([@kl, @kl2])
154
+ end
155
+
156
+ it "should link keywords from all lookups" do
157
+ @kl.add_url("foourl", "foo")
158
+ @kl2.add_url("barurl", "bar")
159
+
160
+ @combo_linker.link_text("foo bar baz").should == '<a href="foourl">foo</a> <a href="barurl">bar</a> baz'
161
+ end
162
+ end
163
+
164
+ describe "with an arbitrary lookup object in the constructor" do
165
+ it "should provide results from the lookup object" do
166
+ lookup = mock(Object)
167
+ lookup.should_receive(:process).with(:text).and_return([:result])
168
+ kl = KeywordLinker.new(lookup)
169
+ kl.process(:text).should == [:result]
170
+ end
171
+
172
+ it "should reject objects from the constructor if they don't have a process method" do
173
+ lookup = mock(Object)
174
+ lambda{KeywordLinker.new(nil, lookup)}.should raise_error(ArgumentError)
175
+ end
176
+ end
177
+
178
+ describe "with multiple level hierarchy" do
179
+ before(:each) do
180
+ @kl2 = KeywordLinker.new(@kl)
181
+ @kl3 = KeywordLinker.new(@kl2)
182
+ @combo_linker = KeywordLinker.new(@kl3)
183
+ end
184
+
185
+ it "should link keywords from all linkers" do
186
+ @kl.add_url("foourl", "foo")
187
+ @kl2.add_url("barurl", "bar")
188
+ @kl3.add_url("bazurl", "baz")
189
+ @combo_linker.add_url("xyzzyurl", "xyzzy")
190
+
191
+ @combo_linker.link_text("foo bar baz xyzzy").should == '<a href="foourl">foo</a> <a href="barurl">bar</a> <a href="bazurl">baz</a> <a href="xyzzyurl">xyzzy</a>'
192
+ end
193
+ end
194
+ end
195
+
196
+ describe "linking html text" do
197
+ it "should skip linking inside tag attributes" do
198
+ @kl.add_url("url", "foo")
199
+
200
+ @kl.link_text('<td title="another foo for you">foo</td>').should ==
201
+ '<td title="another foo for you"><a href="url">foo</a></td>'
202
+ end
203
+
204
+ it "should not link inside of <a></a> tags" do
205
+ @kl.add_url("url", "foo")
206
+
207
+ @kl.link_text('<a href="bar">baz foo bar</a> and foo').should ==
208
+ '<a href="bar">baz foo bar</a> and <a href="url">foo</a>'
209
+ end
210
+
211
+ it "shouldn't choke on bogus etags" do
212
+ @kl.add_url("url", "foo")
213
+
214
+ lambda{@kl.link_text('foo </i>')}.should_not raise_error
215
+ end
216
+ end
217
+
218
+ describe "blacklisting keywords" do
219
+ it "should stop linking of every occurrence of the keyword" do
220
+ @kl.add_url("url", "Los Angeles")
221
+ @kl.blacklist_keyword("Los Angeles Times")
222
+
223
+ @kl.link_text("Los Angeles Times Los Angeles Times").should == "Los Angeles Times Los Angeles Times"
224
+ end
225
+ end
226
+ end
@@ -0,0 +1,232 @@
1
+ #
2
+ # (C) 2008 Los Angeles Times
3
+ #
4
+ require File.dirname(__FILE__) + '/spec_helper'
5
+ require 'set'
6
+
7
+ describe KeywordProspector do
8
+ it "should match keywords in text, respecting word boundaries" do
9
+ dl = KeywordProspector.new
10
+
11
+ dl.add('test')
12
+ dl.add('fido')
13
+ dl.add('te')
14
+ dl.add('fi')
15
+ dl.add('dot')
16
+ dl.add('dots')
17
+ dl.add('sis')
18
+
19
+ dl.construct_fail
20
+
21
+ matches = []
22
+ dl.process('hello fido this is a test') {|x| matches << x}
23
+ matches.size.should == 2
24
+ (matches.collect{|match| match.keyword} & %w{test fido}).
25
+ size.should == 2
26
+ end
27
+
28
+ it "should give correct location for a single match within a string" do
29
+ dl = KeywordProspector.new(["foo"])
30
+
31
+ match = nil
32
+ dl.process("A foo and his money are soon parted") {|x| match = x}
33
+
34
+ match.start_idx.should == 2
35
+ match.end_idx.should == 5
36
+ end
37
+
38
+ it "should give correct location for the second match within a string" do
39
+ dl = KeywordProspector.new(["foo", "bar"])
40
+
41
+ match = []
42
+ dl.process("foo bar") {|x| match << x}
43
+
44
+ match
45
+ match[0].keyword.should == "foo"
46
+ match[0].start_idx.should == 0
47
+ match[0].end_idx.should == 3
48
+ match[1].keyword.should == "bar"
49
+ match[1].start_idx.should == 4
50
+ match[1].end_idx.should == 7
51
+ end
52
+
53
+ it "Should include information about where the match is present in the string" do
54
+ dl = KeywordProspector.new %w{foo oo bar baz xyzzy thud}
55
+
56
+ matches = {}
57
+
58
+ dl.process('foo, bar, xyzzy and also the baz') {|x| matches[x.keyword] = x}
59
+
60
+ matches["foo"].should_not be_nil
61
+ matches["bar"].should_not be_nil
62
+ matches["baz"].should_not be_nil
63
+ matches["xyzzy"].should_not be_nil
64
+
65
+ matches["foo"].start_idx.should == 0
66
+ matches["foo"].end_idx.should == 3
67
+ matches["bar"].start_idx.should == 5
68
+ matches["bar"].end_idx.should == 8
69
+ matches["xyzzy"].start_idx.should == 10
70
+ matches["xyzzy"].end_idx.should == 15
71
+ matches["baz"].start_idx.should == 29
72
+ matches["baz"].end_idx.should == 32
73
+ end
74
+
75
+ it "should match a single word to itself" do
76
+ dl = KeywordProspector.new(["foo"])
77
+ count = 0
78
+ dl.process("foo"){count += 1}
79
+ count.should == 1
80
+ end
81
+
82
+ it "should not match a single word to a different word" do
83
+ dl = KeywordProspector.new(["foo"])
84
+ count = 0
85
+ dl.process("bar"){count += 1}
86
+ count.should == 0
87
+ end
88
+
89
+ it "should call the block once for every match" do
90
+ dl = KeywordProspector.new(["foo"])
91
+ count = 0
92
+ dl.process("foo foo foo"){count += 1}
93
+ count.should == 3
94
+ end
95
+
96
+ it "Should get correct start and end matches with overlapping matches" do
97
+ keywords = ['Sling Blade', 'Blade Runner', 'foo', 'bar']
98
+ dl = KeywordProspector.new(keywords)
99
+ candidate = 'Sling Blade Runner foo bar'
100
+ matches = {}
101
+ dl.process(candidate) {|x| matches[x.keyword] = x}
102
+
103
+ keywords.each do |keyword|
104
+ matches[keyword].start_idx.should == candidate.index(keyword)
105
+ matches[keyword].end_idx.should == candidate.index(keyword) +
106
+ keyword.length
107
+ end
108
+ end
109
+
110
+ it "returns a sorted array of matches when a block is not given" do
111
+ keywords = %w{foo bar baz xyzzy thud}
112
+
113
+ dl = KeywordProspector.new(keywords)
114
+ results =
115
+ dl.process("The best metavariables are thud, xyzzy, and of course foo.")
116
+
117
+ results.class.should == Array
118
+
119
+ results.should == results.sort
120
+ end
121
+
122
+ it "filters out shorter matches multiple matches overlap" do
123
+ dl = KeywordProspector.new(["a b c", "c d", "e f", "f g h", "i j k l m",
124
+ "k l m n o p q"])
125
+
126
+ results = dl.process("a b c d, e f g h, i j k l m n o p q",
127
+ :filter_overlaps => true)
128
+
129
+ results.size.should == 3
130
+
131
+ results[0].keyword.should == 'a b c'
132
+ results[1].keyword.should == 'f g h'
133
+ results[2].keyword.should == 'k l m n o p q'
134
+ end
135
+
136
+ it "detects word chars" do
137
+ KeywordProspector.word_char?(?a).should be_true
138
+ KeywordProspector.word_char?(?k).should be_true
139
+ KeywordProspector.word_char?(?z).should be_true
140
+ KeywordProspector.word_char?(?A).should be_true
141
+ KeywordProspector.word_char?(?K).should be_true
142
+ KeywordProspector.word_char?(?Z).should be_true
143
+ KeywordProspector.word_char?(?0).should be_true
144
+ KeywordProspector.word_char?(?7).should be_true
145
+ KeywordProspector.word_char?(?9).should be_true
146
+ KeywordProspector.word_char?(?_).should be_true
147
+ end
148
+
149
+ it "detects non-word chars" do
150
+ KeywordProspector.word_char?(?-).should be_false
151
+ KeywordProspector.word_char?(?>).should be_false
152
+ KeywordProspector.word_char?(?<).should be_false
153
+ KeywordProspector.word_char?(?.).should be_false
154
+ KeywordProspector.word_char?(32).should be_false
155
+ KeywordProspector.word_char?(9).should be_false
156
+ end
157
+
158
+ it "word_delimiter? is opposite of word_char?" do
159
+ KeywordProspector.word_delimiter?(?.).should be_true
160
+ KeywordProspector.word_delimiter?(32).should be_true
161
+ KeywordProspector.word_delimiter?(?K).should be_false
162
+ end
163
+
164
+ describe "word boundary detection" do
165
+ before(:each) do
166
+ keywords = %w{foo bar baz xyzzy thud}
167
+ @dl = KeywordProspector.new(keywords)
168
+ end
169
+
170
+ describe "allows" do
171
+ it "matching at beginning of string" do
172
+ results = @dl.process("foo is the word")
173
+
174
+ results.size.should == 1
175
+ results[0].keyword.should == "foo"
176
+ end
177
+
178
+ it "matching at end of string" do
179
+ results = @dl.process("the word is bar")
180
+
181
+ results.size.should == 1
182
+ results[0].keyword.should == "bar"
183
+ end
184
+ end
185
+
186
+ describe "doesn't allow" do
187
+ it "matches not starting on a word boundary" do
188
+ results = @dl.process("topaz is a gem but tobaz is not")
189
+ results.size.should == 0
190
+ end
191
+
192
+ it "matches not ending on a word boundary" do
193
+ results = @dl.process("are you xyzzypated?")
194
+ results.size.should == 0
195
+ end
196
+
197
+ it "matches at the beginning of the string and not ending on a word boundary" do
198
+ results = @dl.process("fooby you too?")
199
+ results.size.should == 0
200
+ end
201
+
202
+ it "multiple candidate matches in various places" do
203
+ results = @dl.process("fooby barby bazby tofoo tobar tobaz ambazbafoo")
204
+ results.size.should == 0
205
+ end
206
+ end
207
+ end
208
+
209
+ describe "with decoration strategy objects" do
210
+ it "should read keywords from the object" do
211
+ strategy = Object.new
212
+ strategy.should_receive(:keywords).and_return(Set.new(%w{foo bar baz}))
213
+
214
+ dl = KeywordProspector.new([strategy])
215
+ end
216
+
217
+ it "should return strategy objects in results" do
218
+ strategy = Object.new
219
+ strategy.should_receive(:keywords).and_return(Set.new(%w{foo bar baz}))
220
+
221
+ dl = KeywordProspector.new([strategy])
222
+
223
+ results = dl.process("foo, bar, and baz")
224
+
225
+ results.size.should == 3
226
+
227
+ results.each do |result|
228
+ result.output.should == strategy
229
+ end
230
+ end
231
+ end
232
+ end
@@ -0,0 +1,104 @@
1
+ #
2
+ # (C) 2008 Los Angeles Times
3
+ #
4
+ require File.dirname(__FILE__) + '/spec_helper'
5
+ require 'lookup_chain'
6
+ require 'match'
7
+
8
+ describe LookupChain do
9
+ before(:each) do
10
+ @dl1 = mock(Object, :process => :dummy_method)
11
+ @dl2 = mock(Object, :process => :dummy_method)
12
+ end
13
+
14
+ describe :initialize do
15
+ describe "should check all objects for a process method" do
16
+ it "when given an array of objects" do
17
+ lambda {LookupChain.new([@dl1])}.should_not raise_error
18
+ lambda {LookupChain.new([Object.new])}.should raise_error(ArgumentError)
19
+ end
20
+
21
+ it "when given multiple objects in constructor" do
22
+ lambda {LookupChain.new(@dl1, @dl2)}.should_not raise_error
23
+ lambda {LookupChain.new(@dl1, Object.new)}.should raise_error(ArgumentError)
24
+ end
25
+ end
26
+ end
27
+
28
+ describe :<< do
29
+ it "should check for a process method" do
30
+ lambda{LookupChain.new << @dl1}.should_not raise_error
31
+ lambda{LookupChain.new << Object.new}.should raise_error(ArgumentError)
32
+ end
33
+
34
+ it "should add to the end of the list of lookups" do
35
+ lc = LookupChain.new
36
+ lc.lookups.should == []
37
+
38
+ lc << @dl1
39
+ lc.lookups.should == [@dl1]
40
+
41
+ lc << @dl2
42
+ lc.lookups.should == [@dl1, @dl2]
43
+ end
44
+ end
45
+
46
+ describe :lookups do
47
+ it "should return an array of lookup objects assigned in constructor" do
48
+ LookupChain.new(@dl1, @dl2).lookups.should == [@dl1, @dl2]
49
+ LookupChain.new([@dl2, @dl1]).lookups.should == [@dl2, @dl1]
50
+ end
51
+ end
52
+
53
+ describe :process do
54
+ it "should call process on all child objects" do
55
+ @dl1.should_receive(:process).with(:text).and_return([])
56
+ @dl2.should_receive(:process).with(:text).and_return([])
57
+ lc = LookupChain.new(@dl1, @dl2)
58
+
59
+ lc.process(:text)
60
+ end
61
+
62
+ it "should return a sorted list of match objects from all lookups" do
63
+ match1 = Match.new("match1", 0, 3)
64
+ match2 = Match.new("match2", 5, 7)
65
+ match3 = Match.new("match3", 11, 13)
66
+ match4 = Match.new("match4", 19, 31)
67
+
68
+ @dl1.should_receive(:process).with(:text).and_return([match1, match4])
69
+ @dl2.should_receive(:process).with(:text).and_return([match2, match3])
70
+
71
+ lc = LookupChain.new(@dl1, @dl2)
72
+
73
+ lc.process(:text).should == [match1, match2, match3, match4]
74
+ end
75
+
76
+ it "should return the longest match when there are overlaps, regardless of priority order" do
77
+ match1 = Match.new("match1", 0, 3)
78
+ match2 = Match.new("match2", 1, 7)
79
+
80
+ @dl1.stub!(:process).with(:text).and_return([match1])
81
+ @dl2.stub!(:process).with(:text).and_return([match2])
82
+
83
+ lc = LookupChain.new(@dl1, @dl2)
84
+ lc.process(:text).should == [match2]
85
+
86
+ lc = LookupChain.new(@dl2, @dl1)
87
+ lc.process(:text).should == [match2]
88
+ end
89
+
90
+ it "should prioritize the first lookup object in the list when there are overlapping matches of equal length" do
91
+ match1 = Match.new("match1", 0, 3)
92
+ match2 = Match.new("match2", 1, 4)
93
+
94
+ @dl1.stub!(:process).with(:text).and_return([match1])
95
+ @dl2.stub!(:process).with(:text).and_return([match2])
96
+
97
+ lc = LookupChain.new(@dl1, @dl2)
98
+ lc.process(:text).should == [match1]
99
+
100
+ lc = LookupChain.new(@dl2, @dl1)
101
+ lc.process(:text).should == [match2]
102
+ end
103
+ end
104
+ end