keyword_prospector 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +4 -0
- data/License.txt +20 -0
- data/Manifest.txt +40 -0
- data/README.txt +95 -0
- data/Rakefile +4 -0
- data/config/hoe.rb +73 -0
- data/config/requirements.rb +15 -0
- data/lib/hyperlink_strategy.rb +56 -0
- data/lib/keyword_decorator.rb +7 -0
- data/lib/keyword_linker.rb +174 -0
- data/lib/keyword_prospector.rb +171 -0
- data/lib/lookup_chain.rb +65 -0
- data/lib/match.rb +37 -0
- data/lib/profile.rb +72 -0
- data/lib/search_and_replace.rb +45 -0
- data/lib/state.rb +85 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/script/txt2html +82 -0
- data/setup.rb +1585 -0
- data/spec/hyperlink_strategy_spec.rb +69 -0
- data/spec/keyword_linker_spec.rb +226 -0
- data/spec/keyword_prospector_spec.rb +232 -0
- data/spec/lookup_chain_spec.rb +104 -0
- data/spec/match_spec.rb +140 -0
- data/spec/search_and_replace_spec.rb +58 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +10 -0
- data/spec/state_spec.rb +128 -0
- data/tasks/deployment.rake +34 -0
- data/tasks/environment.rake +7 -0
- data/tasks/rspec.rake +21 -0
- data/tasks/website.rake +17 -0
- data/website/index.html +141 -0
- data/website/index.txt +83 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +138 -0
- data/website/template.html.erb +48 -0
- metadata +107 -0
data/History.txt
ADDED
data/License.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2008 Los Angeles Times
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Manifest.txt
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
History.txt
|
2
|
+
License.txt
|
3
|
+
Manifest.txt
|
4
|
+
README.txt
|
5
|
+
Rakefile
|
6
|
+
config/hoe.rb
|
7
|
+
config/requirements.rb
|
8
|
+
lib/hyperlink_strategy.rb
|
9
|
+
lib/keyword_decorator.rb
|
10
|
+
lib/keyword_linker.rb
|
11
|
+
lib/keyword_prospector
|
12
|
+
lib/keyword_prospector.rb
|
13
|
+
lib/lookup_chain.rb
|
14
|
+
lib/match.rb
|
15
|
+
lib/profile.rb
|
16
|
+
lib/search_and_replace.rb
|
17
|
+
lib/state.rb
|
18
|
+
script/console
|
19
|
+
script/destroy
|
20
|
+
script/generate
|
21
|
+
script/txt2html
|
22
|
+
setup.rb
|
23
|
+
spec/hyperlink_strategy_spec.rb
|
24
|
+
spec/keyword_linker_spec.rb
|
25
|
+
spec/keyword_prospector_spec.rb
|
26
|
+
spec/lookup_chain_spec.rb
|
27
|
+
spec/match_spec.rb
|
28
|
+
spec/search_and_replace_spec.rb
|
29
|
+
spec/spec.opts
|
30
|
+
spec/spec_helper.rb
|
31
|
+
spec/state_spec.rb
|
32
|
+
tasks/deployment.rake
|
33
|
+
tasks/environment.rake
|
34
|
+
tasks/rspec.rake
|
35
|
+
tasks/website.rake
|
36
|
+
website/index.html
|
37
|
+
website/index.txt
|
38
|
+
website/javascripts/rounded_corners_lite.inc.js
|
39
|
+
website/stylesheets/screen.css
|
40
|
+
website/template.html.erb
|
data/README.txt
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
= keyword_prospector
|
2
|
+
|
3
|
+
* http://github.com/latimes/keyword_prospector
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
KeywordProspector is a gem for associating keywords in text with arbitrary
|
8
|
+
output objects. It uses an Aho-Corasick tree to provide matching that scales
|
9
|
+
linearly with the length of the text, regardless of the number of keywords.
|
10
|
+
|
11
|
+
== FEATURES:
|
12
|
+
|
13
|
+
The core KeywordProspector engine has the following properties:
|
14
|
+
* Once a tree is built, matching against the tree is O(n) where n is the length
|
15
|
+
of your text, regardless of how many keywords you have in the tree.
|
16
|
+
* Arbitrary output objects can be associated with any keyword or set of
|
17
|
+
keywords.
|
18
|
+
|
19
|
+
KeywordLinker can be used to create links to designated url's:
|
20
|
+
* You can specify a single keyword to associate with a url.
|
21
|
+
* You can specify an array of keywords to associate with a url.
|
22
|
+
* Each keyword or group of keywords will be linked only once to the url
|
23
|
+
provided.
|
24
|
+
* Hyperlinks are not created within existing hyperlinks.
|
25
|
+
* Hyperlinks are not generated anywhere they would be illegal in HTML, such as
|
26
|
+
within attribute values.
|
27
|
+
|
28
|
+
== SYNOPSIS:
|
29
|
+
|
30
|
+
Use KeywordLinker to create links in HTML text. KeywordLinker will link only
|
31
|
+
the first alternative that appears in text:
|
32
|
+
|
33
|
+
require 'keyword_linker'
|
34
|
+
|
35
|
+
linker = KeywordLinker.new
|
36
|
+
linker.add_url("http://www.latimes.com", ["L.A. Times", "Los Angeles Times"])
|
37
|
+
linker.link_text("'L.A. Times' or 'Los Angeles Times'?")
|
38
|
+
=> "'<a href=\"http://www.latimes.com\">L.A. Times</a>' or 'Los Angeles Times'?"
|
39
|
+
|
40
|
+
linker.link_text("'Los Angeles Times' or 'L.A. Times'?")
|
41
|
+
=> "'<a href=\"http://www.latimes.com\">Los Angeles Times</a>' or 'L.A. Times'?"
|
42
|
+
|
43
|
+
|
44
|
+
You can provide html options when adding url's:
|
45
|
+
|
46
|
+
linker = KeywordLinker.new
|
47
|
+
linker.add_url("http://www.latimes.com", "Los Angeles Times",
|
48
|
+
:title => "Visit the Los Angeles Times")
|
49
|
+
linker.add_url("http://www.google.com", ["Google", "The Google"],
|
50
|
+
:title => "Go check it out on The Google!")
|
51
|
+
|
52
|
+
linker.link_text("Do you prefer The Google or the Los Angeles Times?")
|
53
|
+
=> "Do you prefer <a href=\"http://www.google.com\" title=\"Go check it out on The Google!\">The Google</a> or the <a href=\"http://www.latimes.com\" title=\"Visit the Los Angeles Times\">Los Angeles Times</a>?"
|
54
|
+
|
55
|
+
== DEPENDENCIES:
|
56
|
+
|
57
|
+
* Hpricot
|
58
|
+
|
59
|
+
== INSTALL:
|
60
|
+
|
61
|
+
sudo gem install keyword_prospector
|
62
|
+
|
63
|
+
== AUTHOR:
|
64
|
+
|
65
|
+
Alf Mikula <amikula@gmail.com>
|
66
|
+
|
67
|
+
== ACKNOWLEDGEMENTS:
|
68
|
+
|
69
|
+
Thanks to David Johnson for telling me about the Aho-Corasick algorithm, and
|
70
|
+
for providing the original Aho-Corasick Ruby implementation.
|
71
|
+
|
72
|
+
== LICENSE:
|
73
|
+
|
74
|
+
(The MIT License)
|
75
|
+
|
76
|
+
Copyright (c) 2008 Los Angeles Times
|
77
|
+
|
78
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
79
|
+
a copy of this software and associated documentation files (the
|
80
|
+
'Software'), to deal in the Software without restriction, including
|
81
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
82
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
83
|
+
permit persons to whom the Software is furnished to do so, subject to
|
84
|
+
the following conditions:
|
85
|
+
|
86
|
+
The above copyright notice and this permission notice shall be
|
87
|
+
included in all copies or substantial portions of the Software.
|
88
|
+
|
89
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
90
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
91
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
92
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
93
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
94
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
95
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
data/config/hoe.rb
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
# Hoe configuration for the keyword_prospector gem: project metadata
# constants, the rubyforge username lookup and the Hoe task definition.
require 'yaml' # YAML.load is used below; do not rely on another library having loaded it
require 'keyword_prospector/version'

AUTHOR = 'FIXME full name'  # can also be an array of Authors
EMAIL = "FIXME email"
DESCRIPTION = "description of gem"
GEM_NAME = 'keyword_prospector' # what ppl will type to install your gem
RUBYFORGE_PROJECT = 'latimes' # The unix name for your project
HOMEPATH = "http://#{RUBYFORGE_PROJECT}.rubyforge.org"
DOWNLOAD_PATH = "http://rubyforge.org/projects/#{RUBYFORGE_PROJECT}"
EXTRA_DEPENDENCIES = [
  ['hpricot', '>= 0.6']
] # An array of rubygem dependencies [name, version]

@config_file = "~/.rubyforge/user-config.yml"
@config = nil
RUBYFORGE_USERNAME = "unknown"

# Loads the rubyforge user config (once) and copies its "username" entry
# into RUBYFORGE_USERNAME in place. Prints setup instructions and exits when
# the config file cannot be loaded.
def rubyforge_username
  unless @config
    begin
      @config = YAML.load(File.read(File.expand_path(@config_file)))
    rescue
      puts <<-EOS
ERROR: No rubyforge config file found: #{@config_file}
Run 'rubyforge setup' to prepare your env for access to Rubyforge
- See http://newgem.rubyforge.org/rubyforge.html for more details
      EOS
      exit
    end
  end
  RUBYFORGE_USERNAME.replace @config["username"]
end


REV = nil
# UNCOMMENT IF REQUIRED:
# REV = YAML.load(`svn info`)['Revision']
VERS = KeywordProspector::VERSION::STRING + (REV ? ".#{REV}" : "")
RDOC_OPTS = ['--quiet', '--title', 'keyword_prospector documentation',
    "--opname", "index.html",
    "--line-numbers",
    "--main", "README",
    "--inline-source"]

class Hoe
  # Returns the extra dependencies with any 'hoe' entry removed in place.
  def extra_deps
    @extra_deps.reject! { |x| Array(x).first == 'hoe' }
    @extra_deps
  end
end

# Generate all the Rake tasks
# Run 'rake -T' to see list of generated tasks (from gem root directory)
$hoe = Hoe.new(GEM_NAME, VERS) do |p|
  p.developer(AUTHOR, EMAIL)
  p.description = DESCRIPTION
  p.summary = DESCRIPTION
  p.url = HOMEPATH
  p.rubyforge_name = RUBYFORGE_PROJECT if RUBYFORGE_PROJECT
  p.test_globs = ["test/**/test_*.rb"]
  p.clean_globs |= ['**/.*.sw?', '*.gem', '.config', '**/.DS_Store']  #An array of file patterns to delete on clean.

  # == Optional
  p.changes = p.paragraphs_of("History.txt", 0..1).join("\n\n")
  # NOTE(review): EXTRA_DEPENDENCIES (hpricot) is defined above but not
  # registered here -- confirm whether the gem should declare it.
  #p.extra_deps = EXTRA_DEPENDENCIES

  #p.spec_extras = {}    # A hash of extra values to set in the gemspec.
end

CHANGES = $hoe.paragraphs_of('History.txt', 0..1).join("\\n\\n")
PATH    = (RUBYFORGE_PROJECT == GEM_NAME) ? RUBYFORGE_PROJECT : "#{RUBYFORGE_PROJECT}/#{GEM_NAME}"
$hoe.remote_rdoc_dir = File.join(PATH.gsub(/^#{RUBYFORGE_PROJECT}\/?/,''), 'rdoc')
$hoe.rsync_args = '-av --delete --ignore-errors'

# The post-install message is optional: fall back to an empty string when
# PostInstall.txt is missing or unreadable. File.read closes the file,
# unlike the previous File.open(...).read.
post_install_path = File.dirname(__FILE__) + "/../PostInstall.txt"
$hoe.spec.post_install_message =
  begin
    File.read(post_install_path)
  rescue SystemCallError, IOError
    ""
  end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'fileutils'
include FileUtils

require 'rubygems'

# Load each gem the Rakefile depends on. On the first one that is missing,
# print how to install it and stop.
required_gems = %w[rake hoe newgem rubigen]
required_gems.each do |gem_name|
  begin
    require gem_name
  rescue LoadError
    puts "This Rakefile requires the '#{gem_name}' RubyGem."
    puts "Installation: gem install #{gem_name} -y"
    exit
  end
end

# Put the project's lib directory at the front of the load path.
project_lib = File.join(File.dirname(__FILE__), %w[.. lib])
$:.unshift(project_lib)
|
@@ -0,0 +1,56 @@
|
|
1
|
+
#
|
2
|
+
# (C) 2008 Los Angeles Times
|
3
|
+
#
|
4
|
+
require 'set'
|
5
|
+
|
6
|
+
# Decorates a keyword as an HTML anchor element.
#
# A strategy holds a target url, a hash of additional HTML attributes and
# the set of keywords associated with that url.
class HyperlinkStrategy
  # The url rendered as the link's href attribute (nil when unset).
  attr_reader :url
  # The attributes rendered by #decorate; contains :href whenever a url is set.
  attr_reader :options
  # The Set of keywords associated with this strategy.
  attr_reader :keywords

  # url::     target of the generated link (optional).
  # options:: hash of HTML attributes to render on the link (optional).
  def initialize(url=nil, options={})
    @keywords = Set.new
    self.options = options
    self.url = url
  end

  # Replaces the keyword set. Accepts a single keyword or an array of
  # keywords; nested arrays are flattened.
  def keywords=(*keywords)
    @keywords = Set.new(keywords.flatten)
  end

  # Sets the target url and refreshes the :href entry in #options.
  def url=(url)
    @url = url
    rebuild_options
  end

  # Sets the HTML attributes. An explicit :href in +options+ takes
  # precedence over the url. nil is treated as an empty hash.
  def options=(options)
    @html_attributes = options || {}
    rebuild_options
  end

  # Adds one keyword to the set and returns self so calls can be chained.
  def add_keyword(keyword)
    @keywords.add(keyword)
    self
  end

  # Returns +keyword+ wrapped in an <a> tag carrying every entry of #options
  # as an attribute.
  #
  # NOTE(review): attribute values and the keyword are interpolated without
  # HTML escaping; callers appear to be expected to pass HTML-safe values.
  def decorate(keyword)
    attributes = options.map { |key, value| "#{key}=\"#{value}\"" }.join(" ")

    "<a " + attributes + ">#{keyword}</a>"
  end

  private

  # Recomputes #options from the caller-supplied attributes and the current
  # url. The caller's attributes are kept separately so that changing the url
  # later replaces the old :href instead of being overridden by it.
  def rebuild_options
    @options = if @url
      {:href => @url}.merge(@html_attributes)
    else
      @html_attributes
    end
  end
end
|
@@ -0,0 +1,174 @@
|
|
1
|
+
#
|
2
|
+
# (C) 2008 Los Angeles Times
|
3
|
+
#
|
4
|
+
require 'set'
|
5
|
+
require 'lookup_chain'
|
6
|
+
require 'keyword_prospector'
|
7
|
+
require 'hyperlink_strategy'
|
8
|
+
require 'rubygems'
|
9
|
+
require 'hpricot'
|
10
|
+
|
11
|
+
#
|
12
|
+
# Given a set of keywords and url's, and optionally HTML attributes to set
|
13
|
+
# on links, takes text and adds hyperlinks from the specified keywords to
|
14
|
+
# their associated URL's. Example:
|
15
|
+
#
|
16
|
+
# linker = KeywordLinker.new
|
17
|
+
# linker.add_url('http://www.latimes.com', 'Los Angeles Times')
|
18
|
+
# linker.link_text("Let's check out the Los Angeles Times!")
|
19
|
+
# => "Let's check out the <a href=\"http://www.latimes.com\">Los Angeles Times</a>!"
|
20
|
+
#
|
21
|
+
# KeywordLinker depends on hpricot for parsing HTML. This is done to prevent
|
22
|
+
# hyperlinks from being added inside of other hyperlinks and inside of
|
23
|
+
# attribute text.
|
24
|
+
#
|
25
|
+
class KeywordLinker
  # Shared pass-through strategy used for blacklisted keywords: its decorate
  # method returns the keyword unchanged, so a blacklisted match is emitted
  # as plain text instead of a link.
  @@blacklist_strategy = Object.new
  class << @@blacklist_strategy
    def decorate(keyword)
      keyword
    end
  end

  # Takes an optional array of lookup objects.  A lookup object is anything
  # that responds to the process method and returns an array of Match objects,
  # including KeywordLinker, KeywordProspector, and LookupChain objects.  If
  # multiple objects are specified, a LookupChain is created that gives highest
  # priority to matches from objects closer to the end of the array.
  def initialize(*lookups)
    @tree_initialized = true

    # A splat parameter is always an Array (possibly empty), so the chain is
    # built unconditionally.
    @lookup = LookupChain.new(lookups)
  end

  # Takes a url and a keyword String or Array of keywords, and adds it to the
  # tree of keywords in the KeywordLinker.  Takes an optional hash of html
  # attributes to be associated with this url.
  #
  # Only the first occurrence of the url will be linked.  If multiple keywords
  # are specified, then only the first occurrence of any of the keywords is
  # linked to the target url.  ie, if multiple keywords match for this url,
  # only one instance of one keyword will be linked.
  def add_url(url, keyword, html_attributes={})
    init_lookup

    strategy = HyperlinkStrategy.new(url, html_attributes)
    strategy.keywords = keyword

    @dl.add(strategy)
  end

  # Blacklist this keyword or array of keywords.  If a keyword is blacklisted,
  # it will not be linked.  For example, if the "Los Angeles" part of
  # "Los Angeles Times" is getting linked, you can blacklist
  # "Los Angeles Times" to keep it from being linked.
  def blacklist_keyword(keyword)
    init_lookup

    @dl.add(keyword, @@blacklist_strategy)
  end

  # Initialize the tree after _all_ url's have been added.  This needs to be
  # called once.  If you don't call init_tree, it will be called automatically
  # on the first call to the process or link_text method.  You may find this
  # annoying or inconvenient if it happens on the first request to your
  # application and you've constructed a large set of links.  Adding url's
  # after calling init_tree, process, or link_text is not supported.
  def init_tree
    return if @tree_initialized

    @dl.construct_fail
    @tree_initialized = true
  end

  # Adds links to known url's into the text provided.  Only the first instance
  # of each keyword or set of keywords associated to a url is linked.  In cases
  # of overlap, the longest keyword is chosen to resolve the overlap.
  #
  # Returns the linked document as a String.
  def link_text(text)
    init_tree unless @tree_initialized

    linked_outputs = Set.new
    htext = Hpricot(text)
    link_text_in_elem(htext, linked_outputs)

    htext.to_s
  end

  # Returns an array of matches in the specified text.  Doesn't filter overlaps
  # or parse HTML to prevent matches in attribute text or inside of existing
  # hyperlinks.  Primarily for internal use.
  def process(text)
    init_tree unless @tree_initialized

    @lookup.process(text)
  end

  private

  # Initialize the KeywordProspector object if needed.  Called only when adding
  # our own url's, not when aggregating other lookup objects.
  def init_lookup
    return if @dl

    @dl = KeywordProspector.new
    @tree_initialized = false

    if @lookup
      @lookup << @dl
    else
      @lookup = @dl
    end
  end

  # Given a single hpricot element, link text inside of all child elements.
  # Text nodes are rewritten in place; nested elements are processed
  # recursively, except for <a> elements, which are left untouched so links
  # are never created inside existing links.
  def link_text_in_elem(elem, linked_outputs)
    children = elem.children
    # NOTE(review): guards against a node reporting no children as nil --
    # confirm against Hpricot's behaviour for empty elements.
    return unless children

    children.each do |e|
      case e
      when Hpricot::Text
        text = e.to_s
        results = process(text)
        next if results.nil? || results.empty?

        results.sort!
        KeywordProspector.filter_overlaps(results)

        e.content = link_text_internal(text, results, linked_outputs)
      when Hpricot::Elem
        link_text_in_elem(e, linked_outputs) if e.stag.name != "a"
      end
    end
  end

  # Called internally to substitute links in element text.
  #
  # text::           the original text of one text node.
  # results::        sorted, non-overlapping matches found in +text+.
  # linked_outputs:: Set of outputs already linked; each output is linked at
  #                  most once and is added to the set when used.
  #
  # Match offsets are treated as byte offsets (the matcher walks the text
  # with each_byte), so the text is sliced by bytes, not characters.
  # Character slicing would misplace links after any multi-byte character.
  def link_text_internal(text, results, linked_outputs = nil)
    linked_outputs ||= Set.new

    retval = ""
    cursor = 0
    results.each do |result|
      next if linked_outputs.include?(result.output)

      # Copy the untouched text between the previous link and this match.
      if result.start_idx > cursor
        retval << byte_slice(text, cursor, result.start_idx - cursor)
      end

      retval << result.output.decorate(result.keyword)
      cursor = result.end_idx

      linked_outputs.add(result.output)
    end

    # Copy whatever follows the last link.
    total = byte_size(text)
    retval << byte_slice(text, cursor, total - cursor) if cursor < total

    retval
  end

  # Returns +length+ bytes of +text+ starting at byte offset +start+.  Falls
  # back to String#[] on rubies without byteslice, where indexing is already
  # byte based.
  def byte_slice(text, start, length)
    text.respond_to?(:byteslice) ? text.byteslice(start, length) : text[start, length]
  end

  # Returns the length of +text+ in bytes.
  def byte_size(text)
    text.respond_to?(:bytesize) ? text.bytesize : text.size
  end
end
|
174
|
+
|
@@ -0,0 +1,171 @@
|
|
1
|
+
#
|
2
|
+
# (C) 2008 Los Angeles Times
|
3
|
+
#
|
4
|
+
$:.unshift(File.dirname(__FILE__)) unless
|
5
|
+
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
6
|
+
|
7
|
+
require 'state'
|
8
|
+
require 'match'
|
9
|
+
|
10
|
+
# Mutable pair of offsets (begin, end) describing the span of the current
# candidate match while text is being scanned.  Assigned directly from
# Struct.new rather than subclassing an anonymous struct, so loading this
# file twice does not raise "superclass mismatch".
Position = Struct.new(:begin, :end) unless defined?(Position)
|
11
|
+
|
12
|
+
# KeywordProspector takes a collection of words, and optionally their
|
13
|
+
# associated outputs, and builds a match tree for running matches of the
|
14
|
+
# keywords against provided text. While construction of the Aho-Corasick
|
15
|
+
# tree takes a long time when there are many keywords, matching runs in time
|
16
|
+
# proportional to the length of the text provided. So, even if you have
|
17
|
+
# tens of thousands of keywords to match against, matching will still be
|
18
|
+
# very fast.
|
19
|
+
class KeywordProspector
  # Byte values that count as word characters: a-z, A-Z, 0-9 and underscore.
  WORD_CHARS = [97..122, 65..90, 48..57, 95].freeze

  # If words is provided, each word is added to the tree and the tree is
  # initialized.  Otherwise, call add for each word to place in the dictionary.
  def initialize(words=nil)
    @start = State.new(0, 0, [])
    if words
      words.each { |word| add word }

      construct_fail
    end
  end

  # Add an entry to the tree.  The optional output parameter can be any object,
  # and will be returned when this keyword is matched.  If the entry has a
  # _keywords_ method, it should return a collection of keywords.  In this
  # case, the output will be added for each keyword provided.  If output is
  # not provided, the entry is returned.
  def add(entry, output=nil)
    output ||= entry

    if entry.respond_to?(:keywords)
      entry.keywords.each do |keyword|
        add_internal(keyword, output)
      end
    else
      add_internal(entry, output)
    end
  end

  # Call once after adding all entries.  This constructs failure links in the
  # tree, which allow the state machine to move a single step with every input
  # character instead of backtracking back to the beginning of the tree when
  # a partial match fails to match the next character.
  def construct_fail
    # Breadth-first walk over the states; a plain Array is a sufficient FIFO.
    queue = []
    @start.values.each do |value|
      # After an earlier call the root links back to itself; skip those
      # self-loops so a repeated call cannot enqueue the root forever.
      next if value.equal?(@start)

      value.fail = @start
      value.fail_increment = 1
      queue.push value
    end

    # Must run after the first-level states are queued, so the root's
    # self-loops are not treated as real transitions above.
    prepare_root

    until queue.empty?
      r = queue.shift
      r.keys.each do |char|
        s = r[char]
        queue.push s

        # Follow failure links until a state with a transition on char is
        # found; the root has one for every byte, so this terminates.
        state = r.fail
        increment = 0
        until state[char]
          increment += state.fail_increment
          state = state.fail
        end
        s.fail = state[char]
        s.fail_increment = increment
      end
    end
  end

  # Process the provided text for matches.  Returns an array of Match objects.
  # Each Match object contains the keyword matched, the start and end position
  # in the text, and the output object specified when the keyword was added
  # to the search tree.  The end position is actually the position of the
  # character immediately following the end of the keyword, such that end
  # position minus start position equals the length of the keyword string.
  #
  # A keyword only matches when it is delimited by non-word characters (or
  # the start/end of the text) on both sides.
  #
  # When a block is given, each Match is yielded instead of collected and
  # nil is returned.
  #
  # Options:
  # * :filter_overlaps - When multiple keywords overlap, filter out overlaps
  #   by choosing the longest match.
  def process(bytes, options={})
    retval = [] unless block_given?
    state = @start
    position = Position.new(0, 0)
    total = byte_length(bytes)

    bytes.each_byte do |a|
      state = state.transition(a, position)
      next unless state.keyword

      # Positions advance once per byte, so neighbours are looked up by byte
      # offset; character indexing would pick the wrong neighbour after any
      # multi-byte character.
      next unless position.begin == 0 ||
                  KeywordProspector.word_delimiter?(byte_at(bytes, position.begin - 1))
      next unless position.end == total ||
                  KeywordProspector.word_delimiter?(byte_at(bytes, position.end))

      match = Match.new(state.keyword, position.begin, position.end, state.output)

      if block_given?
        yield match
      else
        retval << match
      end
    end

    KeywordProspector.filter_overlaps(retval) if retval && options[:filter_overlaps]

    retval
  end

  # Filters overlaps from an array of results, in place.  If two adjacent
  # results overlap, the shorter result is removed.  If both results have the
  # same length, the second result is removed.  The survivor is then compared
  # with its new neighbour, so chains of overlapping results are fully
  # resolved.
  def self.filter_overlaps(results)
    i = 0
    while i < results.size - 1
      a = results[i]
      b = results[i + 1]
      if a.overlap?(b)
        # Do not advance: whatever now sits at i / i+1 must be re-checked.
        results.delete_at(a.length < b.length ? i : i + 1)
      else
        i += 1
      end
    end
  end

  # Returns true if the character provided is a word character.  Accepts
  # either a byte value (Integer) or a String, of which the first byte is
  # examined.  Anything else, including nil, is not a word character.
  def self.word_char?(char)
    byte = char.is_a?(String) ? char.unpack('C').first : char

    WORD_CHARS.any? { |spec| spec === byte }
  end

  # Returns true if the character provided is not a word character.
  def self.word_delimiter?(char)
    !word_char?(char)
  end

  private

  # Add a single keyword to the tree.
  def add_internal(keyword, output=nil)
    cur_state = @start
    # assuming a string here
    keyword.each_byte { |c| cur_state = cur_state.insert_next_state(c) }
    cur_state.keyword = keyword
    cur_state.output = output
  end

  # Used internally to create links from root back to itself for all states
  # that are not beginnings of known keywords.
  def prepare_root
    0.upto(255) do |i|
      @start[i] = @start unless @start[i]
    end
  end

  # Returns the byte of +text+ at byte offset +index+ (nil when out of
  # range).  Falls back to String#[] on rubies without getbyte, where
  # indexing already yields a byte.
  def byte_at(text, index)
    text.respond_to?(:getbyte) ? text.getbyte(index) : text[index]
  end

  # Returns the length of +text+ in bytes.
  def byte_length(text)
    text.respond_to?(:bytesize) ? text.bytesize : text.length
  end
end
|