keyword_prospector 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/License.txt +20 -0
- data/Manifest.txt +40 -0
- data/README.txt +95 -0
- data/Rakefile +4 -0
- data/config/hoe.rb +73 -0
- data/config/requirements.rb +15 -0
- data/lib/hyperlink_strategy.rb +56 -0
- data/lib/keyword_decorator.rb +7 -0
- data/lib/keyword_linker.rb +174 -0
- data/lib/keyword_prospector.rb +171 -0
- data/lib/lookup_chain.rb +65 -0
- data/lib/match.rb +37 -0
- data/lib/profile.rb +72 -0
- data/lib/search_and_replace.rb +45 -0
- data/lib/state.rb +85 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/script/txt2html +82 -0
- data/setup.rb +1585 -0
- data/spec/hyperlink_strategy_spec.rb +69 -0
- data/spec/keyword_linker_spec.rb +226 -0
- data/spec/keyword_prospector_spec.rb +232 -0
- data/spec/lookup_chain_spec.rb +104 -0
- data/spec/match_spec.rb +140 -0
- data/spec/search_and_replace_spec.rb +58 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +10 -0
- data/spec/state_spec.rb +128 -0
- data/tasks/deployment.rake +34 -0
- data/tasks/environment.rake +7 -0
- data/tasks/rspec.rake +21 -0
- data/tasks/website.rake +17 -0
- data/website/index.html +141 -0
- data/website/index.txt +83 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +138 -0
- data/website/template.html.erb +48 -0
- metadata +107 -0
data/History.txt
ADDED
data/License.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Copyright (c) 2008 Los Angeles Times
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
4
|
+
a copy of this software and associated documentation files (the
|
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
9
|
+
the following conditions:
|
|
10
|
+
|
|
11
|
+
The above copyright notice and this permission notice shall be
|
|
12
|
+
included in all copies or substantial portions of the Software.
|
|
13
|
+
|
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Manifest.txt
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
History.txt
|
|
2
|
+
License.txt
|
|
3
|
+
Manifest.txt
|
|
4
|
+
README.txt
|
|
5
|
+
Rakefile
|
|
6
|
+
config/hoe.rb
|
|
7
|
+
config/requirements.rb
|
|
8
|
+
lib/hyperlink_strategy.rb
|
|
9
|
+
lib/keyword_decorator.rb
|
|
10
|
+
lib/keyword_linker.rb
|
|
11
|
+
lib/keyword_prospector
|
|
12
|
+
lib/keyword_prospector.rb
|
|
13
|
+
lib/lookup_chain.rb
|
|
14
|
+
lib/match.rb
|
|
15
|
+
lib/profile.rb
|
|
16
|
+
lib/search_and_replace.rb
|
|
17
|
+
lib/state.rb
|
|
18
|
+
script/console
|
|
19
|
+
script/destroy
|
|
20
|
+
script/generate
|
|
21
|
+
script/txt2html
|
|
22
|
+
setup.rb
|
|
23
|
+
spec/hyperlink_strategy_spec.rb
|
|
24
|
+
spec/keyword_linker_spec.rb
|
|
25
|
+
spec/keyword_prospector_spec.rb
|
|
26
|
+
spec/lookup_chain_spec.rb
|
|
27
|
+
spec/match_spec.rb
|
|
28
|
+
spec/search_and_replace_spec.rb
|
|
29
|
+
spec/spec.opts
|
|
30
|
+
spec/spec_helper.rb
|
|
31
|
+
spec/state_spec.rb
|
|
32
|
+
tasks/deployment.rake
|
|
33
|
+
tasks/environment.rake
|
|
34
|
+
tasks/rspec.rake
|
|
35
|
+
tasks/website.rake
|
|
36
|
+
website/index.html
|
|
37
|
+
website/index.txt
|
|
38
|
+
website/javascripts/rounded_corners_lite.inc.js
|
|
39
|
+
website/stylesheets/screen.css
|
|
40
|
+
website/template.html.erb
|
data/README.txt
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
= keyword_prospector
|
|
2
|
+
|
|
3
|
+
* http://github.com/latimes/keyword_prospector
|
|
4
|
+
|
|
5
|
+
== DESCRIPTION:
|
|
6
|
+
|
|
7
|
+
KeywordProspector is a gem for associating keywords in text with arbitrary
|
|
8
|
+
output objects. It uses an Aho-Corasick tree to provide matching that scales
|
|
9
|
+
linearly with the length of the text, regardless of the number of keywords.
|
|
10
|
+
|
|
11
|
+
== FEATURES:
|
|
12
|
+
|
|
13
|
+
The core KeywordProspector engine has the following properties:
|
|
14
|
+
* Once a tree is built, matching against the tree is O(n) where n is the length
|
|
15
|
+
of your text, regardless of how many keywords you have in the tree.
|
|
16
|
+
* Arbitrary output objects can be associated with any keyword or set of
|
|
17
|
+
keywords.
|
|
18
|
+
|
|
19
|
+
KeywordLinker can be used to create links to designated url's:
|
|
20
|
+
* You can specify a single keyword to associate with a url.
|
|
21
|
+
* You can specify an array of keywords to associate with a url.
|
|
22
|
+
* Each keyword or group of keywords will be linked only once to the url
|
|
23
|
+
provided.
|
|
24
|
+
* Hyperlinks are not created within existing hyperlinks.
|
|
25
|
+
* Hyperlinks are not generated anywhere they would be illegal in HTML, such as
|
|
26
|
+
within attribute values.
|
|
27
|
+
|
|
28
|
+
== SYNOPSIS:
|
|
29
|
+
|
|
30
|
+
Use KeywordLinker to create links in HTML text. KeywordLinker will link only
|
|
31
|
+
the first alternative that appears in text:
|
|
32
|
+
|
|
33
|
+
require 'keyword_linker'
|
|
34
|
+
|
|
35
|
+
linker = KeywordLinker.new
|
|
36
|
+
linker.add_url("http://www.latimes.com", ["L.A. Times", "Los Angeles Times"])
|
|
37
|
+
linker.link_text("'L.A. Times' or 'Los Angeles Times'?")
|
|
38
|
+
=> "'<a href=\"http://www.latimes.com\">L.A. Times</a>' or 'Los Angeles Times'?"
|
|
39
|
+
|
|
40
|
+
linker.link_text("'Los Angeles Times' or 'L.A. Times'?")
|
|
41
|
+
=> "'<a href=\"http://www.latimes.com\">Los Angeles Times</a>' or 'L.A. Times'?"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
You can provide html options when adding url's:
|
|
45
|
+
|
|
46
|
+
linker = KeywordLinker.new
|
|
47
|
+
linker.add_url("http://www.latimes.com", "Los Angeles Times",
|
|
48
|
+
:title => "Visit the Los Angeles Times")
|
|
49
|
+
linker.add_url("http://www.google.com", ["Google", "The Google"],
|
|
50
|
+
:title => "Go check it out on The Google!")
|
|
51
|
+
|
|
52
|
+
linker.link_text("Do you prefer The Google or the Los Angeles Times?")
|
|
53
|
+
=> "Do you prefer <a href=\"http://www.google.com\" title=\"Go check it out on The Google!\">The Google</a> or the <a href=\"http://www.latimes.com\" title=\"Visit the Los Angeles Times\">Los Angeles Times</a>?"
|
|
54
|
+
|
|
55
|
+
== DEPENDENCIES:
|
|
56
|
+
|
|
57
|
+
* Hpricot
|
|
58
|
+
|
|
59
|
+
== INSTALL:
|
|
60
|
+
|
|
61
|
+
sudo gem install keyword_prospector
|
|
62
|
+
|
|
63
|
+
== AUTHOR:
|
|
64
|
+
|
|
65
|
+
Alf Mikula <amikula@gmail.com>
|
|
66
|
+
|
|
67
|
+
== ACKNOWLEDGEMENTS:
|
|
68
|
+
|
|
69
|
+
Thanks to David Johnson for telling me about the Aho-Corasick algorithm, and
|
|
70
|
+
for providing the original Aho-Corasick Ruby implementation.
|
|
71
|
+
|
|
72
|
+
== LICENSE:
|
|
73
|
+
|
|
74
|
+
(The MIT License)
|
|
75
|
+
|
|
76
|
+
Copyright (c) 2008 Los Angeles Times
|
|
77
|
+
|
|
78
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
79
|
+
a copy of this software and associated documentation files (the
|
|
80
|
+
'Software'), to deal in the Software without restriction, including
|
|
81
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
82
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
83
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
84
|
+
the following conditions:
|
|
85
|
+
|
|
86
|
+
The above copyright notice and this permission notice shall be
|
|
87
|
+
included in all copies or substantial portions of the Software.
|
|
88
|
+
|
|
89
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
|
90
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
91
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
92
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
|
93
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
|
94
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
|
95
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
data/config/hoe.rb
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
require 'keyword_prospector/version'
|
|
2
|
+
|
|
3
|
+
# Gem metadata consumed by the Hoe specification further down in this file.
# NOTE(review): AUTHOR, EMAIL and DESCRIPTION still hold generator
# placeholders -- confirm whether they should carry the real values.
AUTHOR = 'FIXME full name' # can also be an array of Authors
EMAIL = "FIXME email"
DESCRIPTION = "description of gem"
GEM_NAME = 'keyword_prospector' # what ppl will type to install your gem
RUBYFORGE_PROJECT = 'latimes' # The unix name for your project
# Both URLs are derived from RUBYFORGE_PROJECT, so it must be defined first.
HOMEPATH = "http://#{RUBYFORGE_PROJECT}.rubyforge.org"
DOWNLOAD_PATH = "http://rubyforge.org/projects/#{RUBYFORGE_PROJECT}"
# Declared here, but the `p.extra_deps = EXTRA_DEPENDENCIES` line in the Hoe
# block below is commented out, so this list is not applied by this file.
EXTRA_DEPENDENCIES = [
  ['hpricot', '>= 0.6']
] # An array of rubygem dependencies [name, version]

# State read by rubyforge_username: the config path and the lazily loaded,
# parsed configuration (nil until the first successful load).
@config_file = "~/.rubyforge/user-config.yml"
@config = nil
# Placeholder string that rubyforge_username overwrites in place via #replace.
RUBYFORGE_USERNAME = "unknown"
|
|
17
|
+
# Loads the RubyForge user configuration (once) from @config_file and copies
# its "username" entry into the RUBYFORGE_USERNAME string, mutating that
# string in place. Returns RUBYFORGE_USERNAME.
#
# If reading or parsing the file raises, setup instructions are printed and
# the process exits.
def rubyforge_username
  @config ||= begin
    YAML.load(File.read(File.expand_path(@config_file)))
  rescue
    puts <<-EOS
ERROR: No rubyforge config file found: #{@config_file}
Run 'rubyforge setup' to prepare your env for access to Rubyforge
- See http://newgem.rubyforge.org/rubyforge.html for more details
    EOS
    exit
  end

  RUBYFORGE_USERNAME.replace(@config["username"])
end
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Optional revision suffix for the gem version; nil means "no suffix".
REV = nil
# UNCOMMENT IF REQUIRED:
# REV = YAML.load(`svn info`)['Revision']
# Version string handed to Hoe: the library version, plus ".<REV>" when REV is set.
VERS = KeywordProspector::VERSION::STRING + (REV ? ".#{REV}" : "")
# Command-line options for rdoc generation.
RDOC_OPTS = ['--quiet', '--title', 'keyword_prospector documentation',
  "--opname", "index.html",
  "--line-numbers",
  "--main", "README",
  "--inline-source"]
|
|
43
|
+
|
|
44
|
+
# Reopens Hoe so that the dependency list it reports never includes the
# 'hoe' gem itself.
class Hoe
  # Removes every entry whose name (the entry itself, or the first element
  # of a [name, version] pair) is 'hoe' from @extra_deps, in place, and
  # returns the same array.
  def extra_deps
    @extra_deps.delete_if { |dependency| Array(dependency).first == 'hoe' }
  end
end
|
|
50
|
+
|
|
51
|
+
# Generate all the Rake tasks
|
|
52
|
+
# Run 'rake -T' to see list of generated tasks (from gem root directory)
|
|
53
|
+
# Build the Hoe specification for this gem from the constants defined above.
$hoe = Hoe.new(GEM_NAME, VERS) do |p|
  p.developer(AUTHOR, EMAIL)
  p.description = DESCRIPTION
  p.summary = DESCRIPTION
  p.url = HOMEPATH
  p.rubyforge_name = RUBYFORGE_PROJECT if RUBYFORGE_PROJECT
  # NOTE(review): the manifest lists spec/*_spec.rb files and no test/
  # directory, so this glob probably matches nothing -- confirm.
  p.test_globs = ["test/**/test_*.rb"]
  p.clean_globs |= ['**/.*.sw?', '*.gem', '.config', '**/.DS_Store'] #An array of file patterns to delete on clean.

  # == Optional
  p.changes = p.paragraphs_of("History.txt", 0..1).join("\n\n")
  #p.extra_deps = EXTRA_DEPENDENCIES

  #p.spec_extras = {} # A hash of extra values to set in the gemspec.
end

# Same text as p.changes above. Paragraphs are joined with real blank lines;
# the previous "\\n\\n" literal inserted the four characters \n\n instead.
CHANGES = $hoe.paragraphs_of('History.txt', 0..1).join("\n\n")
PATH = (RUBYFORGE_PROJECT == GEM_NAME) ? RUBYFORGE_PROJECT : "#{RUBYFORGE_PROJECT}/#{GEM_NAME}"
# Remote rdoc directory is PATH with the leading project name stripped.
$hoe.remote_rdoc_dir = File.join(PATH.gsub(/^#{RUBYFORGE_PROJECT}\/?/,''), 'rdoc')
$hoe.rsync_args = '-av --delete --ignore-errors'
# PostInstall.txt is optional: fall back to an empty message when it cannot
# be read. File.read closes the file, unlike File.open(...).read.
$hoe.spec.post_install_message = File.read(File.dirname(__FILE__) + "/../PostInstall.txt") rescue ""
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Load-time requirements for the Rakefile: pulls in FileUtils, verifies the
# build-tool gems are installed, and puts ../lib on the load path.
require 'fileutils'
include FileUtils

require 'rubygems'
# Require each build-time gem; on the first one that is missing, print
# install instructions and exit instead of failing later with a LoadError.
%w[rake hoe newgem rubigen].each do |req_gem|
  begin
    require req_gem
  rescue LoadError
    puts "This Rakefile requires the '#{req_gem}' RubyGem."
    puts "Installation: gem install #{req_gem} -y"
    exit
  end
end

# Make the gem's own lib directory (relative to this file) requirable.
$:.unshift(File.join(File.dirname(__FILE__), %w[.. lib]))
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
#
|
|
2
|
+
# (C) 2008 Los Angeles Times
|
|
3
|
+
#
|
|
4
|
+
require 'set'
|
|
5
|
+
|
|
6
|
+
# Output strategy that wraps a keyword in an HTML anchor.
#
# A strategy holds a target url, a hash of extra HTML attributes and the set
# of keywords that should be linked to the url. The effective attribute hash
# (#options) is the caller-supplied attributes with :href => url added when a
# url is set; an explicit :href attribute takes precedence over the url.
class HyperlinkStrategy
  attr_reader :url
  attr_reader :options

  # url     - link target, or nil for no :href attribute.
  # options - hash of additional HTML attributes (name => value).
  def initialize(url=nil, options={})
    @keywords = Set.new
    self.options = options
    self.url = url
  end

  # Replaces the keyword set. Accepts a single keyword or an array of
  # keywords; nested arrays are flattened.
  def keywords=(*keywords)
    @keywords = Set.new(keywords.flatten)
  end

  # Returns the Set of keywords associated with this strategy.
  def keywords
    @keywords
  end

  # Sets the link target and recomputes #options from the attributes
  # originally supplied by the caller. Merging from the caller's attributes
  # (rather than from the previously merged hash) is what lets a second
  # assignment replace the :href generated by the first one.
  def url=(url)
    @url = url
    merge_options(@base_options || {})
  end

  # Sets the caller-supplied HTML attributes and recomputes #options.
  def options=(options)
    @base_options = options
    merge_options(options)
  end

  # Adds one keyword to the set. Returns self so calls can be chained.
  def add_keyword(keyword)
    @keywords.add(keyword)
    self
  end

  # Returns the keyword wrapped in an <a> tag carrying every entry of
  # #options as a name="value" attribute. Values are inserted verbatim.
  def decorate(keyword)
    attributes = options.map { |key, value| "#{key}=\"#{value}\"" }.join(" ")

    "<a " + attributes + ">#{keyword}</a>"
  end

  private
  # Stores the effective attribute hash: the given attributes, with the
  # current url as the default :href when a url is set.
  def merge_options(options)
    if @url
      @options = {:href => @url}.merge(options)
    else
      @options = options
    end
  end
end
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
#
|
|
2
|
+
# (C) 2008 Los Angeles Times
|
|
3
|
+
#
|
|
4
|
+
require 'set'
|
|
5
|
+
require 'lookup_chain'
|
|
6
|
+
require 'keyword_prospector'
|
|
7
|
+
require 'hyperlink_strategy'
|
|
8
|
+
require 'rubygems'
|
|
9
|
+
require 'hpricot'
|
|
10
|
+
|
|
11
|
+
#
|
|
12
|
+
# Given a set of keywords and url's, and optionally HTML attributes to set
|
|
13
|
+
# on links, takes text and adds hyperlinks from the specified keywords to
|
|
14
|
+
# their associated URL's. Example:
|
|
15
|
+
#
|
|
16
|
+
# linker = KeywordLinker.new
|
|
17
|
+
# linker.add_url('http://www.latimes.com', 'Los Angeles Times')
|
|
18
|
+
# linker.link_text("Let's check out the Los Angeles Times!")
|
|
19
|
+
# => "Let's check out the <a href=\"http://www.latimes.com\">Los Angeles Times</a>!"
|
|
20
|
+
#
|
|
21
|
+
# KeywordLinker depends on hpricot for parsing HTML. This is done to prevent
|
|
22
|
+
# hyperlinks from being added inside of other hyperlinks and inside of
|
|
23
|
+
# attribute text.
|
|
24
|
+
#
|
|
25
|
+
class KeywordLinker
  # Shared strategy for blacklisted keywords: "decorating" a keyword with it
  # returns the keyword unchanged, so no link is produced.
  @@blacklist_strategy = Object.new
  class << @@blacklist_strategy
    def decorate(keyword)
      keyword
    end
  end

  # Accepts any number of lookup objects (anything responding to #process)
  # and wraps them in a LookupChain. The tree is considered initialized until
  # the first url or blacklist entry creates this linker's own prospector.
  def initialize(*lookups)
    @tree_initialized = true
    @lookup = LookupChain.new(lookups) if lookups
  end

  # Registers a url together with a keyword (String) or an Array of
  # keywords, plus an optional hash of HTML attributes for the generated
  # link. All keywords of one call share a single strategy object, which is
  # what limits them to one link per text.
  def add_url(url, keyword, html_attributes={})
    init_lookup

    link_strategy = HyperlinkStrategy.new(url, html_attributes)
    link_strategy.keywords = keyword

    @dl.add(link_strategy)
  end

  # Registers a keyword (or array of keywords) with the pass-through
  # blacklist strategy so that a match on it is emitted without a link.
  def blacklist_keyword(keyword)
    init_lookup

    @dl.add(keyword, @@blacklist_strategy)
  end

  # Builds the failure links of this linker's prospector, once. Called
  # automatically by process and link_text when still pending.
  def init_tree
    return if @tree_initialized

    @dl.construct_fail
    @tree_initialized = true
  end

  # Parses text with Hpricot, links keywords found in its text nodes (each
  # output object at most once per call) and returns the resulting markup.
  def link_text(text)
    init_tree unless @tree_initialized

    already_linked = Set.new
    document = Hpricot(text)
    link_text_in_elem(document, already_linked)

    document.to_s
  end

  # Returns the raw matches the underlying lookup finds in text, without
  # overlap filtering or HTML awareness.
  def process(text)
    init_tree unless @tree_initialized

    @lookup.process(text)
  end

  private
  # Lazily creates this linker's own KeywordProspector, marks the tree as
  # needing initialization, and registers the prospector as (or with) the
  # lookup used by process.
  def init_lookup
    return if @dl

    @dl = KeywordProspector.new
    @tree_initialized = false

    if @lookup
      @lookup << @dl
    else
      @lookup = @dl
    end
  end

  # Walks the children of an Hpricot node: text nodes get their keywords
  # linked, elements are descended into unless they are <a> elements.
  def link_text_in_elem(elem, linked_outputs)
    elem.children.each do |child|
      if Hpricot::Text === child
        raw_text = child.to_s

        matches = process(raw_text)
        matches.sort!
        KeywordProspector.filter_overlaps(matches)

        next if matches.nil? || matches.empty?

        child.content = link_text_internal(raw_text, matches, linked_outputs)
      elsif Hpricot::Elem === child
        link_text_in_elem(child, linked_outputs) unless child.stag.name == "a"
      end
    end
  end

  # Rebuilds text with each match replaced by its decorated form. A match
  # whose output object is already in linked_outputs is left as plain text;
  # every output that gets decorated is added to linked_outputs.
  def link_text_internal(text, results, linked_outputs = nil)
    linked_outputs ||= Set.new

    rebuilt = ""
    cursor = 0
    results.each do |result|
      next if linked_outputs.include?(result.output)

      # Copy the untouched text between the previous match and this one.
      gap = result.start_idx - cursor
      rebuilt += text[cursor, gap] if gap > 0

      rebuilt += result.output.decorate(result.keyword)
      cursor = result.end_idx

      linked_outputs.add(result.output)
    end

    # Copy whatever follows the last decorated match.
    rebuilt += text[cursor, text.size - cursor] if cursor < text.size

    rebuilt
  end
end
|
|
174
|
+
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
#
|
|
2
|
+
# (C) 2008 Los Angeles Times
|
|
3
|
+
#
|
|
4
|
+
$:.unshift(File.dirname(__FILE__)) unless
|
|
5
|
+
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
|
6
|
+
|
|
7
|
+
require 'state'
|
|
8
|
+
require 'match'
|
|
9
|
+
|
|
10
|
+
# Mutable pair of offsets (begin, end) handed to State#transition while text
# is being scanned.
class Position < Struct.new(:begin, :end)
end
|
|
11
|
+
|
|
12
|
+
# KeywordProspector takes a collection of words, and optionally their
|
|
13
|
+
# associated outputs, and builds a match tree for running matches of the
|
|
14
|
+
# keywords against provided text. While construction of the Aho-Corasick
|
|
15
|
+
# tree takes a long time when there are many keywords, matching runs in time
|
|
16
|
+
# proportional to the length of the text provided. So, even if you have
|
|
17
|
+
# tens of thousands of keywords to match against, matching will still be
|
|
18
|
+
# very fast.
|
|
19
|
+
class KeywordProspector
  # If words is provided, each word is added to the tree and the failure
  # links are built immediately. Otherwise call add for every entry and
  # then construct_fail once.
  def initialize(words=nil)
    @start = State.new(0, 0, [])
    if(words)
      words.each{|word| add word}

      construct_fail
    end
  end

  # Adds an entry to the tree. output may be any object and is attached to
  # the matches for this entry; it defaults to the entry itself. If the
  # entry responds to #keywords, every keyword it returns is added with the
  # same output; otherwise the entry itself is used as the keyword.
  def add(entry, output=nil)
    output ||= entry

    if (entry.respond_to?(:keywords))
      entry.keywords.each do |keyword|
        add_internal(keyword, output)
      end
    else
      add_internal(entry, output)
    end
  end

  # Call once after adding all entries. Walks the tree breadth-first and
  # gives every state a failure link (and the accompanying fail_increment),
  # so matching can continue from the failure state instead of restarting
  # at the root when a partial match breaks off.
  def construct_fail
    queue = Queue.new
    # States directly below the root fail back to the root.
    @start.values.each do |value|
      value.fail = @start
      value.fail_increment = 1
      queue.push value
    end

    # Only after the real children are queued is the root padded with
    # self-links, which guarantees the inner while loop below terminates.
    prepare_root

    while !queue.empty?
      r = queue.pop
      r.keys.each do |char|
        s = r[char]
        queue.push s
        state = r.fail
        increment = 0
        # Follow failure links until a state with a transition on char is found.
        while !state[char]
          increment += state.fail_increment
          state = state.fail
        end
        s.fail = state[char]
        s.fail_increment = increment
      end
    end
  end

  # Scans the given string and reports every keyword that occurs delimited
  # by non-word characters (or the string boundaries). Without a block an
  # array of Match objects is returned; with a block each Match is yielded
  # and nil is returned.
  #
  # Options:
  # * :filter_overlaps - when no block is given, remove overlapping matches
  #   from the returned array, keeping the longer one.
  #
  # NOTE(review): the scan walks bytes (each_byte) while bytes[...] and
  # bytes.length index by character on Ruby >= 1.9 -- confirm behaviour for
  # multi-byte text.
  def process(bytes, options={})
    retval = [] unless block_given?
    state = @start
    position = Position.new(0, 0)
    bytes.each_byte do |a|
      state = state.transition(a, position)
      if state.keyword && ((position.begin == 0 ||
          KeywordProspector.word_delimiter?(bytes[position.begin-1])) &&
          (position.end == bytes.length ||
          KeywordProspector.word_delimiter?(bytes[position.end])))
        match = Match.new(state.keyword, position.begin, position.end, state.output)

        # Hand the match to the caller's block, or collect it.
        if block_given?
          yield match
        else
          retval << match
        end
      end
    end

    if retval
      if (options[:filter_overlaps])
        KeywordProspector.filter_overlaps(retval)
      end
    end

    return retval
  end

  # Removes overlapping entries from results, in place. When two adjacent
  # results overlap the shorter one is removed; when both have the same
  # length the second one is removed. Returns nil.
  #
  # After a removal the surviving result is compared again with its new
  # neighbours, so a long match that overlaps several shorter ones removes
  # all of them rather than only the first.
  def self.filter_overlaps(results)
    i = 0
    while (i < results.size-1)
      a = results[i]
      b = results[i+1]
      if a.overlap?(b)
        if (a.length < b.length)
          results.delete_at(i)
          # b moved into slot i; step back so it is also checked against
          # the element that now precedes it.
          i -= 1 if i > 0
        else
          results.delete_at(i+1)
        end
      else
        i += 1
      end
    end
  end

  # Character specs that count as part of a word.
  WORD_CHARS=[?a..?z, ?A..?Z, ?0..?9, ?_]

  # Returns true if the character provided is a word character.
  def self.word_char?(char)
    WORD_CHARS.any? { |spec| spec === char }
  end

  # Returns true if the character provided is not a word character.
  def self.word_delimiter?(char)
    return !word_char?(char)
  end

  private
  # Adds a single keyword to the tree, creating one state per byte, and
  # records the keyword and its output on the final state.
  def add_internal(keyword, output=nil)
    cur_state = @start
    # assuming a string here
    keyword.each_byte {|c| cur_state = cur_state.insert_next_state(c)}
    cur_state.keyword = keyword
    cur_state.output = output
  end

  # Links the root back to itself for every byte value that does not start
  # a known keyword.
  def prepare_root
    0.upto(255) do |i|
      @start[i] = @start if !@start[i]
    end
  end
end
|