keyword_prospector 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +4 -0
- data/License.txt +20 -0
- data/Manifest.txt +40 -0
- data/README.txt +95 -0
- data/Rakefile +4 -0
- data/config/hoe.rb +73 -0
- data/config/requirements.rb +15 -0
- data/lib/hyperlink_strategy.rb +56 -0
- data/lib/keyword_decorator.rb +7 -0
- data/lib/keyword_linker.rb +174 -0
- data/lib/keyword_prospector.rb +171 -0
- data/lib/lookup_chain.rb +65 -0
- data/lib/match.rb +37 -0
- data/lib/profile.rb +72 -0
- data/lib/search_and_replace.rb +45 -0
- data/lib/state.rb +85 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/script/txt2html +82 -0
- data/setup.rb +1585 -0
- data/spec/hyperlink_strategy_spec.rb +69 -0
- data/spec/keyword_linker_spec.rb +226 -0
- data/spec/keyword_prospector_spec.rb +232 -0
- data/spec/lookup_chain_spec.rb +104 -0
- data/spec/match_spec.rb +140 -0
- data/spec/search_and_replace_spec.rb +58 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +10 -0
- data/spec/state_spec.rb +128 -0
- data/tasks/deployment.rake +34 -0
- data/tasks/environment.rake +7 -0
- data/tasks/rspec.rake +21 -0
- data/tasks/website.rake +17 -0
- data/website/index.html +141 -0
- data/website/index.txt +83 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +138 -0
- data/website/template.html.erb +48 -0
- metadata +107 -0
data/lib/lookup_chain.rb
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
#
|
2
|
+
# (C) 2008 Los Angeles Times
|
3
|
+
#
|
4
|
+
require 'enumerator'
|
5
|
+
|
6
|
+
# Manages a list of lookup objects, prioritizing the matches from the later
|
7
|
+
# objects over matches from previous objects in cases of exact collisions or
|
8
|
+
# cases where two matches have the same length. Otherwise, when there are
|
9
|
+
# overlaps, the longest match always wins.
|
10
|
+
class LookupChain
  # The lookup objects consulted by #process, in the order they were added.
  attr_reader :lookups

  # Builds a chain from any number of lookup objects. Nested arrays of lookups
  # are flattened.
  #
  # Raises ArgumentError if any lookup does not respond to +process+.
  def initialize(*lookups)
    @lookups = []

    lookups.flatten.each do |lookup|
      check_lookup(lookup)
      @lookups << lookup
    end
  end

  # Appends a lookup object to the end of the chain.
  #
  # Raises ArgumentError if the lookup does not respond to +process+.
  # Returns the underlying lookups array.
  def <<(lookup)
    check_lookup(lookup)

    lookups << lookup
  end

  # Runs every lookup over +text+ and merges the results into one sorted list
  # of matches in which no two matches overlap.
  #
  # When two matches overlap, the longer one is kept.  When they have the same
  # length, the match produced by the most recently processed lookup is kept.
  #
  # Match objects must support sorting, overlap?(other) and length.
  def process(text)
    matches = []

    @lookups.each do |lookup|
      new_matches = lookup.process(text)

      # Remember the new matches by object identity.  Matches compare equal
      # whenever their spans are equal, so an equality test (include?) cannot
      # tell a new match from an older match covering the same span.
      fresh = {}
      new_matches.each { |match| fresh[match.object_id] = true }

      matches += new_matches
      matches.sort!

      i = 0
      while i + 1 < matches.size
        match1, match2 = matches[i, 2]

        unless match1.overlap?(match2)
          i += 1
          next
        end

        if match1.length > match2.length
          matches.delete_at(i + 1)
        elsif match1.length < match2.length
          matches.delete_at(i)
        elsif fresh.key?(match2.object_id)
          # Tie, and the second match comes from the current lookup: it wins.
          matches.delete_at(i)
        else
          # Tie, and the second match is older: the first one wins.
          matches.delete_at(i + 1)
        end
      end
    end

    matches
  end

  private

  # Ensures the object can be used as a lookup.
  def check_lookup(lookup)
    return if lookup.respond_to?(:process)

    raise ArgumentError, "lookup objects must respond to process method"
  end
end
|
data/lib/match.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
#
|
2
|
+
# (C) 2008 Los Angeles Times
|
3
|
+
#
|
4
|
+
# A keyword occurrence: the keyword, the span [start_idx, end_idx) it covers
# and an optional output value associated with it.
#
# Matches are ordered by start_idx, then by end_idx.  Equality (via Comparable)
# therefore only considers the span, not the keyword or output.
class Match
  include Comparable
  attr_accessor :keyword, :start_idx, :end_idx, :output

  def initialize(keyword, start_idx=0, end_idx=0, output=nil)
    self.keyword = keyword
    self.start_idx = start_idx
    self.end_idx = end_idx
    self.output = output
  end

  # Orders by start index, breaking ties with the end index.
  #
  # Returns nil when +other+ does not expose start_idx/end_idx, so that
  # Comparable#== answers false for unrelated objects (such as nil) instead of
  # raising NoMethodError.
  def <=>(other)
    return nil unless other.respond_to?(:start_idx) && other.respond_to?(:end_idx)

    by_start = start_idx <=> other.start_idx
    return by_start unless by_start == 0

    end_idx <=> other.end_idx
  end

  # True when this match and +other+ share a start index, share an end index,
  # or when the earlier-starting match ends after the other one starts.
  def overlap?(other)
    return true if start_idx == other.start_idx || end_idx == other.end_idx

    if start_idx < other.start_idx
      end_idx > other.start_idx
    else
      other.end_idx > start_idx
    end
  end

  # Width of the span.
  def length
    end_idx - start_idx
  end
end
|
data/lib/profile.rb
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$:.unshift File.join(File.dirname(__FILE__),'..','lib')
|
4
|
+
|
5
|
+
require 'keyword_prospector'
|
6
|
+
|
7
|
+
# Small timing harness for KeywordProspector tree construction and matching.
class Profiler
  # Default number of words in the text used for the matching benchmark.
  MATCHWORD_COUNT=2000

  # Word list sampled by get_random_words when no other file is given.
  DICTIONARY_PATH="/usr/share/dict/words"

  # Builds a KeywordProspector from +keyword_count+ randomly chosen words.
  def self.get_tree(keyword_count)
    words = get_random_words(keyword_count)
    KeywordProspector.new(words)
  end

  # Repeatedly builds a tree until options[:time_limit] seconds (default 0.01)
  # have elapsed, then prints the average construction time.
  def self.profile_tree_creation(keyword_count, options={})
    total = 0
    time_limit = options[:time_limit] || 0.01

    start_time = Time.now
    iterations = 0
    while Time.now - start_time < time_limit
      start=Time.now
      get_tree(keyword_count)
      total += Time.now - start
      iterations += 1
    end

    puts "Average tree creation time over #{iterations} iterations with #{keyword_count} keywords: #{total/iterations.to_f}"
  end

  # Builds one tree, then repeatedly matches it against a text made of
  # options[:count] (default MATCHWORD_COUNT) random words until
  # options[:time_limit] seconds (default 300) have elapsed.  Prints the
  # average time per pass and the average number of matches per pass.
  def self.profile_keyword_matching(keyword_count, options={})
    dl = get_tree(keyword_count)
    time_limit = options[:time_limit] || 300

    total = 0
    total_matches = 0
    start_time = Time.now
    iterations = 0
    words = get_random_words(options[:count] || MATCHWORD_COUNT)
    text = words.join(" ")
    while Time.now - start_time < time_limit
      matches = 0
      start = Time.now
      dl.process(text) {matches += 1}
      total += Time.now - start
      total_matches += matches
      iterations += 1
    end

    puts "Average time spent matching: #{total/iterations.to_f}"
    puts "Average number of matches: #{total_matches/iterations.to_f}"
  end

  # Returns +count+ words sampled from the +dictionary+ file (one word per
  # line).
  #
  # The first +count+ lines fill the result; every later line then replaces a
  # random slot with probability count/lineno.
  #
  # Raises EOFError if the file has fewer than +count+ lines.
  #
  # NOTE: a bare `private` has no effect on `def self.` methods, so this
  # method has always been publicly callable and is left that way.
  def self.get_random_words(count=1000, dictionary=DICTIONARY_PATH)
    words = []

    # Block form closes the file even if reading raises.
    File.open(dictionary) do |dict|
      count.times do |index|
        words[index] = dict.readline.chomp
      end

      # Use the line yielded by each_line.  Reading another line here would
      # skip every other word and hit EOFError on an odd number of lines.
      dict.each_line do |word|
        index = rand(dict.lineno)
        words[index] = word.chomp if index < count
      end
    end

    words
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
#
|
2
|
+
# (C) 2008 Los Angeles Times
|
3
|
+
#
|
4
|
+
require 'keyword_prospector'
|
5
|
+
|
6
|
+
# Replaces registered keywords in a text, using a KeywordProspector to locate
# them.
class SearchAndReplace
  def initialize
    @dl = KeywordProspector.new
    @tree_initialized = false
  end

  # Registers +replacement+ as the output for the keyword +original+.
  #
  # NOTE(review): replacements added after the first replace_text call do not
  # trigger another construct_fail; confirm whether KeywordProspector supports
  # that before relying on it.
  def add_replacement(original, replacement)
    @dl.add(original, replacement)
  end

  # Calls construct_fail on the prospector the first time it is invoked;
  # later calls do nothing.
  def initialize_tree
    return if @tree_initialized

    @dl.construct_fail
    @tree_initialized = true
  end

  # Returns a copy of +text+ in which every match reported by the prospector
  # is replaced with that match's output.
  #
  # Matching is performed on the downcased text, while the untouched stretches
  # are copied from the original text using the match indexes.
  def replace_text(text)
    initialize_tree

    matches = @dl.process(text.downcase, :filter_overlaps => true)

    # Append in place; rebuilding the string with += copies it on every step.
    retval = "".dup
    current_index = 0
    matches.each do |match|
      if current_index < match.start_idx
        retval << text[current_index, match.start_idx - current_index]
      end

      retval << match.output
      current_index = match.end_idx
    end

    retval << text[current_index, text.length] unless current_index == text.length

    retval
  end
end
|
data/lib/state.rb
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
#
|
2
|
+
# (C) 2008 Los Angeles Times
|
3
|
+
#
|
4
|
+
|
5
|
+
# A node of the keyword-matching automaton.  Each state records the character
# that leads to it, its depth, a failure state and a table of next states
# indexed by character.
class State
  # Source of the ids handed out to new states.
  @@id_sequence = 1

  attr_reader :id
  attr_reader :char

  attr_accessor :fail
  attr_accessor :fail_increment
  attr_accessor :output
  attr_accessor :depth
  attr_accessor :keyword

  # char::       the character associated with this state
  # depth::      depth of this state (child states get depth + 1)
  # next_state:: transition table; a Hash by default.  An Array may be
  #              supplied instead, in which case nil/false slots count as
  #              "no transition".
  def initialize(char, depth=0, next_state={})
    @char = char
    @id = @@id_sequence
    @next_state = next_state
    @@id_sequence += 1
    self.depth = depth
  end

  # Returns the state reached on +char+, creating it one level deeper when no
  # transition exists yet.
  def insert_next_state(char)
    @next_state[char] ||= State.new(char, depth + 1)
  end

  # Raw lookup in the transition table (no failure handling).
  def [](char)
    @next_state[char]
  end

  # Sets the transition for +char+.
  def []=(char, value)
    @next_state[char] = value
  end

  # The states reachable from this one.
  #
  # Array-backed tables are handled here rather than by adding singleton
  # methods to the caller's array.
  def values
    if @next_state.is_a?(Array)
      @next_state.select { |state| state }
    else
      @next_state.values
    end
  end

  # The characters (or array indexes) that have a transition.
  def keys
    if @next_state.is_a?(Array)
      (0...@next_state.size).select { |index| @next_state[index] }
    else
      @next_state.keys
    end
  end

  # Follows the transition for +char+, falling back to the failure state when
  # this state has none.  When +position+ (an object with begin/end
  # accessors) is given, it is updated as follows:
  #
  # * self-loop:       begin is advanced by one and end is set to begin
  # * normal step:     end is advanced by one
  # * failure step:    begin is advanced by depth - (next depth - 1) and end
  #                    by one
  #
  # Returns the next state.
  def transition(char, position=nil)
    next_state = @next_state[char]

    if next_state == self
      if position
        position.end = position.begin += 1
      end
    elsif next_state
      position.end += 1 if position
    else
      # The failure state resolves the character; position is adjusted here
      # only, so it is not passed down.
      next_state = fail.transition(char)
      if position
        position.begin += depth - (next_state.depth - 1)
        position.end += 1
      end
    end

    next_state
  end
end
|
data/script/console
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# File: script/console
|
3
|
+
# Use the batch wrapper when RUBY_PLATFORM mentions mswin or mingw.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

# Command-line switches passed to irb: completion support plus this gem.
libs = " -r irb/completion"
# Perhaps use a console_lib to store any extra methods I may want available in the console
# libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
libs << " -r #{File.dirname(__FILE__) + '/../lib/keyword_prospector.rb'}"
puts "Loading keyword_prospector gem"
# Replace this process with irb.
exec "#{irb} #{libs} --simple-prompt"
|
data/script/destroy
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# Absolute path of the directory above script/.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Load RubiGen directly; if that fails, load RubyGems first and try again.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/destroy'

# A leading --help / -h argument is removed before the arguments are handed on.
ARGV.shift if %w[--help -h].include?(ARGV.first)
RubiGen::Base.use_component_sources!([:rubygems, :newgem, :newgem_theme, :test_unit])
RubiGen::Scripts::Destroy.new.run(ARGV)
|
data/script/generate
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# Absolute path of the directory above script/.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Load RubiGen directly; if that fails, load RubyGems first and try again.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/generate'

# A leading --help / -h argument is removed before the arguments are handed on.
ARGV.shift if %w[--help -h].include?(ARGV.first)
RubiGen::Base.use_component_sources!([:rubygems, :newgem, :newgem_theme, :test_unit])
RubiGen::Scripts::Generate.new.run(ARGV)
|
data/script/txt2html
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
GEM_NAME = 'keyword_prospector' # what ppl will type to install your gem
RUBYFORGE_PROJECT = 'keyword_prospector'

require 'rubygems'

# newgem and rubyforge are required; without them print install instructions
# and exit with status 1.
begin
  %w[newgem rubyforge].each { |library| require library }
rescue LoadError
  puts "\n\nGenerating the website requires the newgem RubyGem"
  puts "Install: gem install newgem\n\n"
  exit 1
end

%w[redcloth syntax/convertors/html erb].each { |library| require library }

# NOTE(review): the gem's file list shows no lib/keyword_prospector/version.rb;
# confirm this file exists before running the script.
require File.join(File.dirname(__FILE__), '..', 'lib', GEM_NAME, 'version.rb')

# Locals defined at the top level; they are visible to any later
# `binding`-based ERB evaluation in this script.
version = KeywordProspector::VERSION::STRING
download = "http://rubyforge.org/projects/#{RUBYFORGE_PROJECT}"
|
22
|
+
|
23
|
+
# Looks up RUBYFORGE_PROJECT in the "group_ids" table of the RubyForge
# autoconfig.
def rubyforge_project_id
  group_ids = RubyForge.new.autoconfig["group_ids"]
  group_ids[RUBYFORGE_PROJECT]
end
|
26
|
+
|
27
|
+
# Defined on Integer rather than Fixnum: Fixnum no longer exists in current
# Ruby, while on older versions Fixnum inherits from Integer, so
# Fixnum#ordinal keeps working everywhere.
class Integer
  # Returns the English ordinal suffix for this number: 'st', 'nd', 'rd' or
  # 'th'.
  def ordinal
    # teens (10..19 within each hundred) always take 'th'
    return 'th' if (10..19).include?(self % 100)

    # others are decided by the last digit
    case self % 10
    when 1 then 'st'
    when 2 then 'nd'
    when 3 then 'rd'
    else 'th'
    end
  end
end
|
40
|
+
|
41
|
+
class Time
  # Formats the time as "<day><ordinal suffix> <month name> <year>".
  # Relies on the day number responding to +ordinal+.
  def pretty
    day = mday
    month_name = strftime('%B')
    "#{day}#{day.ordinal} #{month_name} #{year}"
  end
end
|
46
|
+
|
47
|
+
# Converts +source+ to HTML with the Syntax convertor for +syntax+, then
# removes a <pre> at the start of a line and a </pre> at the end of a line.
def convert_syntax(syntax, source)
  convertor = Syntax::Convertors::HTML.for_syntax(syntax)
  highlighted = convertor.convert(source)
  highlighted.gsub(%r!^<pre>|</pre>$!,'')
end
|
50
|
+
|
51
|
+
# Arguments: the source text file and, optionally, an ERB template (defaults
# to website/template.html.erb next to this script's directory).
if ARGV.length >= 1
  src, template = ARGV
  template ||= File.join(File.dirname(__FILE__), '/../website/template.html.erb')
else
  puts("Usage: #{File.split($0).last} source.txt [template.html.erb] > output.html")
  exit!
end

# File.read closes the template file; File.open(...).read left it open.
template = ERB.new(File.read(template))

# NOTE: local variable names in this section are exposed to the ERB documents
# through `binding`, so they must not be renamed.
title = nil
body = nil
File.open(src) do |fsrc|
  # First line is the title; the remainder is the body, itself run through ERB.
  title_text = fsrc.readline
  body_text_template = fsrc.read
  body_text = ERB.new(body_text_template).result(binding)

  # Replace each <pre>/<code> element carrying a syntax attribute with a
  # placeholder so RedCloth does not touch the highlighted markup.
  syntax_items = []
  body_text.gsub!(%r!<(pre|code)[^>]*?syntax=['"]([^'"]+)[^>]*>(.*?)</\1>!m){
    ident = syntax_items.length
    element, syntax, source = $1, $2, $3
    syntax_items << "<#{element} class='syntax'>#{convert_syntax(syntax, source)}</#{element}>"
    "syntax-temp-#{ident}"
  }

  # Title: RedCloth output with all tags stripped.
  title = RedCloth.new(title_text).to_html.gsub(%r!<.*?>!,'').strip
  body = RedCloth.new(body_text).to_html

  # Put the highlighted snippets back in place of their placeholders.
  body.gsub!(%r!(?:<pre><code>)?syntax-temp-(\d+)(?:</code></pre>)?!){ syntax_items[$1.to_i] }
end
stat = File.stat(src)
created = stat.ctime
modified = stat.mtime

# Render the page template to standard output.
$stdout << template.result(binding)
|