keyword_prospector 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,65 @@
1
+ #
2
+ # (C) 2008 Los Angeles Times
3
+ #
4
+ require 'enumerator'
5
+
6
# Manages a list of lookup objects and merges the matches they produce into
# one sorted list without overlaps.
#
# When two matches overlap, the longer one always wins.  When two overlapping
# matches have the same length (including exact collisions), the match that
# came from the later lookup in the chain wins over the one from an earlier
# lookup.
#
# Match objects returned by the lookups must respond to <=>, overlap? and
# length.
class LookupChain
  attr_reader :lookups

  # lookups:: any number of lookup objects, optionally nested in arrays.
  #
  # Raises ArgumentError if any of them does not respond to +process+.
  def initialize(*lookups)
    @lookups = []

    lookups.flatten.each do |lookup|
      check_lookup(lookup)
      @lookups << lookup
    end
  end

  # Appends a lookup to the end of the chain (highest priority).
  #
  # Returns the array of lookups.  Raises ArgumentError if +lookup+ does not
  # respond to +process+.
  def <<(lookup)
    check_lookup(lookup)

    lookups << lookup
  end

  # Runs every lookup over +text+, in chain order, and returns the sorted
  # array of surviving matches.
  def process(text)
    matches = []

    @lookups.each do |lookup|
      new_matches = lookup.process(text)
      matches = resolve_overlaps(matches + new_matches, new_matches)
    end

    matches
  end

  private

  # Sorts +candidates+ and removes one match from every overlapping adjacent
  # pair until no adjacent pair overlaps.  +new_matches+ are the candidates
  # contributed by the lookup currently being merged.
  def resolve_overlaps(candidates, new_matches)
    remaining = candidates.sort
    i = 0

    while i + 1 < remaining.size
      first, second = remaining[i, 2]

      if first.overlap?(second)
        # Do not advance: the survivor must be compared with its new neighbour.
        remaining.delete_at(i + loser_offset(first, second, new_matches))
      else
        i += 1
      end
    end

    remaining
  end

  # Returns 0 when +first+ must be dropped and 1 when +second+ must be dropped.
  def loser_offset(first, second, new_matches)
    return 1 if first.length > second.length
    return 0 if first.length < second.length

    # Same length: the match from the lookup being merged (the later one)
    # wins.  Identity is used on purpose; matches may define == in terms of
    # position only, which cannot tell two colliding matches apart.
    second_is_new = new_matches.any? { |match| match.equal?(second) }
    second_is_new ? 0 : 1
  end

  # Raises ArgumentError unless +lookup+ responds to +process+.
  def check_lookup(lookup)
    return if lookup.respond_to?(:process)

    raise ArgumentError, "lookup objects must respond to process method"
  end
end
data/lib/match.rb ADDED
@@ -0,0 +1,37 @@
1
+ #
2
+ # (C) 2008 Los Angeles Times
3
+ #
4
# A keyword occurrence found in a text: the keyword, the index range it
# covers (start_idx up to end_idx, with length == end_idx - start_idx) and an
# optional output value attached to the keyword.
#
# Matches order (and, through Comparable, compare equal) by position only:
# first by start_idx, then by end_idx.  The keyword and output are ignored.
class Match
  include Comparable
  attr_accessor :keyword, :start_idx, :end_idx, :output

  def initialize(keyword, start_idx=0, end_idx=0, output=nil)
    self.keyword = keyword
    self.start_idx = start_idx
    self.end_idx = end_idx
    self.output = output
  end

  # Compares positions: start_idx first, end_idx as the tie-breaker.
  #
  # Returns nil when +other+ has no position, so Comparable treats it as
  # incomparable (e.g. match == nil is false) instead of raising
  # NoMethodError.
  def <=>(other)
    return nil unless other.respond_to?(:start_idx) && other.respond_to?(:end_idx)

    [start_idx, end_idx] <=> [other.start_idx, other.end_idx]
  end

  # True when this match and +other+ share a start index, share an end index,
  # or the one that starts first ends after the other one starts.
  def overlap?(other)
    return true if start_idx == other.start_idx || end_idx == other.end_idx

    if start_idx < other.start_idx
      end_idx > other.start_idx
    else
      other.end_idx > start_idx
    end
  end

  # Number of index positions covered by the match.
  def length
    end_idx - start_idx
  end
end
data/lib/profile.rb ADDED
@@ -0,0 +1,72 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $:.unshift File.join(File.dirname(__FILE__),'..','lib')
4
+
5
+ require 'keyword_prospector'
6
+
7
# Small timing harness for KeywordProspector.  Keywords and sample text are
# drawn at random from a word-list file (one word per line).
class Profiler
  MATCHWORD_COUNT=2000
  # Word list used when no explicit path is given.
  DICTIONARY_PATH = "/usr/share/dict/words"

  # Builds a KeywordProspector from +keyword_count+ random dictionary words.
  def self.get_tree(keyword_count)
    words = get_random_words(keyword_count)
    KeywordProspector.new(words)
  end

  # Repeatedly builds a tree of +keyword_count+ keywords until
  # options[:time_limit] seconds (default 0.01) have elapsed, then prints the
  # average construction time.
  def self.profile_tree_creation(keyword_count, options={})
    total = 0
    time_limit = options[:time_limit] || 0.01

    start_time = Time.now
    iterations = 0
    while Time.now - start_time < time_limit
      start = Time.now
      get_tree(keyword_count)
      total += Time.now - start
      iterations += 1
    end

    puts "Average tree creation time over #{iterations} iterations with #{keyword_count} keywords: #{total/iterations.to_f}"
  end

  # Builds one tree of +keyword_count+ keywords, then repeatedly matches it
  # against a text of options[:count] (default MATCHWORD_COUNT) random words
  # joined by spaces until options[:time_limit] seconds (default 300) have
  # elapsed.  Prints the average time per pass and average match count.
  def self.profile_keyword_matching(keyword_count, options={})
    dl = get_tree(keyword_count)
    time_limit = options[:time_limit] || 300

    total = 0
    total_matches = 0
    start_time = Time.now
    iterations = 0
    words = get_random_words(options[:count] || MATCHWORD_COUNT)
    text = words.join(" ")
    while Time.now - start_time < time_limit
      matches = 0
      start = Time.now
      dl.process(text) { matches += 1 }
      total += Time.now - start
      total_matches += matches
      iterations += 1
    end

    puts "Average time spent matching: #{total/iterations.to_f}"
    puts "Average number of matches: #{total_matches/iterations.to_f}"
  end

  # Returns up to +count+ words sampled from the word list at +path+.
  #
  # The first +count+ lines fill the result; each later line N (1-based)
  # replaces a random slot when rand(N) lands inside the result, so every
  # line of the file has a chance of being chosen.  When the file holds fewer
  # than +count+ lines, all of them are returned.
  #
  # This method stays public: a bare `private` does not apply to methods
  # defined with `def self.`, so callers have always been able to reach it.
  def self.get_random_words(count=1000, path=DICTIONARY_PATH)
    words = []

    # Block form closes the file even if reading raises.
    File.open(path) do |dict|
      dict.each_line do |line|
        word = line.chomp

        if words.size < count
          words << word
        else
          # dict.lineno is the number of lines read so far, this one included.
          index = rand(dict.lineno)
          words[index] = word if index < count
        end
      end
    end

    words
  end
end
@@ -0,0 +1,45 @@
1
+ #
2
+ # (C) 2008 Los Angeles Times
3
+ #
4
+ require 'keyword_prospector'
5
+
6
# Replaces registered keywords in a text with their replacement strings,
# using a KeywordProspector to locate the keywords.
class SearchAndReplace
  def initialize
    @dl = KeywordProspector.new
    @tree_initialized = false
  end

  # Registers +replacement+ as the output for every occurrence of +original+.
  #
  # NOTE(review): nothing here resets @tree_initialized, so a replacement
  # added after the first replace_text/initialize_tree call does not trigger
  # another construct_fail. Confirm whether KeywordProspector needs one.
  def add_replacement(original, replacement)
    @dl.add(original, replacement)
  end

  # Calls construct_fail on the underlying prospector the first time it is
  # invoked; later calls do nothing.
  def initialize_tree
    return if @tree_initialized

    @dl.construct_fail
    @tree_initialized = true
  end

  # Returns a copy of +text+ in which every match reported by the prospector
  # is replaced by that match's output.
  #
  # Matching runs against a lower-cased copy of +text+, while unmatched
  # stretches are copied from the original, so they keep their case.
  def replace_text(text)
    initialize_tree

    matches = @dl.process(text.downcase, :filter_overlaps => true)

    # Appending in place avoids allocating a new string for every piece.
    retval = ''.dup
    current_index = 0
    matches.each do |match|
      if current_index < match.start_idx
        retval << text[current_index, match.start_idx - current_index]
      end

      retval << match.output
      current_index = match.end_idx
    end

    # Copy whatever follows the last match.
    retval << text[current_index, text.length] unless current_index == text.length

    retval
  end
end
data/lib/state.rb ADDED
@@ -0,0 +1,85 @@
1
+ #
2
+ # (C) 2008 Los Angeles Times
3
+ #
4
+
5
# One node of the keyword-matching state machine.  A state stores the
# character that leads to it, its depth, and a table of follow-up states
# keyed by character.  The table is a Hash by default; an Array may be passed
# instead, in which case it is given Hash-like +keys+ and +values+ methods.
class State
  # Source of ids; every new State takes the current value and bumps it.
  @@id_sequence = 1

  attr_reader :id, :char
  attr_accessor :fail, :fail_increment, :output, :depth, :keyword

  def initialize(char, depth=0, next_state={})
    if next_state.is_a?(Array)
      # Only occupied (truthy) slots count as entries.
      def next_state.values
        reject { |entry| !entry }
      end

      def next_state.keys
        (0...size).select { |index| self[index] }
      end
    end

    @char = char
    @next_state = next_state
    @id = @@id_sequence
    @@id_sequence += 1
    self.depth = depth
  end

  # Returns the follow-up state for +char+, creating it one level deeper
  # than this state when it does not exist yet.
  def insert_next_state(char)
    existing = @next_state[char]
    return existing if existing

    @next_state[char] = State.new(char, depth + 1)
  end

  # Follow-up state registered for +char+, if any.
  def [](char)
    @next_state[char]
  end

  # Registers +value+ as the follow-up state for +char+.
  def []=(char, value)
    @next_state[char] = value
  end

  # All registered follow-up states.
  def values
    @next_state.values
  end

  # All characters (or indices) that have a follow-up state.
  def keys
    @next_state.keys
  end

  # Returns the state reached by consuming +char+.  When this state has no
  # entry for +char+, the lookup is delegated to the +fail+ state.
  #
  # If +position+ (an object with begin/end accessors) is given, it is
  # updated as a side effect:
  # * transition back to this same state: begin advances by one and end is
  #   set to the new begin;
  # * ordinary transition: end advances by one;
  # * transition via +fail+: begin advances by
  #   depth - (reached state's depth - 1) and end advances by one.
  def transition(char, position=nil)
    target = @next_state[char]

    unless target
      target = fail.transition(char)
      if position
        position.begin += depth - (target.depth - 1)
        position.end += 1
      end
      return target
    end

    if position
      if target == self
        advanced = position.begin + 1
        position.begin = advanced
        position.end = advanced
      else
        position.end += 1
      end
    end

    target
  end
end
data/script/console ADDED
@@ -0,0 +1,10 @@
1
#!/usr/bin/env ruby
# File: script/console
#
# Replaces this process with an irb session that has the keyword_prospector
# library and irb tab completion preloaded.

# Platforms whose name contains mswin or mingw use the irb.bat launcher.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

libs = " -r irb/completion"
# Perhaps use a console_lib to store any extra methods I may want available in the console
# libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
libs << " -r #{File.dirname(__FILE__) + '/../lib/keyword_prospector.rb'}"
puts "Loading keyword_prospector gem"
exec "#{irb} #{libs} --simple-prompt"
data/script/destroy ADDED
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby
# Runs the RubiGen "destroy" script for this project with the command-line
# arguments it was given.
APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))

# Try a plain require first; if that fails, load RubyGems and try again.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/destroy'

# A leading help flag is dropped before the arguments are handed over.
help_flags = ['--help', '-h']
ARGV.shift if help_flags.include?(ARGV[0])

component_sources = [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Base.use_component_sources!(component_sources)
RubiGen::Scripts::Destroy.new.run(ARGV)
data/script/generate ADDED
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby
# Runs the RubiGen "generate" script for this project with the command-line
# arguments it was given.
APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))

# Try a plain require first; if that fails, load RubyGems and try again.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/generate'

# A leading help flag is dropped before the arguments are handed over.
help_flags = ['--help', '-h']
ARGV.shift if help_flags.include?(ARGV[0])

component_sources = [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Base.use_component_sources!(component_sources)
RubiGen::Scripts::Generate.new.run(ARGV)
data/script/txt2html ADDED
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ GEM_NAME = 'keyword_prospector' # what ppl will type to install your gem
4
+ RUBYFORGE_PROJECT = 'keyword_prospector'
5
+
6
+ require 'rubygems'
7
+ begin
8
+ require 'newgem'
9
+ require 'rubyforge'
10
+ rescue LoadError
11
+ puts "\n\nGenerating the website requires the newgem RubyGem"
12
+ puts "Install: gem install newgem\n\n"
13
+ exit(1)
14
+ end
15
+ require 'redcloth'
16
+ require 'syntax/convertors/html'
17
+ require 'erb'
18
+ require File.dirname(__FILE__) + "/../lib/#{GEM_NAME}/version.rb"
19
+
20
+ version = KeywordProspector::VERSION::STRING
21
+ download = "http://rubyforge.org/projects/#{RUBYFORGE_PROJECT}"
22
+
23
# Looks up RUBYFORGE_PROJECT in the "group_ids" table of the RubyForge
# autoconfig and returns the entry found there.
def rubyforge_project_id
  group_ids = RubyForge.new.autoconfig["group_ids"]
  group_ids[RUBYFORGE_PROJECT]
end
26
+
27
# Adds English ordinal suffixes to integers.
#
# Integer is reopened instead of Fixnum: Fixnum was folded into Integer in
# Ruby 2.4 and the constant later removed, while on older Rubies Fixnum
# inherits from Integer, so the method is available either way.
class Integer
  # Returns the suffix that turns this number into an ordinal:
  # 'st', 'nd', 'rd' or 'th' (1 -> 'st', 12 -> 'th', 23 -> 'rd').
  def ordinal
    # teens (11th, 12th, 13th, 111th, ...) always take 'th'
    return 'th' if (10..19).include?(self % 100)

    # others depend on the last digit only
    case self % 10
    when 1 then 'st'
    when 2 then 'nd'
    when 3 then 'rd'
    else 'th'
    end
  end
end
40
+
41
class Time
  # Formats the date as "<day><ordinal suffix> <full month name> <year>".
  # Relies on the +ordinal+ method this script adds to integers.
  def pretty
    day = "#{mday}#{mday.ordinal}"
    month = strftime('%B')

    "#{day} #{month} #{year}"
  end
end
46
+
47
# Converts +source+ to HTML with the Syntax convertor for +syntax+ and strips
# a <pre> tag at the start of a line and a </pre> tag at the end of a line
# from the result.
def convert_syntax(syntax, source)
  convertor = Syntax::Convertors::HTML.for_syntax(syntax)
  highlighted = convertor.convert(source)

  highlighted.gsub(%r!^<pre>|</pre>$!,'')
end
50
+
51
# Command line: source.txt [template.html.erb].  The template defaults to
# website/template.html.erb next to this script's directory.
if ARGV.length >= 1
  src, template = ARGV
  template ||= File.join(File.dirname(__FILE__), '/../website/template.html.erb')
else
  puts("Usage: #{File.split($0).last} source.txt [template.html.erb] > output.html")
  exit!
end

# File.read closes the template file once it has been read.
template = ERB.new(File.read(template))

# NOTE: local variable names in this script are visible to the ERB templates
# through `binding`, so they are kept as they are.
title = nil
body = nil
File.open(src) do |fsrc|
  # First line of the source is the title; the rest is an ERB template for
  # the body.
  title_text = fsrc.readline
  body_text_template = fsrc.read
  body_text = ERB.new(body_text_template).result(binding)

  # Replace each <pre>/<code> element that carries a syntax attribute with a
  # placeholder, keeping its highlighted HTML aside, so that RedCloth does
  # not process the highlighted markup.
  syntax_items = []
  body_text.gsub!(%r!<(pre|code)[^>]*?syntax=['"]([^'"]+)[^>]*>(.*?)</\1>!m){
    ident = syntax_items.length
    element, syntax, source = $1, $2, $3
    syntax_items << "<#{element} class='syntax'>#{convert_syntax(syntax, source)}</#{element}>"
    "syntax-temp-#{ident}"
  }
  title = RedCloth.new(title_text).to_html.gsub(%r!<.*?>!,'').strip
  body = RedCloth.new(body_text).to_html
  # Put the highlighted snippets back in place of their placeholders.
  body.gsub!(%r!(?:<pre><code>)?syntax-temp-(\d+)(?:</code></pre>)?!){ syntax_items[$1.to_i] }
end
stat = File.stat(src)
created = stat.ctime
modified = stat.mtime

$stdout << template.result(binding)