ruletagger 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/COPYING +21 -0
  2. data/History.txt +4 -0
  3. data/LICENSE +21 -0
  4. data/License.txt +20 -0
  5. data/Manifest.txt +75 -0
  6. data/PostInstall.txt +7 -0
  7. data/README +7 -0
  8. data/README.txt +53 -0
  9. data/Rakefile +33 -0
  10. data/config/hoe.rb +75 -0
  11. data/config/requirements.rb +15 -0
  12. data/ext/rule_tagger/bool.h +38 -0
  13. data/ext/rule_tagger/darray.c +292 -0
  14. data/ext/rule_tagger/darray.h +125 -0
  15. data/ext/rule_tagger/darrayP.h +50 -0
  16. data/ext/rule_tagger/extconf.rb +14 -0
  17. data/ext/rule_tagger/lex.c +170 -0
  18. data/ext/rule_tagger/lex.h +49 -0
  19. data/ext/rule_tagger/memory.c +127 -0
  20. data/ext/rule_tagger/memory.h +20 -0
  21. data/ext/rule_tagger/rbtagger.c +252 -0
  22. data/ext/rule_tagger/registry.c +326 -0
  23. data/ext/rule_tagger/registry.h +129 -0
  24. data/ext/rule_tagger/registryP.h +46 -0
  25. data/ext/rule_tagger/ruby-compat.h +20 -0
  26. data/ext/rule_tagger/rules.c +525 -0
  27. data/ext/rule_tagger/rules.h +42 -0
  28. data/ext/rule_tagger/sysdep.h +20 -0
  29. data/ext/rule_tagger/tagger.c +110 -0
  30. data/ext/rule_tagger/tagger.h +46 -0
  31. data/ext/rule_tagger/useful.c +44 -0
  32. data/ext/rule_tagger/useful.h +51 -0
  33. data/ext/word_tagger/extconf.rb +7 -0
  34. data/ext/word_tagger/porter_stemmer.c +430 -0
  35. data/ext/word_tagger/porter_stemmer.h +19 -0
  36. data/ext/word_tagger/rtagger.cc +83 -0
  37. data/ext/word_tagger/tagger.cc +153 -0
  38. data/ext/word_tagger/tagger.h +27 -0
  39. data/ext/word_tagger/tagger.rb +8 -0
  40. data/ext/word_tagger/test/Makefile +22 -0
  41. data/ext/word_tagger/test/doc.txt +87 -0
  42. data/ext/word_tagger/test/test.cc +107 -0
  43. data/ext/word_tagger/test.rb +31 -0
  44. data/lib/brill/tagger.rb +225 -0
  45. data/lib/rbtagger/version.rb +9 -0
  46. data/lib/rbtagger.rb +6 -0
  47. data/script/console +10 -0
  48. data/script/destroy +14 -0
  49. data/script/generate +14 -0
  50. data/script/txt2html +82 -0
  51. data/setup.rb +1585 -0
  52. data/tasks/deployment.rake +34 -0
  53. data/tasks/environment.rake +7 -0
  54. data/tasks/website.rake +17 -0
  55. data/test/CONTEXTUALRULEFILE +284 -0
  56. data/test/LEXICALRULEFILE +148 -0
  57. data/test/LEXICON +93696 -0
  58. data/test/docs/doc0.txt +20 -0
  59. data/test/docs/doc1.txt +11 -0
  60. data/test/docs/doc2.txt +52 -0
  61. data/test/docs/doc3.txt +128 -0
  62. data/test/docs/doc4.txt +337 -0
  63. data/test/docs/doc5.txt +497 -0
  64. data/test/docs/doc6.txt +116 -0
  65. data/test/docs/doc7.txt +101 -0
  66. data/test/docs/doc8.txt +25 -0
  67. data/test/docs/doc9.txt +84 -0
  68. data/test/tagger_test.rb +60 -0
  69. data/test/test_helper.rb +2 -0
  70. data/tools/rakehelp.rb +113 -0
  71. data/website/index.html +113 -0
  72. data/website/index.txt +53 -0
  73. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  74. data/website/stylesheets/screen.css +138 -0
  75. data/website/template.html.erb +48 -0
  76. metadata +155 -0
@@ -0,0 +1,225 @@
1
+ require 'rule_tagger'
2
+
3
module Brill
  # Ruby front end for the Brill rule-based tagger. The tagging engine itself
  # is ::Tagger::BrillTagger; this class feeds it the lexicon and rule files,
  # tokenizes the input text and pairs each token with its final tag.
  class Tagger
    # Windows-1252 "smart quote" byte values and the ASCII sequences that
    # replace them during tokenization.
    CP1252_QUOTES = [[145, '`'], [146, "'"], [147, '``'], [148, "''"]]

    # Creates the underlying tagger and loads its three data files.
    #
    # lexicon::          path to the LEXICON file
    # lexical_rules::    path to the LEXICALRULEFILE
    # contextual_rules:: path to the CONTEXTUALRULEFILE
    def initialize( lexicon, lexical_rules, contextual_rules )
      @tagger = ::Tagger::BrillTagger.new
      Brill::Tagger.load_lexicon(@tagger,lexicon)
      Brill::Tagger.load_lexical_rules(@tagger,lexical_rules)
      Brill::Tagger.load_contextual_rules(@tagger,contextual_rules)
    end

    # Tag a body of text
    # returns an array like [[token,tag],[token,tag]...[token,tag]]
    #
    def tag( text )
      tokens = Brill::Tagger.tokenize( text )
      tags   = Brill::Tagger.tag_start( tokens )

      # NOTE(review): the meaning of the trailing `[]` and `0` arguments is
      # defined by the extension and is not visible from this file.
      @tagger.apply_lexical_rules( tokens, tags, [], 0 )
      @tagger.default_tag_finish( tokens, tags )

      # Brill uses these fake "STAART" tags to delimit the start & end of sentence.
      # Two are added to each end of both lists; fresh string objects are
      # used for every slot, as before.
      [tokens, tags].each do |list|
        2.times do
          list.unshift "STAART"
          list.push "STAART"
        end
      end

      @tagger.apply_contextual_rules( tokens, tags, 1 )

      # Strip the padding again so only the real tokens are returned.
      [tokens, tags].each do |list|
        2.times do
          list.shift
          list.pop
        end
      end

      tokens.zip( tags )
    end

    # --- class-level helpers ------------------------------------------------
    # These are deliberately public: a bare `private` does not apply to
    # `def self.` methods, and #initialize / #tag call them with an explicit
    # receiver (Brill::Tagger.xxx), which would fail if they were private.

    # Returns every line of +file+ (line terminators included) as an array.
    def self.lines( file )
      File.readlines( file )
    end

    # load LEXICON
    #
    # Each line is "word TAG [TAG ...]". The first tag is registered through
    # add_to_lexicon and every "word TAG" combination through
    # add_to_lexicon_tags. Blank lines are skipped instead of raising, and
    # fields may be separated by any run of whitespace.
    def self.load_lexicon(tagger,lexicon)
      lines(lexicon).each do |line|
        word, *tags = line.split
        next if word.nil? # blank or whitespace-only line

        tagger.add_to_lexicon(word,tags.first)
        tags.each do |tag|
          tagger.add_to_lexicon_tags("#{word} #{tag}")
        end
      end
    end

    # load LEXICALRULEFILE
    #
    # Ported from this Perl:
    #   chomp;
    #   my @line = split or next;
    #   $self->_add_lexical_rule($_);
    #
    #   if ($line[1] eq 'goodright') {
    #     $self->_add_goodright($line[0]);
    #   } elsif ($line[2] eq 'fgoodright') {
    #     $self->_add_goodright($line[1]);
    #   } elsif ($line[1] eq 'goodleft') {
    #     $self->_add_goodleft($line[0]);
    #   } elsif ($line[2] eq 'fgoodleft') {
    #     $self->_add_goodleft($line[1]);
    #   }
    #
    # Iterating with #each (rather than a hand-maintained index) guarantees
    # that skipping an empty line still advances to the next one; the previous
    # index loop never terminated when the file contained a blank line.
    def self.load_lexical_rules(tagger,rules)
      lines(rules).each do |raw|
        line = raw.chomp
        cols = line.split
        next if cols.empty? # mirrors the Perl `split or next`

        tagger.add_lexical_rule(line)
        if cols[1] == 'goodright'
          tagger.add_goodright(cols[0])
        elsif cols[2] == 'fgoodright'
          tagger.add_goodright(cols[1])
        elsif cols[1] == 'goodleft'
          tagger.add_goodleft(cols[0])
        elsif cols[2] == 'fgoodleft'
          tagger.add_goodleft(cols[1])
        end
      end
    end

    # load CONTEXTUALRULEFILE
    #
    # Every non-empty line is handed to add_contextual_rule with its line
    # terminator removed. Empty lines are skipped (and no longer hang the
    # loop).
    def self.load_contextual_rules(tagger,rules)
      lines(rules).each do |raw|
        line = raw.chomp
        next if line.empty?

        tagger.add_contextual_rule(line)
      end
    end

    # Initial guess for every token: 'NNP' when it starts with an ASCII
    # capital letter, 'NN' otherwise.
    def self.tag_start(tokens)
      tokens.map { |token| token =~ /^[A-Z]/ ? 'NNP' : 'NN' }
    end

    # this tokenize code is a port from perl
    #
    # Splits +text+ into an array of token strings. The caller's string is
    # never modified. Empty tokens are not returned.
    def self.tokenize(text)
      # Normalize all whitespace (this also gives us a private copy to edit)
      text = text.gsub(/\s+/,' ')

      # translate some common extended ascii characters to quotes
      normalize_smart_quotes(text)

      # Attempt to get correct directional quotes
      # s{\"\b} { `` }g;
      text.gsub!(/\"\b/,' `` ')
      # s{\b\"} { '' }g;
      text.gsub!(/\b\"/," '' ")
      #s{\"(?=\s)} { '' }g;
      text.gsub!(/\"(?=\s)/," '' ")
      #s{\"} { `` }g;
      # Catch-all for every double quote the three rules above left behind.
      text.gsub!(/\"/," `` ")

      # Isolate ellipses
      # s{\.\.\.} { ... }g;
      text.gsub!(/\.\.\./,' ... ')

      # Isolate any embedded punctuation chars
      # s{([,;:\@\#\$\%&])} { $1 }g;
      text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')

      # Assume sentence tokenization has been done first, so split FINAL
      # periods only.
      # s/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /$1 .$2 /gx;
      text.gsub!(/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /x, '\1 .\2 ')

      # however, we may as well split ALL question marks and exclamation points,
      # since they shouldn't have the abbrev.-marker ambiguity problem
      #s{([?!])} { $1 }g;
      text.gsub!(/([?!])/, ' \1 ')

      # parentheses, brackets, etc.
      #s{([\]\[\(\)\{\}\<\>])} { $1 }g;
      text.gsub!(/([\]\[\(\)\{\}\<\>])/,' \1 ')

      #s/(-{2,})/ $1 /g;
      text.gsub!(/(-{2,})/,' \1 ')

      # Add a space to the beginning and end of each line, to reduce
      # necessary number of regexps below. (The text holds no newlines after
      # whitespace normalization, so it is a single line.)
      text = " #{text} "

      # possessive or close-single-quote
      # The reference pattern `\([^\']\)\' ` is sed notation, where \( \) are
      # capture groups; in a Ruby regexp that is written with bare parentheses.
      text.gsub!(/([^'])' /,%q(\1 ' ))

      # as in it's, I'm, we'd
      #s/\'([smd]) / \'$1 /ig;
      text.gsub!(/\'([smd]) /i,%q( '\1 ))

      #s/\'(ll|re|ve) / \'$1 /ig;
      text.gsub!(/\'(ll|re|ve) /i,%q( '\1 ))
      #s/n\'t / n\'t /ig;
      text.gsub!(/n\'t /i," n't ")

      #s/ (can)(not) / $1 $2 /ig;
      text.gsub!(/ (can)(not) /i,' \1 \2 ')
      #s/ (d\')(ye) / $1 $2 /ig;
      text.gsub!(/ (d\')(ye) /i,' \1 \2 ')
      #s/ (gim)(me) / $1 $2 /ig;
      text.gsub!(/ (gim)(me) /i,' \1 \2 ')
      #s/ (gon)(na) / $1 $2 /ig;
      text.gsub!(/ (gon)(na) /i,' \1 \2 ')
      #s/ (got)(ta) / $1 $2 /ig;
      text.gsub!(/ (got)(ta) /i,' \1 \2 ')
      #s/ (lem)(me) / $1 $2 /ig;
      text.gsub!(/ (lem)(me) /i,' \1 \2 ')
      #s/ (more)(\'n) / $1 $2 /ig;
      text.gsub!(/ (more)(\'n) /i,' \1 \2 ')
      #s/ (\'t)(is|was) / $1 $2 /ig;
      text.gsub!(/ (\'t)(is|was) /i,' \1 \2 ')
      #s/ (wan)(na) / $1 $2 /ig;
      text.gsub!(/ (wan)(na) /i,' \1 \2 ')

      # Split on runs of whitespace; a single-space pattern also drops the
      # empty strings that the padding and doubled spaces would produce.
      text.split(' ')
    end

    # Replaces the Windows-1252 smart-quote bytes listed in CP1252_QUOTES
    # with their ASCII stand-ins, modifying +text+ in place and returning it.
    #
    # On Rubies with string encodings the replacement is only attempted for
    # binary / ISO-8859-1 / Windows-1252 strings. Building a byte regexp and
    # matching it against UTF-8 text that contains any non-ASCII character
    # raises Encoding::CompatibilityError, and in UTF-8 those byte values are
    # never stand-alone characters anyway.
    def self.normalize_smart_quotes(text)
      unless text.respond_to?(:encoding)
        # Ruby 1.8: strings are plain byte sequences.
        CP1252_QUOTES.each { |byte, ascii| text.gsub!(byte.chr, ascii) }
        return text
      end

      single_byte = [Encoding::BINARY, Encoding::ISO_8859_1, Encoding::Windows_1252]
      return text unless single_byte.include?(text.encoding)

      CP1252_QUOTES.each do |byte, ascii|
        text.gsub!(byte.chr(text.encoding), ascii)
      end
      text
    end
    private_class_method :normalize_smart_quotes

  end
end
@@ -0,0 +1,9 @@
1
module RbTagger #:nodoc:
  # Version components of the gem and their dotted string form.
  module VERSION #:nodoc:
    MAJOR, MINOR, TINY = 0, 0, 1

    # "MAJOR.MINOR.TINY"
    STRING = "#{MAJOR}.#{MINOR}.#{TINY}"
  end
end
data/lib/rbtagger.rb ADDED
@@ -0,0 +1,6 @@
1
# Put this file's directory at the front of the load path, unless it is
# already listed there (either as given or in expanded form).
unless [File.dirname(__FILE__), File.expand_path(File.dirname(__FILE__))].any? { |dir| $LOAD_PATH.include?(dir) }
  $LOAD_PATH.unshift(File.dirname(__FILE__))
end

# Top-level namespace of the gem (intentionally empty here).
module RbTagger

end
data/script/console ADDED
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env ruby
2
+ # File: script/console
3
# Choose the irb launcher: 'irb.bat' when RUBY_PLATFORM names an mswin or
# mingw build, plain 'irb' otherwise. (?:...) is a non-capturing group.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

# Command-line switches handed to irb: tab completion plus this gem's entry file.
libs =  " -r irb/completion"
# Perhaps use a console_lib to store any extra methods I may want available in the console
# libs <<  " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
libs <<  " -r #{File.dirname(__FILE__) + '/../lib/rbtagger.rb'}"
puts "Loading rb-brill-tagger gem"
# exec replaces this process with irb, so nothing after this line runs.
exec "#{irb} #{libs} --simple-prompt"
data/script/destroy ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
# Absolute path of the project root (the parent of this script's directory).
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Load rubigen directly if possible; otherwise pull in RubyGems first and retry.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/destroy'

# Drop a leading --help / -h argument before handing ARGV to the script runner.
ARGV.shift if %w[--help -h].include?(ARGV.first)

component_sources = [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Base.use_component_sources! component_sources
RubiGen::Scripts::Destroy.new.run(ARGV)
data/script/generate ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
# Absolute path of the project root (the parent of this script's directory).
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Load rubigen directly if possible; otherwise pull in RubyGems first and retry.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/generate'

# Drop a leading --help / -h argument before handing ARGV to the script runner.
ARGV.shift if %w[--help -h].include?(ARGV.first)

component_sources = [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Base.use_component_sources! component_sources
RubiGen::Scripts::Generate.new.run(ARGV)
data/script/txt2html ADDED
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env ruby
2
+
3
GEM_NAME = 'rbtagger' # what ppl will type to install your gem
RUBYFORGE_PROJECT = 'ruletagger'

require 'rubygems'

# newgem and rubyforge must both load; if either is missing, print install
# instructions and stop with exit status 1.
begin
  %w[newgem rubyforge].each { |lib| require lib }
rescue LoadError
  puts "\n\nGenerating the website requires the newgem RubyGem"
  puts "Install: gem install newgem\n\n"
  exit(1)
end

%w[redcloth syntax/convertors/html erb].each { |lib| require lib }
require File.dirname(__FILE__) + "/../lib/#{GEM_NAME}/version.rb"

# Top-level locals; they stay in scope for the ERB `binding` calls further down.
version  = RbTagger::VERSION::STRING
download = "http://rubyforge.org/projects/#{RUBYFORGE_PROJECT}"
22
+
23
# Looks up RUBYFORGE_PROJECT in the "group_ids" table of the RubyForge
# autoconfig and returns the entry found there.
def rubyforge_project_id
  group_ids = RubyForge.new.autoconfig["group_ids"]
  group_ids[RUBYFORGE_PROJECT]
end
26
+
27
# Adds an English ordinal-suffix helper to integers.
#
# Integer is reopened rather than Fixnum: Fixnum was a subclass of Integer on
# old Rubies (so they still get the method) and no longer exists on current
# ones.
class Integer
  # Returns the ordinal suffix for this number: 'st', 'nd', 'rd' or 'th'.
  def ordinal
    # teens
    return 'th' if (10..19).include?(self % 100)

    # others
    case self % 10
    when 1 then 'st'
    when 2 then 'nd'
    when 3 then 'rd'
    else 'th'
    end
  end
end
40
+
41
class Time
  # Formats the date as "<day><ordinal suffix> <full month name> <year>".
  # Relies on an #ordinal method being available on the day-of-month integer.
  def pretty
    day = mday
    ["#{day}#{day.ordinal}", strftime('%B'), year].join(' ')
  end
end
46
+
47
# Runs +source+ through the HTML convertor registered for +syntax+ and strips
# a <pre> at the start of a line and a </pre> at the end of a line from the
# result.
def convert_syntax(syntax, source)
  highlighted = Syntax::Convertors::HTML.for_syntax(syntax).convert(source)
  highlighted.gsub(%r!^<pre>|</pre>$!,'')
end
50
+
51
# Usage: txt2html source.txt [template.html.erb] > output.html
# The template defaults to website/template.html.erb next to this script's
# directory.
if ARGV.length >= 1
  src, template = ARGV
  template ||= File.join(File.dirname(__FILE__), '/../website/template.html.erb')
else
  puts("Usage: #{File.split($0).last} source.txt [template.html.erb] > output.html")
  exit!
end

# File.read closes the template file as soon as its contents are loaded.
template = ERB.new(File.read(template))

# Declared out here so the values assigned inside the block below are still
# visible when the outer template is rendered.
title = nil
body = nil
File.open(src) do |fsrc|
  # First line of the source is the page title; the remainder is the body,
  # which is itself run through ERB before Textile conversion.
  title_text = fsrc.readline
  body_text_template = fsrc.read
  body_text = ERB.new(body_text_template).result(binding)
  syntax_items = []
  # Replace every <pre>/<code> element carrying a syntax="..." attribute with
  # a numbered placeholder, keeping the highlighted HTML aside so RedCloth
  # does not touch it.
  body_text.gsub!(%r!<(pre|code)[^>]*?syntax=['"]([^'"]+)[^>]*>(.*?)</\1>!m){
    ident = syntax_items.length
    element, syntax, source = $1, $2, $3
    syntax_items << "<#{element} class='syntax'>#{convert_syntax(syntax, source)}</#{element}>"
    "syntax-temp-#{ident}"
  }
  title = RedCloth.new(title_text).to_html.gsub(%r!<.*?>!,'').strip
  body = RedCloth.new(body_text).to_html
  # Swap the placeholders (and any <pre><code> wrapper around them) back for
  # the highlighted snippets.
  body.gsub!(%r!(?:<pre><code>)?syntax-temp-(\d+)(?:</code></pre>)?!){ syntax_items[$1.to_i] }
end
stat = File.stat(src)
created = stat.ctime
modified = stat.mtime

# Render the outer template with the top-level locals in scope and write the
# page to standard output.
$stdout << template.result(binding)