rbtagger 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. data/COPYING +21 -0
  2. data/History.txt +4 -0
  3. data/LICENSE +21 -0
  4. data/License.txt +20 -0
  5. data/Manifest.txt +75 -0
  6. data/PostInstall.txt +7 -0
  7. data/README +7 -0
  8. data/README.txt +53 -0
  9. data/Rakefile +33 -0
  10. data/config/hoe.rb +74 -0
  11. data/config/requirements.rb +15 -0
  12. data/ext/rule_tagger/bool.h +38 -0
  13. data/ext/rule_tagger/darray.c +292 -0
  14. data/ext/rule_tagger/darray.h +125 -0
  15. data/ext/rule_tagger/darrayP.h +50 -0
  16. data/ext/rule_tagger/extconf.rb +14 -0
  17. data/ext/rule_tagger/lex.c +170 -0
  18. data/ext/rule_tagger/lex.h +49 -0
  19. data/ext/rule_tagger/memory.c +127 -0
  20. data/ext/rule_tagger/memory.h +20 -0
  21. data/ext/rule_tagger/rbtagger.c +252 -0
  22. data/ext/rule_tagger/registry.c +326 -0
  23. data/ext/rule_tagger/registry.h +129 -0
  24. data/ext/rule_tagger/registryP.h +46 -0
  25. data/ext/rule_tagger/ruby-compat.h +20 -0
  26. data/ext/rule_tagger/rules.c +525 -0
  27. data/ext/rule_tagger/rules.h +42 -0
  28. data/ext/rule_tagger/sysdep.h +20 -0
  29. data/ext/rule_tagger/tagger.c +110 -0
  30. data/ext/rule_tagger/tagger.h +46 -0
  31. data/ext/rule_tagger/useful.c +44 -0
  32. data/ext/rule_tagger/useful.h +51 -0
  33. data/ext/word_tagger/extconf.rb +7 -0
  34. data/ext/word_tagger/porter_stemmer.c +430 -0
  35. data/ext/word_tagger/porter_stemmer.h +19 -0
  36. data/ext/word_tagger/rtagger.cc +83 -0
  37. data/ext/word_tagger/tagger.cc +153 -0
  38. data/ext/word_tagger/tagger.h +27 -0
  39. data/ext/word_tagger/tagger.rb +8 -0
  40. data/ext/word_tagger/test/Makefile +22 -0
  41. data/ext/word_tagger/test/doc.txt +87 -0
  42. data/ext/word_tagger/test/test.cc +107 -0
  43. data/ext/word_tagger/test.rb +31 -0
  44. data/lib/brill/tagger.rb +225 -0
  45. data/lib/rbtagger/version.rb +9 -0
  46. data/lib/rbtagger.rb +6 -0
  47. data/script/console +10 -0
  48. data/script/destroy +14 -0
  49. data/script/generate +14 -0
  50. data/script/txt2html +82 -0
  51. data/setup.rb +1585 -0
  52. data/tasks/deployment.rake +34 -0
  53. data/tasks/environment.rake +7 -0
  54. data/tasks/website.rake +17 -0
  55. data/test/CONTEXTUALRULEFILE +284 -0
  56. data/test/LEXICALRULEFILE +148 -0
  57. data/test/LEXICON +93696 -0
  58. data/test/docs/doc0.txt +20 -0
  59. data/test/docs/doc1.txt +11 -0
  60. data/test/docs/doc2.txt +52 -0
  61. data/test/docs/doc3.txt +128 -0
  62. data/test/docs/doc4.txt +337 -0
  63. data/test/docs/doc5.txt +497 -0
  64. data/test/docs/doc6.txt +116 -0
  65. data/test/docs/doc7.txt +101 -0
  66. data/test/docs/doc8.txt +25 -0
  67. data/test/docs/doc9.txt +84 -0
  68. data/test/tagger_test.rb +60 -0
  69. data/test/test_helper.rb +2 -0
  70. data/tools/rakehelp.rb +113 -0
  71. data/website/index.html +113 -0
  72. data/website/index.txt +53 -0
  73. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  74. data/website/stylesheets/screen.css +138 -0
  75. data/website/template.html.erb +48 -0
  76. metadata +155 -0
@@ -0,0 +1,225 @@
1
+ require 'rule_tagger'
2
+
3
module Brill
  # Ruby front end for the native Brill part-of-speech tagger
  # (::Tagger::BrillTagger).  It feeds the lexicon and rule files to the
  # native tagger, tokenizes text and returns [token, tag] pairs.
  class Tagger
    # Builds the native tagger and loads its three data files.
    #
    # lexicon::          path of the LEXICON file
    # lexical_rules::    path of the LEXICALRULEFILE
    # contextual_rules:: path of the CONTEXTUALRULEFILE
    def initialize( lexicon, lexical_rules, contextual_rules )
      @tagger = ::Tagger::BrillTagger.new
      Brill::Tagger.load_lexicon(@tagger,lexicon)
      Brill::Tagger.load_lexical_rules(@tagger,lexical_rules)
      Brill::Tagger.load_contextual_rules(@tagger,contextual_rules)
    end

    # Tag a body of text
    # returns an array like [[token,tag],[token,tag]...[token,tag]]
    #
    def tag( text )
      tokens = Brill::Tagger.tokenize( text )
      tags   = Brill::Tagger.tag_start( tokens )

      @tagger.apply_lexical_rules( tokens, tags, [], 0 )
      @tagger.default_tag_finish( tokens, tags )

      # Brill uses these fake "STAART" tags to delimit the start & end of sentence.
      # Two of them are added on each side of both arrays.
      2.times do
        [tokens, tags].each do |list|
          list.unshift "STAART"
          list << "STAART"
        end
      end

      @tagger.apply_contextual_rules( tokens, tags, 1 )

      # Strip the padding again before pairing tokens with their tags.
      2.times do
        [tokens, tags].each do |list|
          list.shift
          list.pop
        end
      end

      tokens.zip(tags)
    end

    # NOTE: the helpers below are deliberately public class methods.  The
    # bare `private` that used to sit here had no effect on `def self.`
    # methods, and #initialize / #tag call them with an explicit receiver.

    # Returns every line of +file+ (line terminators included) as an array.
    def self.lines( file )
      File.readlines(file)
    end

    # load LEXICON
    #
    # Every non-blank line is split on whitespace into a word followed by
    # its tags.  The first tag is registered through add_to_lexicon and each
    # "word tag" combination through add_to_lexicon_tags.  Blank lines are
    # skipped (they used to raise NoMethodError).
    def self.load_lexicon(tagger,lexicon)
      Brill::Tagger.lines(lexicon).each do |line|
        word, *tags = line.split
        next if word.nil?
        tagger.add_to_lexicon(word,tags.first)
        tags.each do |tag|
          tagger.add_to_lexicon_tags("#{word} #{tag}")
        end
      end
      nil
    end

    # load LEXICALRULEFILE
    #
    # Port of the original perl:
    #
    #   chomp;
    #   my @line = split or next;
    #   $self->_add_lexical_rule($_);
    #
    #   if ($line[1] eq 'goodright') {
    #     $self->_add_goodright($line[0]);
    #   } elsif ($line[2] eq 'fgoodright') {
    #     $self->_add_goodright($line[1]);
    #   } elsif ($line[1] eq 'goodleft') {
    #     $self->_add_goodleft($line[0]);
    #   } elsif ($line[2] eq 'fgoodleft') {
    #     $self->_add_goodleft($line[1]);
    #   }
    #
    # Blank lines are skipped.  The previous index-based loop used `next`
    # without advancing its counter and therefore never terminated once it
    # met an empty line.
    def self.load_lexical_rules(tagger,rules)
      self.lines(rules).each do |raw|
        line = raw.chomp
        cols = line.split
        next if cols.empty?
        tagger.add_lexical_rule(line)
        if cols[1] == 'goodright'
          tagger.add_goodright(cols[0])
        elsif cols[2] == 'fgoodright'
          tagger.add_goodright(cols[1])
        elsif cols[1] == 'goodleft'
          tagger.add_goodleft(cols[0])
        elsif cols[2] == 'fgoodleft'
          tagger.add_goodleft(cols[1])
        end
      end
      nil
    end

    # load CONTEXTUALRULEFILE
    #
    # Passes every non-empty line (without its line terminator) to
    # add_contextual_rule.  Iterating with #each fixes the endless loop the
    # old `while`/`next` combination entered on an empty line.
    def self.load_contextual_rules(tagger,rules)
      self.lines(rules).each do |raw|
        line = raw.chomp
        next if line.empty?
        tagger.add_contextual_rule(line)
      end
      nil
    end

    # Initial guess for every token: 'NNP' when it starts with an upper-case
    # ASCII letter, 'NN' otherwise.
    def self.tag_start(tokens)
      tokens.map{|token| token =~ /^[A-Z]/ ? 'NNP' : 'NN' }
    end

    # this tokenize code is a port from perl
    #
    # Splits +text+ into an array of tokens: punctuation, quotes, brackets
    # and common English contractions become tokens of their own.  The
    # caller's string is never modified.
    def self.tokenize(text)
      # Normalize all whitespace (gsub also copies the string, so the
      # in-place edits below only ever touch our private copy)
      text = text.gsub(/\s+/,' ')

      # translate some common extended ascii characters to quotes
      # Pure-ASCII text cannot contain bytes 145-148, so the step is skipped
      # there (String#ascii_only? does not exist on Ruby 1.8, hence the
      # respond_to? check).
      # NOTE(review): on Ruby >= 1.9 `145.chr` is a binary string, so these
      # patterns presumably cannot be matched against non-ASCII UTF-8 text
      # (Encoding::CompatibilityError) -- confirm against the supported Rubies.
      unless text.respond_to?(:ascii_only?) && text.ascii_only?
        text.gsub!(/#{145.chr}/,'`')
        text.gsub!(/#{146.chr}/,"'")
        text.gsub!(/#{147.chr}/,"``")
        text.gsub!(/#{148.chr}/,"''")
      end

      # Attempt to get correct directional quotes
      # s{\"\b} { `` }g;
      text.gsub!(/\"\b/,' `` ')
      # s{\b\"} { '' }g;
      text.gsub!(/\b\"/," '' ")
      #s{\"(?=\s)} { '' }g;
      text.gsub!(/\"(?=\s)/," '' ")
      #s{\"} { `` }g;
      # Any quote that is still left is treated as an opening quote.  (The
      # port repeated the look-ahead pattern of the previous rule here, so
      # this rule could never match.)
      text.gsub!(/\"/," `` ")

      # Isolate ellipses
      # s{\.\.\.} { ... }g;
      text.gsub!(/\.\.\./,' ... ')

      # Isolate any embedded punctuation chars
      # s{([,;:\@\#\$\%&])} { $1 }g;
      text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')

      # Assume sentence tokenization has been done first, so split FINAL
      # periods only.
      # s/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /$1 .$2 /gx;
      text.gsub!(/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /x, '\1 .\2 ')

      # however, we may as well split ALL question marks and exclamation points,
      # since they shouldn't have the abbrev.-marker ambiguity problem
      #s{([?!])} { $1 }g;
      text.gsub!(/([?!])/, ' \1 ')

      # parentheses, brackets, etc.
      #s{([\]\[\(\)\{\}\<\>])} { $1 }g;
      text.gsub!(/([\]\[\(\)\{\}\<\>])/,' \1 ')

      #s/(-{2,})/ $1 /g;
      text.gsub!(/(-{2,})/,' \1 ')

      # Add a space to the beginning and end of each line, to reduce
      # necessary number of regexps below.
      #s/$/ /;
      text.gsub!(/$/," ")
      #s/^/ /;
      text.gsub!(/^/," ")

      # possessive or close-single-quote
      #s/\([^\']\)\' /$1 \' /g;
      # The perl line kept sed-style \( \) grouping; in a Ruby regexp those
      # are literal parentheses, so a real capture group is used instead.
      text.gsub!(/([^\'])\' /,%q(\1 ' ))

      # as in it's, I'm, we'd
      #s/\'([smd]) / \'$1 /ig;
      text.gsub!(/\'([smd]) /i,%q( '\1 ))

      #s/\'(ll|re|ve) / \'$1 /ig;
      text.gsub!(/\'(ll|re|ve) /i,%q( '\1 ))
      #s/n\'t / n\'t /ig;
      text.gsub!(/n\'t /i," n't ")

      # Two-part colloquial forms, each split between its two groups:
      #s/ (can)(not) / $1 $2 /ig;   and likewise for d'ye, gimme, gonna,
      # gotta, lemme, more'n, 'tis / 'twas, wanna
      [
        / (can)(not) /i,
        / (d\')(ye) /i,
        / (gim)(me) /i,
        / (gon)(na) /i,
        / (got)(ta) /i,
        / (lem)(me) /i,
        / (more)(\'n) /i,
        / (\'t)(is|was) /i,
        / (wan)(na) /i
      ].each do |pattern|
        text.gsub!(pattern,' \1 \2 ')
      end

      # Split on runs of whitespace.  split(/\s/) produced an empty token for
      # every extra blank (always one at the front, because of the padding
      # added above), and those empty strings were then tagged like words.
      text.split
    end

  end
end
@@ -0,0 +1,9 @@
1
module RbTagger #:nodoc:
  # Version of the gem, split into its numeric components.
  module VERSION #:nodoc:
    MAJOR = 0
    MINOR = 0
    TINY  = 1

    # Dotted "MAJOR.MINOR.TINY" form; frozen so that code holding a reference
    # cannot alter the published version string in place.
    STRING = [MAJOR, MINOR, TINY].join('.').freeze
  end
end
data/lib/rbtagger.rb ADDED
@@ -0,0 +1,6 @@
1
# Put this file's directory on the load path unless it is already there
# (either as written or in expanded form).  The expanded, absolute path is
# what gets added, so the entry keeps resolving after a later Dir.chdir.
lib_dir = File.expand_path(File.dirname(__FILE__))
$:.unshift(lib_dir) unless
  $:.include?(File.dirname(__FILE__)) || $:.include?(lib_dir)

# Top-level namespace of the gem.
module RbTagger

end
data/script/console ADDED
@@ -0,0 +1,10 @@
1
#!/usr/bin/env ruby
# File: script/console
#
# Replaces the current process with an irb session that has completion and
# lib/rbtagger.rb preloaded.

# Platforms whose name contains mswin or mingw start irb through irb.bat.
# (The group used to read `(:?` -- a typo for the non-capturing `(?:`.)
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

# Perhaps use a console_lib to store any extra methods I may want available in the console
# libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
libs = [
  " -r irb/completion",
  " -r #{File.dirname(__FILE__) + '/../lib/rbtagger.rb'}"
].join

puts "Loading rb-brill-tagger gem"
exec "#{irb} #{libs} --simple-prompt"
data/script/destroy ADDED
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby
# Runs RubiGen's destroy script with the command-line arguments.

# Directory one level above this script.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Try a plain require first; fall back to loading rubygems when that fails.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/destroy'

# A leading help switch is dropped before the arguments are handed on.
ARGV.shift if %w[--help -h].include?(ARGV.first)

component_sources = [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Base.use_component_sources! component_sources
RubiGen::Scripts::Destroy.new.run(ARGV)
data/script/generate ADDED
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby
# Runs RubiGen's generate script with the command-line arguments.

# Directory one level above this script.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Try a plain require first; fall back to loading rubygems when that fails.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/generate'

# A leading help switch is dropped before the arguments are handed on.
ARGV.shift if %w[--help -h].include?(ARGV.first)

component_sources = [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Base.use_component_sources! component_sources
RubiGen::Scripts::Generate.new.run(ARGV)
data/script/txt2html ADDED
@@ -0,0 +1,82 @@
1
#!/usr/bin/env ruby

GEM_NAME = 'rbtagger' # what ppl will type to install your gem
RUBYFORGE_PROJECT = 'ruletagger'

require 'rubygems'

# newgem and rubyforge are required for website generation; explain how to
# get them and stop with a failure status when they cannot be loaded.
begin
  require 'newgem'
  require 'rubyforge'
rescue LoadError
  puts "\n\nGenerating the website requires the newgem RubyGem",
       "Install: gem install newgem\n\n"
  exit 1
end

%w[redcloth syntax/convertors/html erb].each { |library| require library }
require "#{File.dirname(__FILE__)}/../lib/#{GEM_NAME}/version.rb"

# Script-level locals; they stay in scope for the `binding` handed to the
# ERB templates further down in this script.
version  = RbTagger::VERSION::STRING
download = "http://rubyforge.org/projects/#{RUBYFORGE_PROJECT}"
22
+
23
# Looks up RUBYFORGE_PROJECT in the "group_ids" section of the RubyForge
# autoconfig and returns the entry found there.
def rubyforge_project_id
  group_ids = RubyForge.new.autoconfig["group_ids"]
  group_ids[RUBYFORGE_PROJECT]
end
26
+
27
# Integer is reopened instead of Fixnum: Fixnum inherits from Integer on old
# Rubies and no longer exists on current ones, so this definition is
# reachable from every integer on all versions.
class Integer
  # Returns the English ordinal suffix for this number:
  # 'st', 'nd', 'rd' or 'th' (1 -> 'st', 12 -> 'th', 23 -> 'rd').
  def ordinal
    # teens
    return 'th' if (10..19).include?(self % 100)
    # others
    # (`when N then` replaces the `when N:` form, which Ruby 1.9+ rejects)
    case self % 10
    when 1 then 'st'
    when 2 then 'nd'
    when 3 then 'rd'
    else 'th'
    end
  end
end
40
+
41
class Time
  # Renders the date as day of month with its ordinal suffix, the full month
  # name and the year, separated by single spaces.
  def pretty
    day = "#{mday}#{mday.ordinal}"
    [day, strftime('%B'), year].join(' ')
  end
end
46
+
47
# Converts +source+ to HTML with the Syntax convertor registered for
# +syntax+, then removes a <pre> at the start of a line and a </pre> at the
# end of a line from the result.
def convert_syntax(syntax, source)
  convertor = Syntax::Convertors::HTML.for_syntax(syntax)
  markup = convertor.convert(source)
  markup.gsub(%r!^<pre>|</pre>$!,'')
end
50
+
51
# Command line: source.txt [template.html.erb]; the rendered page goes to
# standard output.  The template defaults to website/template.html.erb
# relative to this script.
if ARGV.length >= 1
  src, template = ARGV
  template ||= File.join(File.dirname(__FILE__), '/../website/template.html.erb')
else
  puts("Usage: #{File.split($0).last} source.txt [template.html.erb] > output.html")
  exit!
end

# File.read closes the file again; File.open(...).read left the handle open.
template = ERB.new(File.read(template))

# NOTE: local variable names in this script must not be renamed casually --
# both ERB passes below receive `binding`, so templates can refer to them.
title = nil
body = nil
File.open(src) do |fsrc|
  # The first line of the source is the title, the remainder the body.
  title_text = fsrc.readline
  body_text_template = fsrc.read
  body_text = ERB.new(body_text_template).result(binding)

  # Replace every <pre>/<code> element carrying a syntax="..." attribute by a
  # "syntax-temp-N" placeholder and keep its highlighted HTML aside.
  syntax_items = []
  body_text.gsub!(%r!<(pre|code)[^>]*?syntax=['"]([^'"]+)[^>]*>(.*?)</\1>!m){
    ident = syntax_items.length
    element, syntax, source = $1, $2, $3
    syntax_items << "<#{element} class='syntax'>#{convert_syntax(syntax, source)}</#{element}>"
    "syntax-temp-#{ident}"
  }

  # The title is run through RedCloth and then stripped of all tags.
  title = RedCloth.new(title_text).to_html.gsub(%r!<.*?>!,'').strip
  body = RedCloth.new(body_text).to_html

  # Put the highlighted snippets back in place of their placeholders
  # (including any <pre><code> wrapper found around a placeholder).
  body.gsub!(%r!(?:<pre><code>)?syntax-temp-(\d+)(?:</code></pre>)?!){ syntax_items[$1.to_i] }
end
stat = File.stat(src)
created = stat.ctime
modified = stat.mtime

$stdout << template.result(binding)