rbtagger 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +21 -0
- data/History.txt +4 -0
- data/LICENSE +21 -0
- data/License.txt +20 -0
- data/Manifest.txt +75 -0
- data/PostInstall.txt +7 -0
- data/README +7 -0
- data/README.txt +53 -0
- data/Rakefile +33 -0
- data/config/hoe.rb +74 -0
- data/config/requirements.rb +15 -0
- data/ext/rule_tagger/bool.h +38 -0
- data/ext/rule_tagger/darray.c +292 -0
- data/ext/rule_tagger/darray.h +125 -0
- data/ext/rule_tagger/darrayP.h +50 -0
- data/ext/rule_tagger/extconf.rb +14 -0
- data/ext/rule_tagger/lex.c +170 -0
- data/ext/rule_tagger/lex.h +49 -0
- data/ext/rule_tagger/memory.c +127 -0
- data/ext/rule_tagger/memory.h +20 -0
- data/ext/rule_tagger/rbtagger.c +252 -0
- data/ext/rule_tagger/registry.c +326 -0
- data/ext/rule_tagger/registry.h +129 -0
- data/ext/rule_tagger/registryP.h +46 -0
- data/ext/rule_tagger/ruby-compat.h +20 -0
- data/ext/rule_tagger/rules.c +525 -0
- data/ext/rule_tagger/rules.h +42 -0
- data/ext/rule_tagger/sysdep.h +20 -0
- data/ext/rule_tagger/tagger.c +110 -0
- data/ext/rule_tagger/tagger.h +46 -0
- data/ext/rule_tagger/useful.c +44 -0
- data/ext/rule_tagger/useful.h +51 -0
- data/ext/word_tagger/extconf.rb +7 -0
- data/ext/word_tagger/porter_stemmer.c +430 -0
- data/ext/word_tagger/porter_stemmer.h +19 -0
- data/ext/word_tagger/rtagger.cc +83 -0
- data/ext/word_tagger/tagger.cc +153 -0
- data/ext/word_tagger/tagger.h +27 -0
- data/ext/word_tagger/tagger.rb +8 -0
- data/ext/word_tagger/test/Makefile +22 -0
- data/ext/word_tagger/test/doc.txt +87 -0
- data/ext/word_tagger/test/test.cc +107 -0
- data/ext/word_tagger/test.rb +31 -0
- data/lib/brill/tagger.rb +225 -0
- data/lib/rbtagger/version.rb +9 -0
- data/lib/rbtagger.rb +6 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/script/txt2html +82 -0
- data/setup.rb +1585 -0
- data/tasks/deployment.rake +34 -0
- data/tasks/environment.rake +7 -0
- data/tasks/website.rake +17 -0
- data/test/CONTEXTUALRULEFILE +284 -0
- data/test/LEXICALRULEFILE +148 -0
- data/test/LEXICON +93696 -0
- data/test/docs/doc0.txt +20 -0
- data/test/docs/doc1.txt +11 -0
- data/test/docs/doc2.txt +52 -0
- data/test/docs/doc3.txt +128 -0
- data/test/docs/doc4.txt +337 -0
- data/test/docs/doc5.txt +497 -0
- data/test/docs/doc6.txt +116 -0
- data/test/docs/doc7.txt +101 -0
- data/test/docs/doc8.txt +25 -0
- data/test/docs/doc9.txt +84 -0
- data/test/tagger_test.rb +60 -0
- data/test/test_helper.rb +2 -0
- data/tools/rakehelp.rb +113 -0
- data/website/index.html +113 -0
- data/website/index.txt +53 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +138 -0
- data/website/template.html.erb +48 -0
- metadata +155 -0
data/lib/brill/tagger.rb
ADDED
@@ -0,0 +1,225 @@
|
|
1
|
+
require 'rule_tagger'
|
2
|
+
|
3
|
+
module Brill
  # Ruby front end to the Brill rule-based part-of-speech tagger.
  #
  # An instance owns a native ::Tagger::BrillTagger (expected to be supplied
  # by the 'rule_tagger' extension this file requires) and feeds it a lexicon
  # plus lexical and contextual rule files. The class-level helpers
  # (tokenize, tag_start, lines and the load_* methods) hold no instance
  # state. They were listed under a bare `private` before, which has no
  # effect on `def self.` methods, so they have always been publicly callable
  # and remain so.
  class Tagger
    # Padding token/tag placed twice on each side of the input before the
    # contextual rules are applied.
    BOUNDARY = 'STAART'.freeze

    # Windows-1252 "smart quote" byte values and their ASCII stand-ins.
    CP1252_QUOTES = { 145 => '`', 146 => "'", 147 => '``', 148 => "''" }.freeze

    # Unicode code points of the same four quote characters.
    UNICODE_QUOTES = {
      0x2018 => '`', 0x2019 => "'", 0x201C => '``', 0x201D => "''"
    }.freeze

    # Fused words that tokenize splits in two ("cannot" -> "can not").
    # Every pattern has exactly two capture groups.
    CONTRACTIONS = [
      / (can)(not) /i,
      / (d')(ye) /i,
      / (gim)(me) /i,
      / (gon)(na) /i,
      / (got)(ta) /i,
      / (lem)(me) /i,
      / (more)('n) /i,
      / ('t)(is|was) /i,
      / (wan)(na) /i
    ].freeze

    # lexicon, lexical_rules and contextual_rules are paths to the three
    # data files; each is read completely and loaded into the native tagger.
    def initialize(lexicon, lexical_rules, contextual_rules)
      @tagger = ::Tagger::BrillTagger.new
      Brill::Tagger.load_lexicon(@tagger, lexicon)
      Brill::Tagger.load_lexical_rules(@tagger, lexical_rules)
      Brill::Tagger.load_contextual_rules(@tagger, contextual_rules)
    end

    # Tag a body of text
    # returns an array like [[token,tag],[token,tag]...[token,tag]]
    #
    def tag(text)
      tokens = Brill::Tagger.tokenize(text)
      tags = Brill::Tagger.tag_start(tokens)

      @tagger.apply_lexical_rules(tokens, tags, [], 0)
      @tagger.default_tag_finish(tokens, tags)

      # Brill uses these fake "STAART" tags to delimit the start & end of
      # sentence. Two are added on each side of both arrays; fresh String
      # objects are used for every slot, as the original code did.
      [tokens, tags].each do |list|
        2.times do
          list.unshift(BOUNDARY.dup)
          list.push(BOUNDARY.dup)
        end
      end

      @tagger.apply_contextual_rules(tokens, tags, 1)

      # Strip the padding again before handing the result back.
      [tokens, tags].each do |list|
        2.times do
          list.shift
          list.pop
        end
      end

      tokens.zip(tags)
    end

    # Returns every line of +file+ (line terminators included) as an Array.
    def self.lines(file)
      File.readlines(file)
    end

    # load LEXICON
    #
    # Each line is "word tag [tag ...]". The first tag is registered through
    # add_to_lexicon and every "word tag" pair through add_to_lexicon_tags.
    # Lines without both a word and at least one tag (for example blank
    # lines) are skipped; they used to raise NoMethodError.
    def self.load_lexicon(tagger, lexicon)
      lines(lexicon).each do |line|
        word, *tags = line.split
        next if word.nil? || tags.empty?

        tagger.add_to_lexicon(word, tags.first)
        tags.each { |tag| tagger.add_to_lexicon_tags("#{word} #{tag}") }
      end
      nil
    end

    # load LEXICALRULEFILE
    #
    # Port of this Perl loop:
    #
    #   chomp;
    #   my @line = split or next;
    #   $self->_add_lexical_rule($_);
    #
    #   if ($line[1] eq 'goodright') {
    #     $self->_add_goodright($line[0]);
    #   } elsif ($line[2] eq 'fgoodright') {
    #     $self->_add_goodright($line[1]);
    #   } elsif ($line[1] eq 'goodleft') {
    #     $self->_add_goodleft($line[0]);
    #   } elsif ($line[2] eq 'fgoodleft') {
    #     $self->_add_goodleft($line[1]);
    #   }
    #
    # As in the Perl ("split or next"), lines with no fields are skipped.
    # The earlier while-loop used `next` before incrementing its index, so a
    # single blank line made it spin forever.
    def self.load_lexical_rules(tagger, rules)
      lines(rules).each do |raw|
        line = raw.chomp
        cols = line.split
        next if cols.empty?

        tagger.add_lexical_rule(line)
        if cols[1] == 'goodright'
          tagger.add_goodright(cols[0])
        elsif cols[2] == 'fgoodright'
          tagger.add_goodright(cols[1])
        elsif cols[1] == 'goodleft'
          tagger.add_goodleft(cols[0])
        elsif cols[2] == 'fgoodleft'
          tagger.add_goodleft(cols[1])
        end
      end
      nil
    end

    # load CONTEXTUALRULEFILE
    #
    # Every non-empty line (after chomp) is passed to add_contextual_rule.
    # Empty lines are skipped; previously they caused an endless loop because
    # `next` bypassed the index increment.
    def self.load_contextual_rules(tagger, rules)
      lines(rules).each do |raw|
        line = raw.chomp
        next if line.empty?

        tagger.add_contextual_rule(line)
      end
      nil
    end

    # Initial guess for every token: 'NNP' when it starts with an ASCII
    # capital letter, otherwise 'NN'.
    def self.tag_start(tokens)
      tokens.map { |token| token.match(/^[A-Z]/) ? 'NNP' : 'NN' }
    end

    # this tokenize code is a port from perl
    #
    # Splits +text+ into an Array of token Strings. The caller's string is
    # never modified. Empty tokens are no longer returned: the result is
    # split on runs of whitespace instead of on every single whitespace
    # character.
    def self.tokenize(text)
      # Normalize all whitespace (this also yields a private copy that the
      # gsub! calls below are free to mutate).
      text = normalize_quotes(text.gsub(/\s+/, ' '))

      # Attempt to get correct directional quotes
      # s{\"\b} { `` }g;
      text.gsub!(/\"\b/, ' `` ')
      # s{\b\"} { '' }g;
      text.gsub!(/\b\"/, " '' ")
      # s{\"(?=\s)} { '' }g;
      text.gsub!(/\"(?=\s)/, " '' ")
      # s{\"} { `` }g;
      # (This step used to repeat the previous pattern and so never matched.)
      text.gsub!(/\"/, ' `` ')

      # Isolate ellipses
      # s{\.\.\.} { ... }g;
      text.gsub!(/\.\.\./, ' ... ')

      # Isolate any embedded punctuation chars
      # s{([,;:\@\#\$\%&])} { $1 }g;
      text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')

      # Assume sentence tokenization has been done first, so split FINAL
      # periods only.
      # s/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /$1 .$2 /gx;
      text.gsub!(/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /x, '\1 .\2 ')

      # however, we may as well split ALL question marks and exclamation
      # points, since they shouldn't have the abbrev.-marker ambiguity problem
      # s{([?!])} { $1 }g;
      text.gsub!(/([?!])/, ' \1 ')

      # parentheses, brackets, etc.
      # s{([\]\[\(\)\{\}\<\>])} { $1 }g;
      text.gsub!(/([\]\[\(\)\{\}\<\>])/, ' \1 ')

      # s/(-{2,})/ $1 /g;
      text.gsub!(/(-{2,})/, ' \1 ')

      # Add a space to the beginning and end of each line, to reduce
      # necessary number of regexps below.
      text.gsub!(/$/, ' ')
      text.gsub!(/^/, ' ')

      # possessive or close-single-quote
      # The reference pattern is sed-style, where \( \) delimit a group; as a
      # Ruby regexp those escapes matched literal parentheses, so the rule
      # never fired. A real capture group restores the intended split.
      text.gsub!(/([^'])' /, %q(\1 ' ))

      # as in it's, I'm, we'd
      # s/\'([smd]) / \'$1 /ig;
      text.gsub!(/'([smd]) /i, %q( '\1 ))

      # s/\'(ll|re|ve) / \'$1 /ig;
      text.gsub!(/'(ll|re|ve) /i, %q( '\1 ))
      # s/n\'t / n\'t /ig;
      text.gsub!(/n't /i, " n't ")

      # s/ (can)(not) / $1 $2 /ig; and friends
      CONTRACTIONS.each { |pattern| text.gsub!(pattern, ' \1 \2 ') }

      text.split
    end

    # translate some common extended ascii characters to quotes
    #
    # Returns a new String. The original code interpolated 145.chr .. 148.chr
    # into regexps; on Ruby 1.9+ that produces binary-encoded patterns, which
    # cannot be matched against UTF-8 text containing non-ASCII characters
    # (Encoding::CompatibilityError). The translation is therefore chosen by
    # the string's encoding: Unicode curly quotes for UTF-8, the raw bytes
    # for binary / Windows-1252 strings, and nothing for other encodings
    # (where those byte values may be part of multi-byte characters).
    def self.normalize_quotes(text)
      case text.encoding
      when Encoding::UTF_8
        text.gsub(/[\u2018\u2019\u201C\u201D]/) { |quote| UNICODE_QUOTES[quote.ord] }
      when Encoding::BINARY, Encoding::Windows_1252
        bytes = text.b
        bytes.gsub!(/[\x91-\x94]/n) { |quote| CP1252_QUOTES[quote.getbyte(0)] }
        bytes.force_encoding(text.encoding)
      else
        text
      end
    end
    private_class_method :normalize_quotes
  end
end
|
data/lib/rbtagger.rb
ADDED
data/script/console
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
#!/usr/bin/env ruby
# File: script/console
#
# Replaces this process with an irb session that has tab completion and
# lib/rbtagger.rb preloaded.

# Windows builds of Ruby launch irb through a batch file.
# (?:...) is a non-capturing group; the previous "(:?" was a typo that
# happened to match the same platform names.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

libs = " -r irb/completion"
# Perhaps use a console_lib to store any extra methods I may want available in the console
# libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
libs << " -r #{File.dirname(__FILE__) + '/../lib/rbtagger.rb'}"
puts "Loading rb-brill-tagger gem"
exec "#{irb} #{libs} --simple-prompt"
|
data/script/destroy
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Runs the RubiGen "destroy" script against this project.

# Absolute path of the project root (the parent of this script's directory).
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Try a plain require first; fall back to loading RubyGems when rubigen is
# not already on the load path.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/destroy'

# A leading help flag is dropped before the arguments reach RubiGen.
ARGV.shift if %w[--help -h].include?(ARGV.first)

RubiGen::Base.use_component_sources!([:rubygems, :newgem, :newgem_theme, :test_unit])
RubiGen::Scripts::Destroy.new.run(ARGV)
|
data/script/generate
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Runs the RubiGen "generate" script against this project.

# Absolute path of the project root (the parent of this script's directory).
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Try a plain require first; fall back to loading RubyGems when rubigen is
# not already on the load path.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/generate'

# A leading help flag is dropped before the arguments reach RubiGen.
ARGV.shift if %w[--help -h].include?(ARGV.first)

RubiGen::Base.use_component_sources!([:rubygems, :newgem, :newgem_theme, :test_unit])
RubiGen::Scripts::Generate.new.run(ARGV)
|
data/script/txt2html
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Converts a Textile source file into an HTML page using an ERB template.

GEM_NAME = 'rbtagger' # what ppl will type to install your gem
RUBYFORGE_PROJECT = 'ruletagger'

require 'rubygems'

# newgem and rubyforge are only needed for website generation, so a missing
# gem produces an install hint and a non-zero exit instead of a backtrace.
begin
  require 'newgem'
  require 'rubyforge'
rescue LoadError
  puts "\n\nGenerating the website requires the newgem RubyGem",
       "Install: gem install newgem\n\n"
  exit 1
end

require 'redcloth'
require 'syntax/convertors/html'
require 'erb'
require File.join(File.dirname(__FILE__), "/../lib/#{GEM_NAME}/version.rb")

# These locals stay in scope for the `binding` handed to the ERB templates
# further down, so their names must not change.
version = RbTagger::VERSION::STRING
download = "http://rubyforge.org/projects/#{RUBYFORGE_PROJECT}"
|
22
|
+
|
23
|
+
# Returns the entry for RUBYFORGE_PROJECT from the "group_ids" section of
# the RubyForge autoconfig.
def rubyforge_project_id
  group_ids = RubyForge.new.autoconfig["group_ids"]
  group_ids[RUBYFORGE_PROJECT]
end
|
26
|
+
|
27
|
+
# English ordinal suffixes for whole numbers (used by Time#pretty).
#
# The method lives on Integer rather than Fixnum: Fixnum became a deprecated
# alias of Integer in Ruby 2.4 and was removed in Ruby 3.2, while on older
# Rubies Integer is Fixnum's superclass, so existing callers keep working.
class Integer
  # Returns 'st', 'nd', 'rd' or 'th' for this number, e.g. 1 -> 'st',
  # 22 -> 'nd', 13 -> 'th'.
  def ordinal
    # teens (11th, 12th, 13th, ... also 111th, 212th)
    return 'th' if (10..19).include?(self % 100)

    # others
    # ("when 1 then" replaces the Ruby 1.8-only "when 1:" form, which is a
    # syntax error on Ruby 1.9 and later.)
    case self % 10
    when 1 then 'st'
    when 2 then 'nd'
    when 3 then 'rd'
    else 'th'
    end
  end
end
|
40
|
+
|
41
|
+
class Time
  # Human-friendly date: day of month with its ordinal suffix, the full
  # month name from strftime('%B'), then the year. Relies on an #ordinal
  # method being available on the value returned by #mday.
  def pretty
    day = mday
    month_name = strftime('%B')
    "#{day}#{day.ordinal} #{month_name} #{year}"
  end
end
|
46
|
+
|
47
|
+
# Runs +source+ through the Syntax HTML convertor for the given +syntax+ and
# removes a <pre> at the start of a line and a </pre> at the end of a line
# from the result.
def convert_syntax(syntax, source)
  convertor = Syntax::Convertors::HTML.for_syntax(syntax)
  highlighted = convertor.convert(source)
  highlighted.gsub(%r!^<pre>|</pre>$!, '')
end
|
50
|
+
|
51
|
+
# Command line: source.txt [template.html.erb]; the rendered page is written
# to standard output.
#
# NOTE: the local variable names below (src, title, body, created, modified,
# ...) are visible to the ERB templates through `binding`, so they are left
# exactly as they were.
if ARGV.length >= 1
  src, template = ARGV
  template ||= File.join(File.dirname(__FILE__), '/../website/template.html.erb')
else
  puts("Usage: #{File.split($0).last} source.txt [template.html.erb] > output.html")
  exit!
end

# File.read closes the file once it has been read; the earlier
# File.open(template).read left the handle open.
template = ERB.new(File.read(template))

title = nil
body = nil
File.open(src) do |fsrc|
  # The first line is the page title; the rest is an ERB template whose
  # output is Textile markup.
  title_text = fsrc.readline
  body_text_template = fsrc.read
  body_text = ERB.new(body_text_template).result(binding)

  # Pull <pre>/<code> elements carrying a syntax="..." attribute out of the
  # text, highlight them, and leave a placeholder so RedCloth does not touch
  # the generated markup.
  syntax_items = []
  body_text.gsub!(%r!<(pre|code)[^>]*?syntax=['"]([^'"]+)[^>]*>(.*?)</\1>!m){
    ident = syntax_items.length
    element, syntax, source = $1, $2, $3
    syntax_items << "<#{element} class='syntax'>#{convert_syntax(syntax, source)}</#{element}>"
    "syntax-temp-#{ident}"
  }

  # The title is rendered through RedCloth and then stripped of all tags.
  title = RedCloth.new(title_text).to_html.gsub(%r!<.*?>!,'').strip
  body = RedCloth.new(body_text).to_html

  # Swap the placeholders (and any <pre><code> wrapper around them) for the
  # highlighted snippets.
  body.gsub!(%r!(?:<pre><code>)?syntax-temp-(\d+)(?:</code></pre>)?!){ syntax_items[$1.to_i] }
end

# NOTE(review): File::Stat#ctime is the inode change time on Unix-like
# systems, which is not necessarily when the file was created.
stat = File.stat(src)
created = stat.ctime
modified = stat.mtime

$stdout << template.result(binding)
|