ruletagger 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +21 -0
- data/History.txt +4 -0
- data/LICENSE +21 -0
- data/License.txt +20 -0
- data/Manifest.txt +75 -0
- data/PostInstall.txt +7 -0
- data/README +7 -0
- data/README.txt +53 -0
- data/Rakefile +33 -0
- data/config/hoe.rb +75 -0
- data/config/requirements.rb +15 -0
- data/ext/rule_tagger/bool.h +38 -0
- data/ext/rule_tagger/darray.c +292 -0
- data/ext/rule_tagger/darray.h +125 -0
- data/ext/rule_tagger/darrayP.h +50 -0
- data/ext/rule_tagger/extconf.rb +14 -0
- data/ext/rule_tagger/lex.c +170 -0
- data/ext/rule_tagger/lex.h +49 -0
- data/ext/rule_tagger/memory.c +127 -0
- data/ext/rule_tagger/memory.h +20 -0
- data/ext/rule_tagger/rbtagger.c +252 -0
- data/ext/rule_tagger/registry.c +326 -0
- data/ext/rule_tagger/registry.h +129 -0
- data/ext/rule_tagger/registryP.h +46 -0
- data/ext/rule_tagger/ruby-compat.h +20 -0
- data/ext/rule_tagger/rules.c +525 -0
- data/ext/rule_tagger/rules.h +42 -0
- data/ext/rule_tagger/sysdep.h +20 -0
- data/ext/rule_tagger/tagger.c +110 -0
- data/ext/rule_tagger/tagger.h +46 -0
- data/ext/rule_tagger/useful.c +44 -0
- data/ext/rule_tagger/useful.h +51 -0
- data/ext/word_tagger/extconf.rb +7 -0
- data/ext/word_tagger/porter_stemmer.c +430 -0
- data/ext/word_tagger/porter_stemmer.h +19 -0
- data/ext/word_tagger/rtagger.cc +83 -0
- data/ext/word_tagger/tagger.cc +153 -0
- data/ext/word_tagger/tagger.h +27 -0
- data/ext/word_tagger/tagger.rb +8 -0
- data/ext/word_tagger/test/Makefile +22 -0
- data/ext/word_tagger/test/doc.txt +87 -0
- data/ext/word_tagger/test/test.cc +107 -0
- data/ext/word_tagger/test.rb +31 -0
- data/lib/brill/tagger.rb +225 -0
- data/lib/rbtagger/version.rb +9 -0
- data/lib/rbtagger.rb +6 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/script/txt2html +82 -0
- data/setup.rb +1585 -0
- data/tasks/deployment.rake +34 -0
- data/tasks/environment.rake +7 -0
- data/tasks/website.rake +17 -0
- data/test/CONTEXTUALRULEFILE +284 -0
- data/test/LEXICALRULEFILE +148 -0
- data/test/LEXICON +93696 -0
- data/test/docs/doc0.txt +20 -0
- data/test/docs/doc1.txt +11 -0
- data/test/docs/doc2.txt +52 -0
- data/test/docs/doc3.txt +128 -0
- data/test/docs/doc4.txt +337 -0
- data/test/docs/doc5.txt +497 -0
- data/test/docs/doc6.txt +116 -0
- data/test/docs/doc7.txt +101 -0
- data/test/docs/doc8.txt +25 -0
- data/test/docs/doc9.txt +84 -0
- data/test/tagger_test.rb +60 -0
- data/test/test_helper.rb +2 -0
- data/tools/rakehelp.rb +113 -0
- data/website/index.html +113 -0
- data/website/index.txt +53 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +138 -0
- data/website/template.html.erb +48 -0
- metadata +155 -0
data/lib/brill/tagger.rb
ADDED
@@ -0,0 +1,225 @@
|
|
1
|
+
require 'rule_tagger'
|
2
|
+
|
3
|
+
module Brill
  # Part-of-speech tagger that drives a ::Tagger::BrillTagger object (made
  # available by the 'rule_tagger' require at the top of this file).
  #
  # The constructor loads three data files into that object; #tag then
  # tokenizes a text, assigns initial tags and lets the loaded lexical and
  # contextual rules refine them.
  class Tagger
    # Typographic quotes and the ASCII quote tokens they are rewritten to,
    # keyed by Windows-1252 byte value.
    CP1252_QUOTES = { 145 => '`', 146 => "'", 147 => '``', 148 => "''" }

    # The same quote characters keyed by Unicode code point.
    UNICODE_QUOTES = { 0x2018 => '`', 0x2019 => "'", 0x201C => '``', 0x201D => "''" }

    # Fused word pairs that are split into two tokens (matched
    # case-insensitively, only when surrounded by single spaces).
    SPLIT_PAIRS = [
      / (can)(not) /i,
      / (d\')(ye) /i,
      / (gim)(me) /i,
      / (gon)(na) /i,
      / (got)(ta) /i,
      / (lem)(me) /i,
      / (more)(\'n) /i,
      / (\'t)(is|was) /i,
      / (wan)(na) /i
    ]

    # lexicon, lexical_rules and contextual_rules are paths of the files to
    # read (they are opened with File.readlines).
    def initialize( lexicon, lexical_rules, contextual_rules )
      @tagger = ::Tagger::BrillTagger.new
      Brill::Tagger.load_lexicon(@tagger,lexicon)
      Brill::Tagger.load_lexical_rules(@tagger,lexical_rules)
      Brill::Tagger.load_contextual_rules(@tagger,contextual_rules)
    end

    # Tag a body of text
    # returns an array like [[token,tag],[token,tag]...[token,tag]]
    #
    def tag( text )
      tokens = Brill::Tagger.tokenize( text )
      tags   = Brill::Tagger.tag_start( tokens )

      @tagger.apply_lexical_rules( tokens, tags, [], 0 )
      @tagger.default_tag_finish( tokens, tags )

      # Brill uses these fake "STAART" tags to delimit the start & end of sentence.
      # Two markers are added on each side of both arrays, in place, because
      # the same array objects are handed to the rule engine and read back.
      2.times do
        [tokens, tags].each do |list|
          list.unshift "STAART"
          list.push "STAART"
        end
      end

      @tagger.apply_contextual_rules( tokens, tags, 1 )

      # Drop the boundary markers again before pairing tokens with tags.
      [tokens, tags].each do |list|
        list.shift(2)
        list.pop(2)
      end

      tokens.zip(tags)
    end

    # NOTE: the helpers below are class methods that #initialize and #tag call
    # with an explicit receiver, so they are (and always were) public; a bare
    # `private` does not apply to `def self.` methods.

    # Returns every line of +file+ (line terminators included) as an array.
    def self.lines( file )
      File.readlines(file)
    end

    # load LEXICON
    #
    # Each non-blank line is "word tag [tag ...]". The first tag is registered
    # with add_to_lexicon and every "word tag" pair with add_to_lexicon_tags.
    # Blank lines are skipped (they used to raise NoMethodError on nil).
    def self.load_lexicon(tagger,lexicon)
      lines(lexicon).each do |line|
        word, *tags = line.split
        next if word.nil?

        tagger.add_to_lexicon(word,tags.first)
        tags.each do |tag|
          tagger.add_to_lexicon_tags("#{word} #{tag}")
        end
      end
      nil
    end

    # load LEXICALRULEFILE
    #
    # Every rule line is passed (without its line terminator) to
    # add_lexical_rule. Rules whose 2nd column is goodright/goodleft register
    # their 1st column, and rules whose 3rd column is fgoodright/fgoodleft
    # register their 2nd column, via add_goodright / add_goodleft.
    # Lines without any fields are skipped, as in the Perl original
    # (`my @line = split or next;`); the previous `next` inside a manual
    # while loop never advanced the index and spun forever on a blank line.
    def self.load_lexical_rules(tagger,rules)
      lines(rules).each do |raw|
        line = raw.chomp
        cols = line.split
        next if cols.empty?

        tagger.add_lexical_rule(line)
        if cols[1] == 'goodright'
          tagger.add_goodright(cols[0])
        elsif cols[2] == 'fgoodright'
          tagger.add_goodright(cols[1])
        elsif cols[1] == 'goodleft'
          tagger.add_goodleft(cols[0])
        elsif cols[2] == 'fgoodleft'
          tagger.add_goodleft(cols[1])
        end
      end
      nil
    end

    # load CONTEXTUALRULEFILE
    #
    # Passes every non-blank line (without its line terminator) to
    # add_contextual_rule. Blank lines are skipped; previously they caused an
    # infinite loop for the same reason as in load_lexical_rules.
    def self.load_contextual_rules(tagger,rules)
      lines(rules).each do |raw|
        line = raw.chomp
        next if line.strip.empty?

        tagger.add_contextual_rule(line)
      end
      nil
    end

    # Initial tag guess: 'NNP' for tokens starting with an ASCII capital
    # letter, 'NN' for everything else.
    def self.tag_start(tokens)
      tokens.map { |token| (token =~ /\A[A-Z]/) ? 'NNP' : 'NN' }
    end

    # Rewrites typographic quotes to their ASCII stand-ins and returns the
    # resulting string.
    #
    # The former /#{145.chr}/ patterns are binary regexps on Ruby >= 1.9 and
    # raise Encoding::CompatibilityError as soon as the text holds any
    # non-ASCII UTF-8 character, so the replacement is chosen by encoding:
    # code points for UTF-8, raw byte values for binary / Windows-1252 strings.
    # Text in any other encoding is returned unchanged.
    def self.normalize_quotes(text)
      unless text.respond_to?(:encoding)
        # Ruby 1.8: strings are plain byte sequences.
        return CP1252_QUOTES.inject(text) { |out, (byte, ascii)| out.gsub(byte.chr, ascii) }
      end
      return text if text.ascii_only?

      case text.encoding.name
      when 'UTF-8'
        UNICODE_QUOTES.inject(text) { |out, (codepoint, ascii)| out.gsub([codepoint].pack('U'), ascii) }
      when 'ASCII-8BIT', 'BINARY', 'Windows-1252'
        CP1252_QUOTES.inject(text) { |out, (byte, ascii)| out.gsub(byte.chr(text.encoding), ascii) }
      else
        text
      end
    end

    # this tokenize code is a port from perl
    #
    # Splits +text+ into an array of token strings. The argument itself is
    # not modified. Empty tokens are never returned.
    def self.tokenize(text)
      # Normalize all whitespace
      text = text.gsub(/\s+/,' ')

      # translate some common extended ascii characters to quotes
      text = normalize_quotes(text)

      # Attempt to get correct directional quotes
      # s{\"\b} { `` }g;
      text.gsub!(/\"\b/,' `` ')
      # s{\b\"} { '' }g;
      text.gsub!(/\b\"/," '' ")
      #s{\"(?=\s)} { '' }g;
      text.gsub!(/\"(?=\s)/," '' ")
      #s{\"} { `` }g;
      # (was a copy of the previous pattern, so leftover quotes were never rewritten)
      text.gsub!(/\"/," `` ")

      # Isolate ellipses
      # s{\.\.\.} { ... }g;
      text.gsub!(/\.\.\./,' ... ')

      # Isolate any embedded punctuation chars
      # s{([,;:\@\#\$\%&])} { $1 }g;
      text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')

      # Assume sentence tokenization has been done first, so split FINAL
      # periods only.
      # s/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /$1 .$2 /gx;
      text.gsub!(/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /x, '\1 .\2 ')

      # however, we may as well split ALL question marks and exclamation points,
      # since they shouldn't have the abbrev.-marker ambiguity problem
      #s{([?!])} { $1 }g;
      text.gsub!(/([?!])/, ' \1 ')

      # parentheses, brackets, etc.
      #s{([\]\[\(\)\{\}\<\>])} { $1 }g;
      text.gsub!(/([\]\[\(\)\{\}\<\>])/,' \1 ')

      #s/(-{2,})/ $1 /g;
      text.gsub!(/(-{2,})/,' \1 ')

      # Add a space to the beginning and end of each line, to reduce
      # necessary number of regexps below. (Whitespace was normalized above,
      # so the text is a single line at this point.)
      text = " #{text} "

      # possessive or close-single-quote
      # The ported pattern kept sed-style \( \) escapes, which match literal
      # parentheses in Ruby; a real capture group is what the rule intends.
      text.gsub!(/([^\'])\' /,%q(\1 ' ))

      # as in it's, I'm, we'd
      #s/\'([smd]) / \'$1 /ig;
      text.gsub!(/\'([smd]) /i,%q( '\1 ))

      #s/\'(ll|re|ve) / \'$1 /ig;
      text.gsub!(/\'(ll|re|ve) /i,%q( '\1 ))
      #s/n\'t / n\'t /ig;
      text.gsub!(/n\'t /i," n't ")

      # s/ (can)(not) / $1 $2 /ig; and the other fused pairs
      SPLIT_PAIRS.each do |pair|
        text.gsub!(pair,' \1 \2 ')
      end

      # Split on runs of whitespace; split(/\s/) produced an empty token for
      # every doubled space introduced by the substitutions above.
      text.split
    end

  end
end
|
data/lib/rbtagger.rb
ADDED
data/script/console
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# File: script/console
|
3
|
+
# Start irb with tab completion and this gem's library preloaded.
# Windows builds of Ruby (mswin / mingw) ship irb as a batch file.
# The original pattern read (:?...), i.e. an optional colon inside a capture
# group, where the non-capturing group (?:...) was meant.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

libs =  " -r irb/completion"
# Perhaps use a console_lib to store any extra methods I may want available in the console
# libs <<  " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
libs <<  " -r #{File.dirname(__FILE__) + '/../lib/rbtagger.rb'}"
puts "Loading rb-brill-tagger gem"
# exec replaces this process with irb, so nothing after this line runs.
exec "#{irb} #{libs} --simple-prompt"
|
data/script/destroy
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# Absolute path of the directory one level above this script.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Try RubiGen straight from the load path first; if that fails, load RubyGems
# and try again.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/destroy'

# A leading --help / -h argument is dropped before ARGV is handed to the runner.
help_flags = ['--help', '-h']
ARGV.shift if help_flags.include?(ARGV.first)

component_sources = [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Base.use_component_sources!(component_sources)
RubiGen::Scripts::Destroy.new.run(ARGV)
|
data/script/generate
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# Absolute path of the directory one level above this script.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Try RubiGen straight from the load path first; if that fails, load RubyGems
# and try again.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/generate'

# A leading --help / -h argument is dropped before ARGV is handed to the runner.
help_flags = ['--help', '-h']
ARGV.shift if help_flags.include?(ARGV.first)

component_sources = [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Base.use_component_sources!(component_sources)
RubiGen::Scripts::Generate.new.run(ARGV)
|
data/script/txt2html
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
GEM_NAME = 'rbtagger' # what ppl will type to install your gem
RUBYFORGE_PROJECT = 'ruletagger'

require 'rubygems'
begin
  require 'newgem'
  require 'rubyforge'
rescue LoadError
  # This script writes the generated HTML to stdout, so the diagnostic goes to
  # stderr instead of ending up inside the generated page.
  warn "\n\nGenerating the website requires the newgem RubyGem"
  warn "Install: gem install newgem\n\n"
  exit(1)
end
require 'redcloth'
require 'syntax/convertors/html'
require 'erb'
require File.dirname(__FILE__) + "/../lib/#{GEM_NAME}/version.rb"

# Both locals are visible to the ERB templates through `binding`.
version  = RbTagger::VERSION::STRING
download = "http://rubyforge.org/projects/#{RUBYFORGE_PROJECT}"
|
22
|
+
|
23
|
+
# Looks up the entry for RUBYFORGE_PROJECT under the "group_ids" key of the
# RubyForge autoconfig.
def rubyforge_project_id
  group_ids = RubyForge.new.autoconfig["group_ids"]
  group_ids[RUBYFORGE_PROJECT]
end
|
26
|
+
|
27
|
+
# Adds an English ordinal suffix helper to integers.
#
# Defined on Integer rather than Fixnum: Fixnum is a subclass of Integer on
# old Rubies (so existing callers still find the method) and no longer exists
# on current ones. The `when 1: ...` colon form is a syntax error from Ruby 1.9
# on, so `then` is used instead.
class Integer
  # Returns 'st', 'nd', 'rd' or 'th' for this number (1 -> 'st', 11 -> 'th').
  def ordinal
    # teens
    return 'th' if (10..19).include?(self % 100)

    # others
    case self % 10
    when 1 then 'st'
    when 2 then 'nd'
    when 3 then 'rd'
    else 'th'
    end
  end
end
|
40
|
+
|
41
|
+
class Time
  # Formats the time as "<day of month><ordinal suffix> <full month name> <year>".
  # Relies on an #ordinal method being available on the day-of-month integer.
  def pretty
    day = mday
    month_name = strftime('%B')
    "#{day}#{day.ordinal} #{month_name} #{year}"
  end
end
|
46
|
+
|
47
|
+
# Runs +source+ through the Syntax HTML convertor selected by +syntax+ and
# removes a <pre> at the start of a line and a </pre> at the end of a line
# from the result.
def convert_syntax(syntax, source)
  convertor = Syntax::Convertors::HTML.for_syntax(syntax)
  highlighted = convertor.convert(source)
  highlighted.gsub(%r!^<pre>|</pre>$!,'')
end
|
50
|
+
|
51
|
+
# Usage: txt2html source.txt [template.html.erb] > output.html
if ARGV.length >= 1
  src, template = ARGV
  template ||= File.join(File.dirname(__FILE__), '/../website/template.html.erb')
else
  # stdout carries the generated HTML, so the usage text goes to stderr.
  # A plain exit(1) is used instead of exit!, which skips at_exit handlers.
  warn("Usage: #{File.split($0).last} source.txt [template.html.erb] > output.html")
  exit(1)
end

# File.read closes the file; File.open(template).read left the handle open.
template = ERB.new(File.read(template))

# title and body are filled in below and read by the template through `binding`.
title = nil
body = nil
File.open(src) do |fsrc|
  # The first line of the source is the title, the rest is an ERB body template.
  title_text = fsrc.readline
  body_text_template = fsrc.read
  body_text = ERB.new(body_text_template).result(binding)

  # Swap every <pre>/<code> element carrying a syntax attribute for a
  # "syntax-temp-N" placeholder and keep the highlighted markup aside.
  syntax_items = []
  body_text.gsub!(%r!<(pre|code)[^>]*?syntax=['"]([^'"]+)[^>]*>(.*?)</\1>!m){
    ident = syntax_items.length
    element, syntax, source = $1, $2, $3
    syntax_items << "<#{element} class='syntax'>#{convert_syntax(syntax, source)}</#{element}>"
    "syntax-temp-#{ident}"
  }
  title = RedCloth.new(title_text).to_html.gsub(%r!<.*?>!,'').strip
  body = RedCloth.new(body_text).to_html
  # Put the highlighted snippets back in place of their placeholders,
  # including any <pre><code> wrapper around a placeholder.
  body.gsub!(%r!(?:<pre><code>)?syntax-temp-(\d+)(?:</code></pre>)?!){ syntax_items[$1.to_i] }
end
stat = File.stat(src)
created = stat.ctime
modified = stat.mtime

$stdout << template.result(binding)
|