rbtagger 0.4.6 → 0.4.7
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +3 -0
- data/lib/brill/tagger.rb +14 -12
- data/lib/rbtagger.rb +1 -0
- data/lib/rbtagger/version.rb +2 -1
- data/lib/word/tagger.rb +1 -0
- metadata +17 -5
data/Rakefile
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
require 'rake/clean'
|
2
2
|
require 'rake/testtask'
|
3
3
|
require 'rake/rdoctask'
|
4
|
+
$:.unshift File.expand_path(File.dirname(__FILE__))
|
5
|
+
$:.unshift File.expand_path(File.dirname(__FILE__), 'lib')
|
6
|
+
$:.unshift File.expand_path(File.dirname(__FILE__), 'ext')
|
4
7
|
|
5
8
|
CLEAN.include '**/*.o'
|
6
9
|
CLEAN.include "**/*.#{Config::MAKEFILE_CONFIG['DLEXT']}"
|
data/lib/brill/tagger.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
1
3
|
require 'rule_tagger/rule_tagger'
|
2
4
|
|
3
5
|
module Brill
|
@@ -29,7 +31,7 @@ module Brill
|
|
29
31
|
# see: http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/Tagger.pm
|
30
32
|
def noun_phrases(text)
|
31
33
|
# ?:$PREP|$DET|$NUM)
|
32
|
-
#
|
34
|
+
#
|
33
35
|
tags = tag(text.gsub(/[^\w]/,' '))
|
34
36
|
phrases = []
|
35
37
|
phrase = []
|
@@ -62,7 +64,7 @@ module Brill
|
|
62
64
|
# join NNP's together for names
|
63
65
|
reduced_tags = []
|
64
66
|
mappings = {} # keep a mapping of the joined words to expand
|
65
|
-
tags.each{|tag|
|
67
|
+
tags.each{|tag|
|
66
68
|
if ptag.last == 'NNP' and tag.last == 'NNP' and !ptag.first.match(/\.$/)
|
67
69
|
ptag[0] += " " + tag.first
|
68
70
|
# before combining these two create a mapping for each word to each word
|
@@ -119,7 +121,7 @@ module Brill
|
|
119
121
|
end
|
120
122
|
|
121
123
|
# Tag a body of text
|
122
|
-
# returns an array like [[token,tag],[token,tag]...[token,tag]]
|
124
|
+
# returns an array like [[token,tag],[token,tag]...[token,tag]]
|
123
125
|
#
|
124
126
|
def tag( text )
|
125
127
|
# XXX: the list of contractions is much larger than this... find 'em
|
@@ -130,14 +132,14 @@ module Brill
|
|
130
132
|
|
131
133
|
@tagger.apply_lexical_rules( tokens, tags, [], 0 )
|
132
134
|
@tagger.default_tag_finish( tokens, tags )
|
133
|
-
|
135
|
+
|
134
136
|
# Brill uses these fake "STAART" tags to delimit the start & end of sentence.
|
135
|
-
tokens << "STAART"
|
136
|
-
tokens << "STAART"
|
137
|
+
tokens << "STAART"
|
138
|
+
tokens << "STAART"
|
137
139
|
tokens.unshift "STAART"
|
138
140
|
tokens.unshift "STAART"
|
139
|
-
tags << "STAART"
|
140
|
-
tags << "STAART"
|
141
|
+
tags << "STAART"
|
142
|
+
tags << "STAART"
|
141
143
|
tags.unshift "STAART"
|
142
144
|
tags.unshift "STAART"
|
143
145
|
|
@@ -166,7 +168,7 @@ module Brill
|
|
166
168
|
end
|
167
169
|
lines
|
168
170
|
end
|
169
|
-
# load LEXICON
|
171
|
+
# load LEXICON
|
170
172
|
def self.load_lexicon(tagger,lexicon)
|
171
173
|
lines = Brill::Tagger.lines(lexicon)
|
172
174
|
i = 0
|
@@ -187,7 +189,7 @@ module Brill
|
|
187
189
|
end
|
188
190
|
end
|
189
191
|
|
190
|
-
# load LEXICALRULEFILE
|
192
|
+
# load LEXICALRULEFILE
|
191
193
|
def self.load_lexical_rules(tagger,rules)
|
192
194
|
lines = self.lines(rules)
|
193
195
|
i = 0
|
@@ -273,7 +275,7 @@ module Brill
|
|
273
275
|
# Isolate any embedded punctuation chars
|
274
276
|
# s{([,;:\@\#\$\%&])} { $1 }g;
|
275
277
|
text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')
|
276
|
-
|
278
|
+
|
277
279
|
# Assume sentence tokenization has been done first, so split FINAL
|
278
280
|
# periods only.
|
279
281
|
# s/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /$1 .$2 /gx;
|
@@ -329,7 +331,7 @@ module Brill
|
|
329
331
|
text.gsub!(/ (\'t)(is|was) /i,' \1 \2 ')
|
330
332
|
#s/ (wan)(na) / $1 $2 /ig;
|
331
333
|
text.gsub!(/ (wan)(na) /i,' \1 \2 ')
|
332
|
-
|
334
|
+
|
333
335
|
text.split(/\s/)
|
334
336
|
end
|
335
337
|
|
data/lib/rbtagger.rb
CHANGED
data/lib/rbtagger/version.rb
CHANGED
data/lib/word/tagger.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbtagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 1
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 4
|
9
|
+
- 7
|
10
|
+
version: 0.4.7
|
5
11
|
platform: ruby
|
6
12
|
authors:
|
7
13
|
- Todd A. Fisher
|
@@ -9,7 +15,7 @@ autorequire:
|
|
9
15
|
bindir: bin
|
10
16
|
cert_chain: []
|
11
17
|
|
12
|
-
date:
|
18
|
+
date: 2011-03-30 00:00:00 -04:00
|
13
19
|
default_executable:
|
14
20
|
dependencies: []
|
15
21
|
|
@@ -76,21 +82,27 @@ require_paths:
|
|
76
82
|
- lib
|
77
83
|
- ext
|
78
84
|
required_ruby_version: !ruby/object:Gem::Requirement
|
85
|
+
none: false
|
79
86
|
requirements:
|
80
87
|
- - ">="
|
81
88
|
- !ruby/object:Gem::Version
|
89
|
+
hash: 3
|
90
|
+
segments:
|
91
|
+
- 0
|
82
92
|
version: "0"
|
83
|
-
version:
|
84
93
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
94
|
+
none: false
|
85
95
|
requirements:
|
86
96
|
- - ">="
|
87
97
|
- !ruby/object:Gem::Version
|
98
|
+
hash: 3
|
99
|
+
segments:
|
100
|
+
- 0
|
88
101
|
version: "0"
|
89
|
-
version:
|
90
102
|
requirements: []
|
91
103
|
|
92
104
|
rubyforge_project: curb
|
93
|
-
rubygems_version: 1.3.
|
105
|
+
rubygems_version: 1.3.7
|
94
106
|
signing_key:
|
95
107
|
specification_version: 3
|
96
108
|
summary: Ruby libcurl bindings
|