rbtagger 0.4.6 → 0.4.7

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -1,6 +1,9 @@
1
1
  require 'rake/clean'
2
2
  require 'rake/testtask'
3
3
  require 'rake/rdoctask'
4
+ $:.unshift File.expand_path(File.dirname(__FILE__))
5
+ $:.unshift File.expand_path(File.dirname(__FILE__), 'lib')
6
+ $:.unshift File.expand_path(File.dirname(__FILE__), 'ext')
4
7
 
5
8
  CLEAN.include '**/*.o'
6
9
  CLEAN.include "**/*.#{Config::MAKEFILE_CONFIG['DLEXT']}"
@@ -1,3 +1,5 @@
1
+ # encoding: utf-8
2
+
1
3
  require 'rule_tagger/rule_tagger'
2
4
 
3
5
  module Brill
@@ -29,7 +31,7 @@ module Brill
29
31
  # see: http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/Tagger.pm
30
32
  def noun_phrases(text)
31
33
  # ?:$PREP|$DET|$NUM)
32
- #
34
+ #
33
35
  tags = tag(text.gsub(/[^\w]/,' '))
34
36
  phrases = []
35
37
  phrase = []
@@ -62,7 +64,7 @@ module Brill
62
64
  # join NNP's together for names
63
65
  reduced_tags = []
64
66
  mappings = {} # keep a mapping of the joined words to expand
65
- tags.each{|tag|
67
+ tags.each{|tag|
66
68
  if ptag.last == 'NNP' and tag.last == 'NNP' and !ptag.first.match(/\.$/)
67
69
  ptag[0] += " " + tag.first
68
70
  # before combining these two create a mapping for each word to each word
@@ -119,7 +121,7 @@ module Brill
119
121
  end
120
122
 
121
123
  # Tag a body of text
122
- # returns an array like [[token,tag],[token,tag]...[token,tag]]
124
+ # returns an array like [[token,tag],[token,tag]...[token,tag]]
123
125
  #
124
126
  def tag( text )
125
127
  # XXX: the list of contractions is much larger than this... find'em
@@ -130,14 +132,14 @@ module Brill
130
132
 
131
133
  @tagger.apply_lexical_rules( tokens, tags, [], 0 )
132
134
  @tagger.default_tag_finish( tokens, tags )
133
-
135
+
134
136
  # Brill uses these fake "STAART" tags to delimit the start & end of sentence.
135
- tokens << "STAART"
136
- tokens << "STAART"
137
+ tokens << "STAART"
138
+ tokens << "STAART"
137
139
  tokens.unshift "STAART"
138
140
  tokens.unshift "STAART"
139
- tags << "STAART"
140
- tags << "STAART"
141
+ tags << "STAART"
142
+ tags << "STAART"
141
143
  tags.unshift "STAART"
142
144
  tags.unshift "STAART"
143
145
 
@@ -166,7 +168,7 @@ module Brill
166
168
  end
167
169
  lines
168
170
  end
169
- # load LEXICON
171
+ # load LEXICON
170
172
  def self.load_lexicon(tagger,lexicon)
171
173
  lines = Brill::Tagger.lines(lexicon)
172
174
  i = 0
@@ -187,7 +189,7 @@ module Brill
187
189
  end
188
190
  end
189
191
 
190
- # load LEXICALRULEFILE
192
+ # load LEXICALRULEFILE
191
193
  def self.load_lexical_rules(tagger,rules)
192
194
  lines = self.lines(rules)
193
195
  i = 0
@@ -273,7 +275,7 @@ module Brill
273
275
  # Isolate any embedded punctuation chars
274
276
  # s{([,;:\@\#\$\%&])} { $1 }g;
275
277
  text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')
276
-
278
+
277
279
  # Assume sentence tokenization has been done first, so split FINAL
278
280
  # periods only.
279
281
  # s/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /$1 .$2 /gx;
@@ -329,7 +331,7 @@ module Brill
329
331
  text.gsub!(/ (\'t)(is|was) /i,' \1 \2 ')
330
332
  #s/ (wan)(na) / $1 $2 /ig;
331
333
  text.gsub!(/ (wan)(na) /i,' \1 \2 ')
332
-
334
+
333
335
  text.split(/\s/)
334
336
  end
335
337
 
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
  module RbTagger
2
3
  class << self
3
4
  def tags_from_file( file )
@@ -1,8 +1,9 @@
1
+ # encoding: utf-8
1
2
  module RbTagger #:nodoc:
2
3
  module VERSION #:nodoc:
3
4
  MAJOR = 0
4
5
  MINOR = 4
5
- TINY = 6
6
+ TINY = 7
6
7
 
7
8
  STRING = [MAJOR, MINOR, TINY].join('.')
8
9
  end
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
  require 'word_tagger/word_tagger'
2
3
 
3
4
  module Word
metadata CHANGED
@@ -1,7 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbtagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.6
4
+ hash: 1
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 4
9
+ - 7
10
+ version: 0.4.7
5
11
  platform: ruby
6
12
  authors:
7
13
  - Todd A. Fisher
@@ -9,7 +15,7 @@ autorequire:
9
15
  bindir: bin
10
16
  cert_chain: []
11
17
 
12
- date: 2010-01-25 00:00:00 -05:00
18
+ date: 2011-03-30 00:00:00 -04:00
13
19
  default_executable:
14
20
  dependencies: []
15
21
 
@@ -76,21 +82,27 @@ require_paths:
76
82
  - lib
77
83
  - ext
78
84
  required_ruby_version: !ruby/object:Gem::Requirement
85
+ none: false
79
86
  requirements:
80
87
  - - ">="
81
88
  - !ruby/object:Gem::Version
89
+ hash: 3
90
+ segments:
91
+ - 0
82
92
  version: "0"
83
- version:
84
93
  required_rubygems_version: !ruby/object:Gem::Requirement
94
+ none: false
85
95
  requirements:
86
96
  - - ">="
87
97
  - !ruby/object:Gem::Version
98
+ hash: 3
99
+ segments:
100
+ - 0
88
101
  version: "0"
89
- version:
90
102
  requirements: []
91
103
 
92
104
  rubyforge_project: curb
93
- rubygems_version: 1.3.5
105
+ rubygems_version: 1.3.7
94
106
  signing_key:
95
107
  specification_version: 3
96
108
  summary: Ruby libcurl bindings