engtagger 0.1.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -21,7 +21,7 @@ EOD
21
21
  @tagger = EngTagger.new
22
22
  tagpath = File.join($ENGTAGGER_LIB, @tagger.conf[:tag_path])
23
23
  wordpath = File.join($ENGTAGGER_LIB, @tagger.conf[:word_path])
24
- if !File.exists?(tagpath) or !File.exists?(wordpath)
24
+ if !File.exist?(tagpath) or !File.exists?(wordpath)
25
25
  @tagger.install
26
26
  end
27
27
  end
@@ -69,7 +69,7 @@ EOD
69
69
  def test_clean_text
70
70
  test = "<html><body>I am <b>100% sure</b> that Dr. Watson is too naive. I'm sorry.</body></html>"
71
71
  model = ["I","am","100","%","sure","that","Dr.","Watson","is","too","naive",".","I","'m","sorry","."]
72
- assert_equal(model, @tagger.clean_text(test))
72
+ assert_equal(model, @tagger.clean_text(test)) unless $no_hpricot
73
73
  end
74
74
 
75
75
  def test_clean_word
@@ -102,6 +102,38 @@ EOD
102
102
  assert_instance_of(Hash, result)
103
103
  end
104
104
 
105
+ def test_get_verbs
106
+ expected_result = { "have" => 1, "ruled" => 1, "contends" => 1 }
107
+ result = @tagger.get_verbs(@@tagged)
108
+ assert_equal(expected_result, result)
109
+ end
110
+
111
+ def test_get_adverbs
112
+ expected_result = { "otherwise" => 1 }
113
+ result = @tagger.get_adverbs(@@tagged)
114
+ assert_equal(expected_result, result)
115
+ end
116
+
117
+ def test_get_interrogatives
118
+ tagged = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
119
+ expected_result = {"when"=>1, "how"=>1, "Which"=>1, "whatever"=>1, "who"=>1, "whoever"=>1}
120
+ result = @tagger.get_interrogatives(tagged)
121
+ assert_equal(expected_result, result)
122
+ end
123
+
124
+ def test_get_question_parts
125
+ tagged = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
126
+ expected_result = {"when"=>1, "how"=>1, "Which"=>1, "whatever"=>1, "who"=>1, "whoever"=>1}
127
+ result = @tagger.get_question_parts(tagged)
128
+ assert_equal(expected_result, result)
129
+ end
130
+
131
+ def test_get_conjunctions
132
+ expected_result = { "and" => 2, "of" => 2, "for" => 1, "that" => 1, "in" => 1 }
133
+ result = @tagger.get_conjunctions(@@tagged)
134
+ assert_equal(expected_result, result)
135
+ end
136
+
105
137
  def test_get_proper_nouns
106
138
  test = "<nnp>BBC</nnp> <vbz>means</vbz> <nnp>British Broadcasting Corporation</nnp> <pp>.</pp>"
107
139
  result = @tagger.get_proper_nouns(test)
@@ -191,6 +223,11 @@ EOD
191
223
  text = ""
192
224
  assert(!@tagger.valid_text(text))
193
225
  end
226
+
227
+ def test_override_default_params
228
+ @tagger = EngTagger.new(:longest_noun_phrase => 3)
229
+ assert_equal 3, @tagger.conf[:longest_noun_phrase]
230
+ end
194
231
  end
195
232
 
196
233
  # Number of errors detected: 24
metadata CHANGED
@@ -1,86 +1,60 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: engtagger
3
- version: !ruby/object:Gem::Version
4
- version: 0.1.1
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.2
5
5
  platform: ruby
6
- authors:
6
+ authors:
7
7
  - Yoichiro Hasebe
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
-
12
- date: 2008-05-15 00:00:00 +09:00
13
- default_executable:
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
16
- name: hpricot
17
- version_requirement:
18
- version_requirements: !ruby/object:Gem::Requirement
19
- requirements:
20
- - - ">="
21
- - !ruby/object:Gem::Version
22
- version: "0"
23
- version:
24
- - !ruby/object:Gem::Dependency
25
- name: hoe
26
- version_requirement:
27
- version_requirements: !ruby/object:Gem::Requirement
28
- requirements:
29
- - - ">="
30
- - !ruby/object:Gem::Version
31
- version: 1.5.1
32
- version:
33
- description: A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained tagger that assigns POS tags to English text based on a lookup dictionary and a set of probability values. The tagger assigns appropriate tags based on conditional probabilities--it examines the preceding tag to determine the appropriate tag for the current word. Unknown words are classified according to word morphology or can be set to be treated as nouns or other parts of speech. The tagger also extracts as many nouns and noun phrases as it can, using a set of regular expressions.
34
- email: yohasebe@gmail.com
11
+ date: 2022-02-03 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained
14
+ tagger that assigns POS tags to English text based on a lookup dictionary and a
15
+ set of probability values.
16
+ email:
17
+ - yohasebe@gmail.com
35
18
  executables: []
36
-
37
19
  extensions: []
38
-
39
- extra_rdoc_files:
40
- - History.txt
41
- - LICENSE.txt
42
- - Manifest.txt
43
- - README.txt
44
- files:
45
- - History.txt
46
- - LICENSE.txt
47
- - Manifest.txt
48
- - README.txt
20
+ extra_rdoc_files: []
21
+ files:
22
+ - ".gitignore"
23
+ - Gemfile
24
+ - LICENSE
25
+ - README.md
49
26
  - Rakefile
27
+ - engtagger.gemspec
50
28
  - lib/engtagger.rb
51
29
  - lib/engtagger/porter.rb
52
30
  - lib/engtagger/pos_tags.hash
53
31
  - lib/engtagger/pos_words.hash
54
32
  - lib/engtagger/tags.yml
55
33
  - lib/engtagger/unknown.yml
34
+ - lib/engtagger/version.rb
56
35
  - lib/engtagger/words.yml
57
36
  - test/test_engtagger.rb
58
- has_rdoc: true
59
- homepage: http://engtagger.rubyforge.org
60
- post_install_message:
61
- rdoc_options:
62
- - --main
63
- - README.txt
64
- require_paths:
37
+ homepage: http://github.com/yohasebe/engtagger
38
+ licenses: []
39
+ metadata: {}
40
+ post_install_message:
41
+ rdoc_options: []
42
+ require_paths:
65
43
  - lib
66
- required_ruby_version: !ruby/object:Gem::Requirement
67
- requirements:
44
+ required_ruby_version: !ruby/object:Gem::Requirement
45
+ requirements:
68
46
  - - ">="
69
- - !ruby/object:Gem::Version
70
- version: "0"
71
- version:
72
- required_rubygems_version: !ruby/object:Gem::Requirement
73
- requirements:
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ required_rubygems_version: !ruby/object:Gem::Requirement
50
+ requirements:
74
51
  - - ">="
75
- - !ruby/object:Gem::Version
76
- version: "0"
77
- version:
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
78
54
  requirements: []
79
-
80
- rubyforge_project: engtagger
81
- rubygems_version: 1.1.1
82
- signing_key:
83
- specification_version: 2
84
- summary: English Part-of-Speech Tagger Library; a Ruby port of Lingua::EN::Tagger
85
- test_files:
55
+ rubygems_version: 3.3.3
56
+ signing_key:
57
+ specification_version: 4
58
+ summary: A probability based, corpus-trained English POS tagger
59
+ test_files:
86
60
  - test/test_engtagger.rb
data/History.txt DELETED
@@ -1,10 +0,0 @@
1
- === 0.1.0 / 2008-05-14
2
-
3
- * Modified Synopsis section of Readme.txt
4
- * Created a description of tag set in Readme.txt
5
- * Fixed a few minor bugs
6
-
7
- === 0.1.0 / 2008-05-06
8
-
9
- * Initial release
10
- * Functionalities are basically the same as those of Perl Lingua::EN::Tagger.
data/Manifest.txt DELETED
@@ -1,13 +0,0 @@
1
- History.txt
2
- LICENSE.txt
3
- Manifest.txt
4
- README.txt
5
- Rakefile
6
- lib/engtagger.rb
7
- lib/engtagger/porter.rb
8
- lib/engtagger/pos_tags.hash
9
- lib/engtagger/pos_words.hash
10
- lib/engtagger/tags.yml
11
- lib/engtagger/unknown.yml
12
- lib/engtagger/words.yml
13
- test/test_engtagger.rb
data/README.txt DELETED
@@ -1,140 +0,0 @@
1
- = EngTagger
2
-
3
- English Part-of-Speech Tagger Library; a Ruby port of Lingua::EN::Tagger
4
-
5
- === Description
6
-
7
- A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained
8
- tagger that assigns POS tags to English text based on a lookup dictionary and
9
- a set of probability values. The tagger assigns appropriate tags based on
10
- conditional probabilities--it examines the preceding tag to determine the
11
- appropriate tag for the current word. Unknown words are classified according to
12
- word morphology or can be set to be treated as nouns or other parts of speech.
13
- The tagger also extracts as many nouns and noun phrases as it can, using a set
14
- of regular expressions.
15
-
16
- === Features
17
-
18
- * Assigns POS tags to English text
19
- * Extract noun phrases from tagged text
20
- * etc.
21
-
22
- === Synopsis:
23
-
24
- require 'rubygems'
25
- require 'engtagger'
26
-
27
- # Create a parser object
28
- tgr = EngTagger.new
29
-
30
- # Sample text
31
- text = "Alice chased the big fat cat."
32
-
33
- # Add part-of-speech tags to text
34
- tagged = tgr.add_tags(text)
35
-
36
- #=> "<nnp>Alice</nnp> <vbd>chased</vbd> <det>the</det> <jj>big</jj> <jj>fat</jj> <nn>cat</nn> <pp>.</pp>"
37
-
38
- # Get a list of all nouns and noun phrases with occurrence counts
39
- word_list = tgr.get_words(text)
40
-
41
- #=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
42
-
43
- # Get a readable version of the tagged text
44
- readable = tgr.get_readable(text)
45
-
46
- #=> "Alice/NNP chased/VBD the/DET big/JJ fat/JJ cat/NN ./PP"
47
-
48
- # Get all nouns from a tagged output
49
- nouns = tgr.get_nouns(tagged)
50
-
51
- #=> {"cat"=>1, "Alice"=>1}
52
-
53
- # Get all proper nouns
54
- proper = tgr.get_proper_nouns(tagged)
55
-
56
- #=> {"Alice"=>1}
57
-
58
-
59
- # Get all noun phrases of any syntactic level
60
- # (same as word_list but take a tagged input)
61
- nps = tgr.get_noun_phrases(tagged)
62
-
63
- #=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
64
-
65
- === Tag Set
66
-
67
- The set of POS tags used here is a modified version of the Penn Treebank tagset. Tags with non-letter characters have been redefined to work better in our data structures. Also, the "Determiner" tag (DET) has been changed from 'DT', in order to avoid confusion with the HTML tag, <DT>.
68
-
69
- CC Conjunction, coordinating and, or
70
- CD Adjective, cardinal number 3, fifteen
71
- DET Determiner this, each, some
72
- EX Pronoun, existential there there
73
- FW Foreign words
74
- IN Preposition / Conjunction for, of, although, that
75
- JJ Adjective happy, bad
76
- JJR Adjective, comparative happier, worse
77
- JJS Adjective, superlative happiest, worst
78
- LS Symbol, list item A, A.
79
- MD Verb, modal can, could, 'll
80
- NN Noun aircraft, data
81
- NNP Noun, proper London, Michael
82
- NNPS Noun, proper, plural Australians, Methodists
83
- NNS Noun, plural women, books
84
- PDT Determiner, prequalifier quite, all, half
85
- POS Possessive 's, '
86
- PRP Determiner, possessive second mine, yours
87
- PRPS Determiner, possessive their, your
88
- RB Adverb often, not, very, here
89
- RBR Adverb, comparative faster
90
- RBS Adverb, superlative fastest
91
- RP Adverb, particle up, off, out
92
- SYM Symbol *
93
- TO Preposition to
94
- UH Interjection oh, yes, mmm
95
- VB Verb, infinitive take, live
96
- VBD Verb, past tense took, lived
97
- VBG Verb, gerund taking, living
98
- VBN Verb, past/passive participle taken, lived
99
- VBP Verb, base present form take, live
100
- VBZ Verb, present 3SG -s form takes, lives
101
- WDT Determiner, question which, whatever
102
- WP Pronoun, question who, whoever
103
- WPS Determiner, possessive & question whose
104
- WRB Adverb, question when, how, however
105
-
106
- PP Punctuation, sentence ender ., !, ?
107
- PPC Punctuation, comma ,
108
- PPD Punctuation, dollar sign $
109
- PPL Punctuation, quotation mark left ``
110
- PPR Punctuation, quotation mark right ''
111
- PPS Punctuation, colon, semicolon, ellipsis :, ..., -
112
- LRB Punctuation, left bracket (, {, [
113
- RRB Punctuation, right bracket ), }, ]
114
-
115
- === Requirements
116
-
117
- * Ruby 1.8.6
118
- * Hpricot[http://code.whytheluckystiff.net/hpricot/] (optional)
119
-
120
- === Install
121
-
122
- (sudo) gem install engtagger
123
-
124
- === Author
125
-
126
- of this Ruby library
127
- * Yoichiro Hasebe (yohasebe [at] gmail.com)
128
-
129
- of the original Perl module
130
- * Aaron Coburn (acoburn [at] middlebury.edu)
131
-
132
- === Acknowledgement
133
-
134
- This Ruby library is a direct port of Lingua::EN::Tagger available at CPAN.
135
- The credit for the crucial part of its algorithm/design therefore goes to
136
- Aaron Coburn, the author of the original Perl version.
137
-
138
- === License
139
-
140
- This library is distributed under the GPL. Please see the LICENSE file.