term-extract 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.1
1
+ 0.3.0
data/lib/term-extract.rb CHANGED
@@ -32,22 +32,13 @@ class TermExtract
32
32
 
33
33
  def extract(content)
34
34
 
35
- tagger = @@TAGGER.nil? ? Brill::Tagger.new : @@TAGGER
36
-
37
35
  # Tidy content punctuation
38
36
  # Add a space after periods
39
37
  content.gsub!(/([A-Za-z0-9])\./, '\1. ')
40
- # Add in full stops to tag list to allow multiterms to work
41
- tags = []
42
- tagger.tag(content).each do |tag|
43
- if tag[0] =~ /\.$/
44
- tag[0].chop!
45
- tags.push tag
46
- tags.push ['.', '.']
47
- else
48
- tags.push tag
49
- end
50
- end
38
+
39
+ # Assign POS tags and tidy tag stack
40
+ tagger = @@TAGGER.nil? ? Brill::Tagger.new : @@TAGGER
41
+ tags = preprocess_tags(tagger.tag(content))
51
42
 
52
43
  # Set pos tags that identify nouns
53
44
  pos = "^NN"
@@ -79,7 +70,6 @@ class TermExtract
79
70
  multiterm << [term,tag]
80
71
  elsif state == @@NOUN and last_tag =~ /^(NNP|NNPS)$/ and tag == 'IN' and term =~ /(of|for|on|of\sthe|\&|d\'|du|de)/i
81
72
  # Allow preposition : "Secretary of State"
82
- # Doesn't support "Chair of the Parades Commission"
83
73
  # Only use when in NNP mode
84
74
  multiterm << [term,tag]
85
75
  elsif state == @@NOUN and tag =~ /#{pos}/
@@ -115,6 +105,29 @@ class TermExtract
115
105
  end
116
106
 
117
107
  protected
108
+ def preprocess_tags(pos)
109
+ # Add in full stops to tag list to allow multiterms to work
110
+ tags = []
111
+ pos.each do |tag|
112
+ if tag[0] =~ /\.$/
113
+ tag[0].chop!
114
+ tags.push tag
115
+ tags.push ['.', '.']
116
+ else
117
+ tags.push tag
118
+ end
119
+ end
120
+ # Join certain prepositions together to allow them to be extracted
121
+ # e.g. allows 'News of the World' to be extracted
122
+ tags.each_with_index do |tag, index|
123
+ if tag[0] == 'of' && (index + 1) < tags.length && tags[index+1][0] == 'the'
124
+ tags[index][0] = 'of the'
125
+ tags.delete_at(index+1)
126
+ end
127
+ end
128
+ tags
129
+ end
130
+
118
131
  def add_term(term, tag, multiterm, terms)
119
132
  multiterm << ([term, tag])
120
133
  increment_term(term, tag, terms)
data/term-extract.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{term-extract}
8
- s.version = "0.2.1"
8
+ s.version = "0.3.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["rattle"]
12
- s.date = %q{2010-12-23}
12
+ s.date = %q{2010-12-24}
13
13
  s.email = %q{robl@rjlee.net}
14
14
  s.extra_rdoc_files = [
15
15
  "LICENSE.txt",
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: term-extract
3
3
  version: !ruby/object:Gem::Version
4
- hash: 21
4
+ hash: 19
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 2
9
- - 1
10
- version: 0.2.1
8
+ - 3
9
+ - 0
10
+ version: 0.3.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - rattle
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-12-23 00:00:00 +00:00
18
+ date: 2010-12-24 00:00:00 +00:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency