term-extract 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.1
1
+ 0.3.0
data/lib/term-extract.rb CHANGED
@@ -32,22 +32,13 @@ class TermExtract
32
32
 
33
33
  def extract(content)
34
34
 
35
- tagger = @@TAGGER.nil? ? Brill::Tagger.new : @@TAGGER
36
-
37
35
  # Tidy content punctuation
38
36
  # Add a space after periods
39
37
  content.gsub!(/([A-Za-z0-9])\./, '\1. ')
40
- # Add in full stops to tag list to allow multiterms to work
41
- tags = []
42
- tagger.tag(content).each do |tag|
43
- if tag[0] =~ /\.$/
44
- tag[0].chop!
45
- tags.push tag
46
- tags.push ['.', '.']
47
- else
48
- tags.push tag
49
- end
50
- end
38
+
39
+ # Assign POS tags and tidy tag stack
40
+ tagger = @@TAGGER.nil? ? Brill::Tagger.new : @@TAGGER
41
+ tags = preprocess_tags(tagger.tag(content))
51
42
 
52
43
  # Set pos tags that identify nouns
53
44
  pos = "^NN"
@@ -79,7 +70,6 @@ class TermExtract
79
70
  multiterm << [term,tag]
80
71
  elsif state == @@NOUN and last_tag =~ /^(NNP|NNPS)$/ and tag == 'IN' and term =~ /(of|for|on|of\sthe|\&|d\'|du|de)/i
81
72
  # Allow preposition : "Secretary of State"
82
- # Doesn't support "Chair of the Parades Commission"
83
73
  # Only use when in NNP mode
84
74
  multiterm << [term,tag]
85
75
  elsif state == @@NOUN and tag =~ /#{pos}/
@@ -115,6 +105,29 @@ class TermExtract
115
105
  end
116
106
 
117
107
  protected
108
# Tidies the tagger output before terms are extracted from it.
#
# Two adjustments are made, in this order:
#   1. A term ending in a full stop is split into the term minus its last
#      character, followed by a separate ['.', '.'] entry, so multiterms
#      cannot run across a full stop.
#   2. An 'of' entry immediately followed by a 'the' entry is joined into a
#      single 'of the' entry that keeps the tag of 'of'. This allows
#      e.g. 'News of the World' to be extracted.
#
# pos - Array of [term, tag] pairs (presumably the result of the tagger's
#       `tag` call; confirm against the caller).
#
# Returns a new Array of [term, tag] pairs. Unlike an in-place edit, neither
# `pos`, its pairs, nor its strings are modified, so frozen input is safe.
def preprocess_tags(pos)
  # Pass 1: add in full stops to the tag list to allow multiterms to work.
  separated = []
  pos.each do |term, *rest|
    if term =~ /\.$/
      # NOTE(review): a bare '.' term produces an empty-string term followed
      # by the ['.', '.'] marker; this output is kept deliberately.
      separated << [term.chop, *rest]
      separated << ['.', '.']
    else
      separated << [term, *rest]
    end
  end

  # Pass 2: join 'of' + 'the'. The result is built in a fresh array instead
  # of deleting entries from the array being traversed.
  tags = []
  separated.each do |entry|
    previous = tags.last
    if previous && previous[0] == 'of' && entry[0] == 'the'
      # `previous` is one of the pairs created in pass 1, so replacing its
      # first element never touches the caller's data.
      previous[0] = 'of the'
    else
      tags << entry
    end
  end
  tags
end
130
+
118
131
  def add_term(term, tag, multiterm, terms)
119
132
  multiterm << ([term, tag])
120
133
  increment_term(term, tag, terms)
data/term-extract.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{term-extract}
8
- s.version = "0.2.1"
8
+ s.version = "0.3.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["rattle"]
12
- s.date = %q{2010-12-23}
12
+ s.date = %q{2010-12-24}
13
13
  s.email = %q{robl@rjlee.net}
14
14
  s.extra_rdoc_files = [
15
15
  "LICENSE.txt",
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: term-extract
3
3
  version: !ruby/object:Gem::Version
4
- hash: 21
4
+ hash: 19
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 2
9
- - 1
10
- version: 0.2.1
8
+ - 3
9
+ - 0
10
+ version: 0.3.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - rattle
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-12-23 00:00:00 +00:00
18
+ date: 2010-12-24 00:00:00 +00:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency