term-extract 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.markdown CHANGED
@@ -25,6 +25,7 @@ The #extract method takes an (optional) options hash, that allows the term extra
25
25
  * min_terms - Always include multiword terms that comprise at least @min_terms words, default 2
26
26
  * types - Extract proper nouns (:nnp) or nouns (:nn) or both (:all), default :all
27
27
  * include_tags - Include the extracted POS tags in the results, default false
28
+ * collapse_terms - Remove shorter terms that are part of larger ones, default true
28
29
 
29
30
  Sample usage:
30
31
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.0
1
+ 0.4.0
data/lib/term-extract.rb CHANGED
@@ -27,6 +27,8 @@ class TermExtract
27
27
  @types = options.key?(:types) ? options.delete(:types) : :all
28
28
  # Include the extracted POS tags in the results
29
29
  @include_tags = options.key?(:include_tags) ? options.delete(:include_tags) : false
30
+ # Remove shorter terms that are part of larger ones
31
+ @collapse_terms = options.key?(:collapse_terms) ? options.delete(:collapse_terms) : true
30
32
  #@lazy = options.key?(:lazy) ? options.delete(:lazy) : false
31
33
  end
32
34
 
@@ -97,6 +99,16 @@ class TermExtract
97
99
  terms.delete(term) unless ((strength == 1 and occur >= @min_occurance) or (strength >= @min_terms))
98
100
  end
99
101
 
102
+ # Remove shorter terms that form part of larger terms
103
+ # This typically removes surname references when we already have a full name
104
+ if @collapse_terms
105
+ terms.each_key do |term1|
106
+ terms.each_key do |term2|
107
+ terms.delete(term2) if term1.length > term2.length && (term1 =~ /[^A-Za-z0-9]#{term2}$/ || term1 =~ /^#{term2}[^A-Za-z0-9]/)
108
+ end
109
+ end
110
+ end
111
+
100
112
  # Filter out tags unless required
101
113
  unless @include_tags
102
114
  terms.each_key { |term| terms[term] = terms[term][:occurances] }
data/term-extract.gemspec CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{term-extract}
8
- s.version = "0.3.0"
8
+ s.version = "0.4.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["rattle"]
@@ -1,4 +1,5 @@
1
1
  require 'helper'
2
+ require 'pp'
2
3
 
3
4
  class TestTermExtract < Test::Unit::TestCase
4
5
 
@@ -72,7 +73,7 @@ SOURCE
72
73
  'Palestinians hope',
73
74
  'Richard Makepeace',
74
75
  'court order',
75
- 'event',
76
+ #'event',
76
77
  'literature festival',
77
78
  'peace accords',
78
79
  'police notice',
@@ -108,17 +109,19 @@ SOURCE
108
109
  assert terms.keys.include?("St Paul's Cathedral")
109
110
  end
110
111
 
111
- should "extract terms with joining words" do
112
+ should "extract terms with prepositions" do
112
113
  terms = @te.extract(@@DOC2)
113
114
  assert terms.keys.include?("Secretary of State Owen Paterson")
114
115
  end
115
116
 
116
- should "extract terms and include pos tags when configured to" do
117
- @te.include_tags = true
118
- terms = @te.extract(@@DOCUMENT)
119
- term = terms.keys.first
120
- assert terms[term].key?(:tag)
121
- assert terms[term][:tag]
117
+ should "extract terms with long prepositions" do
118
+ terms = @te.extract(@@DOC2)
119
+ assert terms.keys.include?("Chair of the Parades Commission for Northern Ireland")
120
+ end
121
+
122
+ should "collapse duplicate terms" do
123
+ terms = @te.extract(@@DOC2)
124
+ assert !terms.keys.include?("event")
122
125
  end
123
126
 
124
127
  should "extract common nouns when configured to" do
@@ -162,8 +165,9 @@ SOURCE
162
165
 
163
166
  should "include pos tags in the results" do
164
167
  terms = @te.extract(@@DOCUMENT)
165
- assert terms.keys.include?("Jerusalem")
166
- assert terms['Jerusalem'][:tag] == 'NNP'
168
+ term = terms.keys.first
169
+ assert terms[term].key?(:tag)
170
+ assert terms[term][:tag]
167
171
  end
168
172
 
169
173
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: term-extract
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
4
+ hash: 15
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 3
8
+ - 4
9
9
  - 0
10
- version: 0.3.0
10
+ version: 0.4.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - rattle