term-extract 0.3.0 → 0.4.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
data/README.markdown CHANGED
@@ -25,6 +25,7 @@ The #extract method takes an (optional) options hash, that allows the term extra
25
25
  * min_terms - Always include multiword terms that comprise more than @min_terms words, default 2
26
26
  * types - Extract proper nouns (:nnp) or nouns (:nn) or both (:all), default :all
27
27
  * include_tags - Include the extracted POS tags in the results, default false
28
+ * collapse_terms - Remove shorter terms that are part of larger ones, default true
28
29
 
29
30
  Sample usage:
30
31
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.0
1
+ 0.4.0
data/lib/term-extract.rb CHANGED
@@ -27,6 +27,8 @@ class TermExtract
27
27
  @types = options.key?(:types) ? options.delete(:types) : :all
28
28
  # Include the extracted POS tags in the results
29
29
  @include_tags = options.key?(:include_tags) ? options.delete(:include_tags) : false
30
+ # Remove shorter terms that are part of larger ones
31
+ @collapse_terms = options.key?(:collapse_terms) ? options.delete(:collapse_terms) : true
30
32
  #@lazy = options.key?(:lazy) ? options.delete(:lazy) : false
31
33
  end
32
34
 
@@ -97,6 +99,16 @@ class TermExtract
97
99
  terms.delete(term) unless ((strength == 1 and occur >= @min_occurance) or (strength >= @min_terms))
98
100
  end
99
101
 
102
+ # Remove shorter terms that form part of larger terms
103
+ # This typically removes surname references when we already have a full name
104
+ if @collapse_terms
105
+ terms.each_key do |term1|
106
+ terms.each_key do |term2|
107
+ terms.delete(term2) if term1.length > term2.length && (term1 =~ /[^A-Za-z0-9]#{term2}$/ || term1 =~ /^#{term2}[^A-Za-z0-9]/)
108
+ end
109
+ end
110
+ end
111
+
100
112
  # Filter out tags unless required
101
113
  unless @include_tags
102
114
  terms.each_key { |term| terms[term] = terms[term][:occurances] }
data/term-extract.gemspec CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{term-extract}
8
- s.version = "0.3.0"
8
+ s.version = "0.4.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["rattle"]
@@ -1,4 +1,5 @@
1
1
  require 'helper'
2
+ require 'pp'
2
3
 
3
4
  class TestTermExtract < Test::Unit::TestCase
4
5
 
@@ -72,7 +73,7 @@ SOURCE
72
73
  'Palestinians hope',
73
74
  'Richard Makepeace',
74
75
  'court order',
75
- 'event',
76
+ #'event',
76
77
  'literature festival',
77
78
  'peace accords',
78
79
  'police notice',
@@ -108,17 +109,19 @@ SOURCE
108
109
  assert terms.keys.include?("St Paul's Cathedral")
109
110
  end
110
111
 
111
- should "extract terms with joining words" do
112
+ should "extract terms with prepositions" do
112
113
  terms = @te.extract(@@DOC2)
113
114
  assert terms.keys.include?("Secretary of State Owen Paterson")
114
115
  end
115
116
 
116
- should "extract terms and include pos tags when configured to" do
117
- @te.include_tags = true
118
- terms = @te.extract(@@DOCUMENT)
119
- term = terms.keys.first
120
- assert terms[term].key?(:tag)
121
- assert terms[term][:tag]
117
+ should "extract terms with long prepositions" do
118
+ terms = @te.extract(@@DOC2)
119
+ assert terms.keys.include?("Chair of the Parades Commission for Northern Ireland")
120
+ end
121
+
122
+ should "collapse duplicate terms" do
123
+ terms = @te.extract(@@DOC2)
124
+ assert !terms.keys.include?("event")
122
125
  end
123
126
 
124
127
  should "extract common nouns when configured to" do
@@ -162,8 +165,9 @@ SOURCE
162
165
 
163
166
  should "include pos tags in the results" do
164
167
  terms = @te.extract(@@DOCUMENT)
165
- assert terms.keys.include?("Jerusalem")
166
- assert terms['Jerusalem'][:tag] == 'NNP'
168
+ term = terms.keys.first
169
+ assert terms[term].key?(:tag)
170
+ assert terms[term][:tag]
167
171
  end
168
172
 
169
173
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: term-extract
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
4
+ hash: 15
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 3
8
+ - 4
9
9
  - 0
10
- version: 0.3.0
10
+ version: 0.4.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - rattle