term-extract 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.markdown +1 -0
- data/VERSION +1 -1
- data/lib/term-extract.rb +12 -0
- data/term-extract.gemspec +1 -1
- data/test/test_term-extract.rb +14 -10
- metadata +3 -3
data/README.markdown
CHANGED
@@ -25,6 +25,7 @@ The #extract method takes an (optional) options hash, that allows the term extra
|
|
25
25
|
* min_terms - Always include multiword terms that comprise more than @min_terms words, default 2
|
26
26
|
* types - Extract proper nouns (:nnp) or nouns (:nn) or both (:all), default :all
|
27
27
|
* include_tags - Include the extracted POS tags in the results, default false
|
28
|
+
* collapse_terms - Remove shorter terms that are part of larger ones, default true
|
28
29
|
|
29
30
|
Sample usage:
|
30
31
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.0
|
1
|
+
0.4.0
|
data/lib/term-extract.rb
CHANGED
@@ -27,6 +27,8 @@ class TermExtract
|
|
27
27
|
@types = options.key?(:types) ? options.delete(:types) : :all
|
28
28
|
# Include the extracted POS tags in the results
|
29
29
|
@include_tags = options.key?(:include_tags) ? options.delete(:include_tags) : false
|
30
|
+
# Remove shorter terms that are part of larger ones
|
31
|
+
@collapse_terms = options.key?(:collapse_terms) ? options.delete(:collapse_terms) : true
|
30
32
|
#@lazy = options.key?(:lazy) ? options.delete(:lazy) : false
|
31
33
|
end
|
32
34
|
|
@@ -97,6 +99,16 @@ class TermExtract
|
|
97
99
|
terms.delete(term) unless ((strength == 1 and occur >= @min_occurance) or (strength >= @min_terms))
|
98
100
|
end
|
99
101
|
|
102
|
+
# Remove shorter terms that form part of larger terms
|
103
|
+
# This typically removes surname references when we already have a full name
|
104
|
+
if @collapse_terms
|
105
|
+
terms.each_key do |term1|
|
106
|
+
terms.each_key do |term2|
|
107
|
+
terms.delete(term2) if term1.length > term2.length && (term1 =~ /[^A-Za-z0-9]#{term2}$/ || term1 =~ /^#{term2}[^A-Za-z0-9]/)
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
100
112
|
# Filter out tags unless required
|
101
113
|
unless @include_tags
|
102
114
|
terms.each_key { |term| terms[term] = terms[term][:occurances] }
|
data/term-extract.gemspec
CHANGED
data/test/test_term-extract.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'helper'
|
2
|
+
require 'pp'
|
2
3
|
|
3
4
|
class TestTermExtract < Test::Unit::TestCase
|
4
5
|
|
@@ -72,7 +73,7 @@ SOURCE
|
|
72
73
|
'Palestinians hope',
|
73
74
|
'Richard Makepeace',
|
74
75
|
'court order',
|
75
|
-
'event',
|
76
|
+
#'event',
|
76
77
|
'literature festival',
|
77
78
|
'peace accords',
|
78
79
|
'police notice',
|
@@ -108,17 +109,19 @@ SOURCE
|
|
108
109
|
assert terms.keys.include?("St Paul's Cathedral")
|
109
110
|
end
|
110
111
|
|
111
|
-
should "extract terms with
|
112
|
+
should "extract terms with prepositions" do
|
112
113
|
terms = @te.extract(@@DOC2)
|
113
114
|
assert terms.keys.include?("Secretary of State Owen Paterson")
|
114
115
|
end
|
115
116
|
|
116
|
-
should "extract terms
|
117
|
-
@te.
|
118
|
-
terms
|
119
|
-
|
120
|
-
|
121
|
-
|
117
|
+
should "extract terms with long prepositions" do
|
118
|
+
terms = @te.extract(@@DOC2)
|
119
|
+
assert terms.keys.include?("Chair of the Parades Commission for Northern Ireland")
|
120
|
+
end
|
121
|
+
|
122
|
+
should "collapse duplicate terms" do
|
123
|
+
terms = @te.extract(@@DOC2)
|
124
|
+
assert !terms.keys.include?("event")
|
122
125
|
end
|
123
126
|
|
124
127
|
should "extract common nouns when configured to" do
|
@@ -162,8 +165,9 @@ SOURCE
|
|
162
165
|
|
163
166
|
should "include pos tags in the results" do
|
164
167
|
terms = @te.extract(@@DOCUMENT)
|
165
|
-
|
166
|
-
assert terms[
|
168
|
+
term = terms.keys.first
|
169
|
+
assert terms[term].key?(:tag)
|
170
|
+
assert terms[term][:tag]
|
167
171
|
end
|
168
172
|
|
169
173
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: term-extract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 3
|
8
|
+
- 4
|
9
9
|
- 0
|
10
|
-
version: 0.3.0
|
10
|
+
version: 0.4.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- rattle
|