term-extract 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.markdown +1 -0
- data/VERSION +1 -1
- data/lib/term-extract.rb +12 -0
- data/term-extract.gemspec +1 -1
- data/test/test_term-extract.rb +14 -10
- metadata +3 -3
data/README.markdown
CHANGED
@@ -25,6 +25,7 @@ The #extract method takes an (optional) options hash, that allows the term extra
|
|
25
25
|
* min_terms - Always include multiword terms that comprise more than @min_terms words, default 2
|
26
26
|
* types - Extract proper nouns (:nnp) or nouns (:nn) or both (:all), default :all
|
27
27
|
* include_tags - Include the extracted POS tags in the results, default false
|
28
|
+
* collapse_terms - Remove shorter terms that are part of larger ones, default true
|
28
29
|
|
29
30
|
Sample usage:
|
30
31
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.0
|
1
|
+
0.4.0
|
data/lib/term-extract.rb
CHANGED
@@ -27,6 +27,8 @@ class TermExtract
|
|
27
27
|
@types = options.key?(:types) ? options.delete(:types) : :all
|
28
28
|
# Include the extracted POS tags in the results
|
29
29
|
@include_tags = options.key?(:include_tags) ? options.delete(:include_tags) : false
|
30
|
+
# Remove shorter terms that are part of larger ones
|
31
|
+
@collapse_terms = options.key?(:collapse_terms) ? options.delete(:collapse_terms) : true
|
30
32
|
#@lazy = options.key?(:lazy) ? options.delete(:lazy) : false
|
31
33
|
end
|
32
34
|
|
@@ -97,6 +99,16 @@ class TermExtract
|
|
97
99
|
terms.delete(term) unless ((strength == 1 and occur >= @min_occurance) or (strength >= @min_terms))
|
98
100
|
end
|
99
101
|
|
102
|
+
# Remove shorter terms that form part of larger terms
|
103
|
+
# This typically removes surname references when we already have a full name
|
104
|
+
if @collapse_terms
|
105
|
+
terms.each_key do |term1|
|
106
|
+
terms.each_key do |term2|
|
107
|
+
terms.delete(term2) if term1.length > term2.length && (term1 =~ /[^A-Za-z0-9]#{term2}$/ || term1 =~ /^#{term2}[^A-Za-z0-9]/)
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
100
112
|
# Filter out tags unless required
|
101
113
|
unless @include_tags
|
102
114
|
terms.each_key { |term| terms[term] = terms[term][:occurances] }
|
data/term-extract.gemspec
CHANGED
data/test/test_term-extract.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'helper'
|
2
|
+
require 'pp'
|
2
3
|
|
3
4
|
class TestTermExtract < Test::Unit::TestCase
|
4
5
|
|
@@ -72,7 +73,7 @@ SOURCE
|
|
72
73
|
'Palestinians hope',
|
73
74
|
'Richard Makepeace',
|
74
75
|
'court order',
|
75
|
-
'event',
|
76
|
+
#'event',
|
76
77
|
'literature festival',
|
77
78
|
'peace accords',
|
78
79
|
'police notice',
|
@@ -108,17 +109,19 @@ SOURCE
|
|
108
109
|
assert terms.keys.include?("St Paul's Cathedral")
|
109
110
|
end
|
110
111
|
|
111
|
-
should "extract terms with
|
112
|
+
should "extract terms with prepositions" do
|
112
113
|
terms = @te.extract(@@DOC2)
|
113
114
|
assert terms.keys.include?("Secretary of State Owen Paterson")
|
114
115
|
end
|
115
116
|
|
116
|
-
should "extract terms
|
117
|
-
@te.
|
118
|
-
terms
|
119
|
-
|
120
|
-
|
121
|
-
|
117
|
+
should "extract terms with long prepositions" do
|
118
|
+
terms = @te.extract(@@DOC2)
|
119
|
+
assert terms.keys.include?("Chair of the Parades Commission for Northern Ireland")
|
120
|
+
end
|
121
|
+
|
122
|
+
should "collapse duplicate terms" do
|
123
|
+
terms = @te.extract(@@DOC2)
|
124
|
+
assert !terms.keys.include?("event")
|
122
125
|
end
|
123
126
|
|
124
127
|
should "extract common nouns when configured to" do
|
@@ -162,8 +165,9 @@ SOURCE
|
|
162
165
|
|
163
166
|
should "include pos tags in the results" do
|
164
167
|
terms = @te.extract(@@DOCUMENT)
|
165
|
-
|
166
|
-
assert terms[
|
168
|
+
term = terms.keys.first
|
169
|
+
assert terms[term].key?(:tag)
|
170
|
+
assert terms[term][:tag]
|
167
171
|
end
|
168
172
|
|
169
173
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: term-extract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 3
|
8
|
+
- 4
|
9
9
|
- 0
|
10
|
-
version: 0.3.0
|
10
|
+
version: 0.4.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- rattle
|