term-extract 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.markdown +6 -0
- data/VERSION +1 -1
- data/bin/term-extract +11 -0
- data/lib/term-extract.rb +8 -1
- data/term-extract.gemspec +5 -2
- data/test/test_term-extract.rb +5 -4
- metadata +8 -7
data/README.markdown
CHANGED
@@ -37,6 +37,12 @@ By default, the term extractor attempts to extract both ordinary nouns and prope
|
|
37
37
|
|
38
38
|
terms = TermExtract.extract(content, :types => :nnp)
|
39
39
|
|
40
|
+
## Command Line Tool
|
41
|
+
|
42
|
+
There is a command line tool that can be used for testing the term extractor. It is best used in conjunction with another tool to extract the relevant content (e.g. pismo):
|
43
|
+
|
44
|
+
pismo http://www.bbc.co.uk/news/uk-politics-12085506 body | ruby -rubygems -e 'puts YAML.parse($stdin.read)[:body].value' | ./term-extract nnp | ruby -rubygems -e 'puts YAML.load($stdin.read)'
|
45
|
+
|
40
46
|
## Note on Patches/Pull Requests
|
41
47
|
|
42
48
|
* Fork the project.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.4.0
|
1
|
+
0.5.0
|
data/bin/term-extract
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
require 'rubygems'
|
5
|
+
$:.unshift(File.dirname(__FILE__) + "/../lib")
|
6
|
+
require 'term-extract'
|
7
|
+
|
8
|
+
types = :nnp
|
9
|
+
types = ARGV.shift.to_sym if ARGV.length > 0
|
10
|
+
|
11
|
+
pp TermExtract.extract($stdin.readlines.join(" "), :types => types).to_yaml
|
data/lib/term-extract.rb
CHANGED
@@ -74,7 +74,7 @@ class TermExtract
|
|
74
74
|
# Allow preposition : "Secretary of State"
|
75
75
|
# Only use when in NNP mode
|
76
76
|
multiterm << [term,tag]
|
77
|
-
elsif state == @@NOUN and tag =~
|
77
|
+
elsif state == @@NOUN and tag =~ /^NN/
|
78
78
|
# In noun mode, found a noun, add a multiterm noun
|
79
79
|
add_term(term, tag, multiterm, terms)
|
80
80
|
elsif state == @@NOUN and tag !=~ /#{pos}/
|
@@ -96,11 +96,14 @@ class TermExtract
|
|
96
96
|
terms.each_key do |term|
|
97
97
|
occur = terms[term][:occurances]
|
98
98
|
strength = term.split(/ /).length
|
99
|
+
terms.delete(term) if occur < 1
|
99
100
|
terms.delete(term) unless ((strength == 1 and occur >= @min_occurance) or (strength >= @min_terms))
|
100
101
|
end
|
101
102
|
|
102
103
|
# Remove shorter terms that form part of larger terms
|
103
104
|
# This typically removes surname references when we already have a full name
|
105
|
+
# This doesn't test that the larger term has more occurrences than the smaller
|
106
|
+
# term as testing has shown issues with this approach
|
104
107
|
if @collapse_terms
|
105
108
|
terms.each_key do |term1|
|
106
109
|
terms.each_key do |term2|
|
@@ -151,12 +154,16 @@ class TermExtract
|
|
151
154
|
multiterm.each_with_index do |term, index|
|
152
155
|
if (multiterm[index] == multiterm.last && term[1] == 'POS')
|
153
156
|
# Don't add a final 's if it's the last term
|
157
|
+
elsif (multiterm[index] == multiterm.last && term[1] == 'IN' ||
|
158
|
+
multiterm[index] == multiterm.last && term[1] == 'JJ')
|
159
|
+
# Don't add a final preposition if it's the last term
|
154
160
|
else
|
155
161
|
# Don't require a space for POS type concats
|
156
162
|
word+= term[1] == 'POS' ? term[0] : " #{term[0]}"
|
157
163
|
end
|
158
164
|
end
|
159
165
|
word.lstrip!
|
166
|
+
# Add the term
|
160
167
|
increment_term(word, 'NNP', terms)
|
161
168
|
end
|
162
169
|
|
data/term-extract.gemspec
CHANGED
@@ -5,12 +5,14 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{term-extract}
|
8
|
-
s.version = "0.
|
8
|
+
s.version = "0.5.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["rattle"]
|
12
|
-
s.date = %q{2010-12-
|
12
|
+
s.date = %q{2010-12-29}
|
13
|
+
s.default_executable = %q{term-extract}
|
13
14
|
s.email = %q{robl@rjlee.net}
|
15
|
+
s.executables = ["term-extract"]
|
14
16
|
s.extra_rdoc_files = [
|
15
17
|
"LICENSE.txt",
|
16
18
|
"README.markdown"
|
@@ -23,6 +25,7 @@ Gem::Specification.new do |s|
|
|
23
25
|
"README.markdown",
|
24
26
|
"Rakefile",
|
25
27
|
"VERSION",
|
28
|
+
"bin/term-extract",
|
26
29
|
"lib/term-extract.rb",
|
27
30
|
"term-extract.gemspec",
|
28
31
|
"test/helper.rb",
|
data/test/test_term-extract.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
require 'helper'
|
2
|
-
require 'pp'
|
3
2
|
|
4
3
|
class TestTermExtract < Test::Unit::TestCase
|
5
4
|
|
@@ -14,7 +13,10 @@ DOC1
|
|
14
13
|
|
15
14
|
@@DOC2 = <<DOC2
|
16
15
|
Secretary of State Owen Paterson has appointed Peter Osborne as Chair of the
|
17
|
-
Parades Commission for Northern Ireland and six new Commission members.
|
16
|
+
Parades Commission for Northern Ireland and six new Commission members. Owen Paterson
|
17
|
+
said Osborne was an excellent choice for the position. Osborne was reported as
|
18
|
+
saying he was delighted at the appointment and thanked Owen Paterson for the honour.
|
19
|
+
Osborne then went on to outline his plans for the position.
|
18
20
|
DOC2
|
19
21
|
|
20
22
|
@@DOCUMENT = <<SOURCE
|
@@ -174,5 +176,4 @@ SOURCE
|
|
174
176
|
|
175
177
|
end
|
176
178
|
|
177
|
-
end
|
178
|
-
|
179
|
+
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: term-extract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 11
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
8
|
+
- 5
|
9
9
|
- 0
|
10
|
-
version: 0.4.0
|
10
|
+
version: 0.5.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- rattle
|
@@ -15,8 +15,8 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-12-
|
19
|
-
default_executable:
|
18
|
+
date: 2010-12-29 00:00:00 +00:00
|
19
|
+
default_executable: term-extract
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
22
22
|
prerelease: false
|
@@ -94,8 +94,8 @@ dependencies:
|
|
94
94
|
type: :development
|
95
95
|
description:
|
96
96
|
email: robl@rjlee.net
|
97
|
-
executables:
|
98
|
-
|
97
|
+
executables:
|
98
|
+
- term-extract
|
99
99
|
extensions: []
|
100
100
|
|
101
101
|
extra_rdoc_files:
|
@@ -109,6 +109,7 @@ files:
|
|
109
109
|
- README.markdown
|
110
110
|
- Rakefile
|
111
111
|
- VERSION
|
112
|
+
- bin/term-extract
|
112
113
|
- lib/term-extract.rb
|
113
114
|
- term-extract.gemspec
|
114
115
|
- test/helper.rb
|