term-extract 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.markdown +6 -0
- data/VERSION +1 -1
- data/bin/term-extract +11 -0
- data/lib/term-extract.rb +8 -1
- data/term-extract.gemspec +5 -2
- data/test/test_term-extract.rb +5 -4
- metadata +8 -7
data/README.markdown
CHANGED
@@ -37,6 +37,12 @@ By default, the term extractor attempts to extract both ordinary nouns and prope
|
|
37
37
|
|
38
38
|
terms = TermExtract.extract(content, :types => :nnp)
|
39
39
|
|
40
|
+
## Command Line Tool
|
41
|
+
|
42
|
+
There is a command line tool that can be used for testing the term extractor. It is best used in conjunction with another tool to extract the relevant content (e.g. pismo):
|
43
|
+
|
44
|
+
pismo http://www.bbc.co.uk/news/uk-politics-12085506 body | ruby -rubygems -e 'puts YAML.parse($stdin.read)[:body].value' | ./term-extract nnp | ruby -rubygems -e 'puts YAML.load($stdin.read)'
|
45
|
+
|
40
46
|
## Note on Patches/Pull Requests
|
41
47
|
|
42
48
|
* Fork the project.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.4.0
|
1
|
+
0.5.0
|
data/bin/term-extract
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
require 'rubygems'
|
5
|
+
$:.unshift(File.dirname(__FILE__) + "/../lib")
|
6
|
+
require 'term-extract'
|
7
|
+
|
8
|
+
types = :nnp
|
9
|
+
types = ARGV.shift.to_sym if ARGV.length > 0
|
10
|
+
|
11
|
+
pp TermExtract.extract($stdin.readlines.join(" "), :types => types).to_yaml
|
data/lib/term-extract.rb
CHANGED
@@ -74,7 +74,7 @@ class TermExtract
|
|
74
74
|
# Allow preposition : "Secretary of State"
|
75
75
|
# Only use when in NNP mode
|
76
76
|
multiterm << [term,tag]
|
77
|
-
elsif state == @@NOUN and tag =~
|
77
|
+
elsif state == @@NOUN and tag =~ /^NN/
|
78
78
|
# In noun mode, found a noun, add a multiterm noun
|
79
79
|
add_term(term, tag, multiterm, terms)
|
80
80
|
elsif state == @@NOUN and tag !=~ /#{pos}/
|
@@ -96,11 +96,14 @@ class TermExtract
|
|
96
96
|
terms.each_key do |term|
|
97
97
|
occur = terms[term][:occurances]
|
98
98
|
strength = term.split(/ /).length
|
99
|
+
terms.delete(term) if occur < 1
|
99
100
|
terms.delete(term) unless ((strength == 1 and occur >= @min_occurance) or (strength >= @min_terms))
|
100
101
|
end
|
101
102
|
|
102
103
|
# Remove shorter terms that form part of larger terms
|
103
104
|
# This typically removes surname references when we already have a full name
|
105
|
+
# This doesn't test that the larger term has more occurrences than the smaller
|
106
|
+
# term as testing has shown issues with this approach
|
104
107
|
if @collapse_terms
|
105
108
|
terms.each_key do |term1|
|
106
109
|
terms.each_key do |term2|
|
@@ -151,12 +154,16 @@ class TermExtract
|
|
151
154
|
multiterm.each_with_index do |term, index|
|
152
155
|
if (multiterm[index] == multiterm.last && term[1] == 'POS')
|
153
156
|
# Don't add a final 's if it's the last term
|
157
|
+
elsif (multiterm[index] == multiterm.last && term[1] == 'IN' ||
|
158
|
+
multiterm[index] == multiterm.last && term[1] == 'JJ')
|
159
|
+
# Don't add a final preposition if it's the last term
|
154
160
|
else
|
155
161
|
# Don't require a space for POS type concats
|
156
162
|
word+= term[1] == 'POS' ? term[0] : " #{term[0]}"
|
157
163
|
end
|
158
164
|
end
|
159
165
|
word.lstrip!
|
166
|
+
# Add the term
|
160
167
|
increment_term(word, 'NNP', terms)
|
161
168
|
end
|
162
169
|
|
data/term-extract.gemspec
CHANGED
@@ -5,12 +5,14 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{term-extract}
|
8
|
-
s.version = "0.4.0"
|
8
|
+
s.version = "0.5.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["rattle"]
|
12
|
-
s.date = %q{2010-12-
|
12
|
+
s.date = %q{2010-12-29}
|
13
|
+
s.default_executable = %q{term-extract}
|
13
14
|
s.email = %q{robl@rjlee.net}
|
15
|
+
s.executables = ["term-extract"]
|
14
16
|
s.extra_rdoc_files = [
|
15
17
|
"LICENSE.txt",
|
16
18
|
"README.markdown"
|
@@ -23,6 +25,7 @@ Gem::Specification.new do |s|
|
|
23
25
|
"README.markdown",
|
24
26
|
"Rakefile",
|
25
27
|
"VERSION",
|
28
|
+
"bin/term-extract",
|
26
29
|
"lib/term-extract.rb",
|
27
30
|
"term-extract.gemspec",
|
28
31
|
"test/helper.rb",
|
data/test/test_term-extract.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
require 'helper'
|
2
|
-
require 'pp'
|
3
2
|
|
4
3
|
class TestTermExtract < Test::Unit::TestCase
|
5
4
|
|
@@ -14,7 +13,10 @@ DOC1
|
|
14
13
|
|
15
14
|
@@DOC2 = <<DOC2
|
16
15
|
Secretary of State Owen Paterson has appointed Peter Osborne as Chair of the
|
17
|
-
Parades Commission for Northern Ireland and six new Commission members.
|
16
|
+
Parades Commission for Northern Ireland and six new Commission members. Owen Paterson
|
17
|
+
said Osborne was an excellent choice for the position. Osborne was reported as
|
18
|
+
saying he was delighted at the appointment and thanked Owen Paterson for the honour.
|
19
|
+
Osborne then went on to outline his plans for the position.
|
18
20
|
DOC2
|
19
21
|
|
20
22
|
@@DOCUMENT = <<SOURCE
|
@@ -174,5 +176,4 @@ SOURCE
|
|
174
176
|
|
175
177
|
end
|
176
178
|
|
177
|
-
end
|
178
|
-
|
179
|
+
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: term-extract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 11
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 4
|
8
|
+
- 5
|
9
9
|
- 0
|
10
|
-
version: 0.4.0
|
10
|
+
version: 0.5.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- rattle
|
@@ -15,8 +15,8 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-12-
|
19
|
-
default_executable:
|
18
|
+
date: 2010-12-29 00:00:00 +00:00
|
19
|
+
default_executable: term-extract
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
22
22
|
prerelease: false
|
@@ -94,8 +94,8 @@ dependencies:
|
|
94
94
|
type: :development
|
95
95
|
description:
|
96
96
|
email: robl@rjlee.net
|
97
|
-
executables:
|
98
|
-
|
97
|
+
executables:
|
98
|
+
- term-extract
|
99
99
|
extensions: []
|
100
100
|
|
101
101
|
extra_rdoc_files:
|
@@ -109,6 +109,7 @@ files:
|
|
109
109
|
- README.markdown
|
110
110
|
- Rakefile
|
111
111
|
- VERSION
|
112
|
+
- bin/term-extract
|
112
113
|
- lib/term-extract.rb
|
113
114
|
- term-extract.gemspec
|
114
115
|
- test/helper.rb
|