term-extract 0.4.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -37,6 +37,12 @@ By default, the term extractor attempts to extract both ordinary nouns and prope
37
37
 
38
38
  terms = TermExtract.extract(content, :types => :nnp)
39
39
 
40
+ ## Command Line Tool
41
+
42
+ There is a command line tool that can be used for testing the term extractor. It is best used in conjunction with another tool to extract the relevant content (e.g. pismo):
43
+
44
+ pismo http://www.bbc.co.uk/news/uk-politics-12085506 body | ruby -rubygems -e 'puts YAML.parse($stdin.read)[:body].value' | ./term-extract nnp | ruby -rubygems -e 'puts YAML.load($stdin.read)'
45
+
40
46
  ## Note on Patches/Pull Requests
41
47
 
42
48
  * Fork the project.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.4.0
1
+ 0.5.0
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'yaml'
4
+ require 'rubygems'
5
+ $:.unshift(File.dirname(__FILE__) + "/../lib")
6
+ require 'term-extract'
7
+
8
+ types = :nnp
9
+ types = ARGV.shift.to_sym if ARGV.length > 0
10
+
11
+ pp TermExtract.extract($stdin.readlines.join(" "), :types => types).to_yaml
@@ -74,7 +74,7 @@ class TermExtract
74
74
  # Allow preposition : "Secretary of State"
75
75
  # Only use when in NNP mode
76
76
  multiterm << [term,tag]
77
- elsif state == @@NOUN and tag =~ /#{pos}/
77
+ elsif state == @@NOUN and tag =~ /^NN/
78
78
  # In noun mode, found a noun, add a multiterm noun
79
79
  add_term(term, tag, multiterm, terms)
80
80
  elsif state == @@NOUN and tag !=~ /#{pos}/
@@ -96,11 +96,14 @@ class TermExtract
96
96
  terms.each_key do |term|
97
97
  occur = terms[term][:occurances]
98
98
  strength = term.split(/ /).length
99
+ terms.delete(term) if occur < 1
99
100
  terms.delete(term) unless ((strength == 1 and occur >= @min_occurance) or (strength >= @min_terms))
100
101
  end
101
102
 
102
103
  # Remove shorter terms that form part of larger terms
103
104
  # This typically removes surname references when we already have a full name
105
+ # This doesn't test that the larger term has more occurrences than the smaller
106
+ # term as testing has shown issues with this approach
104
107
  if @collapse_terms
105
108
  terms.each_key do |term1|
106
109
  terms.each_key do |term2|
@@ -151,12 +154,16 @@ class TermExtract
151
154
  multiterm.each_with_index do |term, index|
152
155
  if (multiterm[index] == multiterm.last && term[1] == 'POS')
153
156
  # Don't add a final 's if it's the last term
157
+ elsif (multiterm[index] == multiterm.last && term[1] == 'IN' ||
158
+ multiterm[index] == multiterm.last && term[1] == 'JJ')
159
+ # Don't add a final preposition (IN) or adjective (JJ) if it's the last term
154
160
  else
155
161
  # Don't require a space for POS type concats
156
162
  word+= term[1] == 'POS' ? term[0] : " #{term[0]}"
157
163
  end
158
164
  end
159
165
  word.lstrip!
166
+ # Add the term
160
167
  increment_term(word, 'NNP', terms)
161
168
  end
162
169
 
@@ -5,12 +5,14 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{term-extract}
8
- s.version = "0.4.0"
8
+ s.version = "0.5.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["rattle"]
12
- s.date = %q{2010-12-24}
12
+ s.date = %q{2010-12-29}
13
+ s.default_executable = %q{term-extract}
13
14
  s.email = %q{robl@rjlee.net}
15
+ s.executables = ["term-extract"]
14
16
  s.extra_rdoc_files = [
15
17
  "LICENSE.txt",
16
18
  "README.markdown"
@@ -23,6 +25,7 @@ Gem::Specification.new do |s|
23
25
  "README.markdown",
24
26
  "Rakefile",
25
27
  "VERSION",
28
+ "bin/term-extract",
26
29
  "lib/term-extract.rb",
27
30
  "term-extract.gemspec",
28
31
  "test/helper.rb",
@@ -1,5 +1,4 @@
1
1
  require 'helper'
2
- require 'pp'
3
2
 
4
3
  class TestTermExtract < Test::Unit::TestCase
5
4
 
@@ -14,7 +13,10 @@ DOC1
14
13
 
15
14
  @@DOC2 = <<DOC2
16
15
  Secretary of State Owen Paterson has appointed Peter Osborne as Chair of the
17
- Parades Commission for Northern Ireland and six new Commission members.
16
+ Parades Commission for Northern Ireland and six new Commission members. Owen Paterson
17
+ said Osborne was an excellent choice for the position. Osborne was reported as
18
+ saying he was delighted at the appointment and thanked Owen Paterson for the honour.
19
+ Osborne then went on to outline his plans for the position.
18
20
  DOC2
19
21
 
20
22
  @@DOCUMENT = <<SOURCE
@@ -174,5 +176,4 @@ SOURCE
174
176
 
175
177
  end
176
178
 
177
- end
178
-
179
+ end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: term-extract
3
3
  version: !ruby/object:Gem::Version
4
- hash: 15
4
+ hash: 11
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 4
8
+ - 5
9
9
  - 0
10
- version: 0.4.0
10
+ version: 0.5.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - rattle
@@ -15,8 +15,8 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-12-24 00:00:00 +00:00
19
- default_executable:
18
+ date: 2010-12-29 00:00:00 +00:00
19
+ default_executable: term-extract
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
22
22
  prerelease: false
@@ -94,8 +94,8 @@ dependencies:
94
94
  type: :development
95
95
  description:
96
96
  email: robl@rjlee.net
97
- executables: []
98
-
97
+ executables:
98
+ - term-extract
99
99
  extensions: []
100
100
 
101
101
  extra_rdoc_files:
@@ -109,6 +109,7 @@ files:
109
109
  - README.markdown
110
110
  - Rakefile
111
111
  - VERSION
112
+ - bin/term-extract
112
113
  - lib/term-extract.rb
113
114
  - term-extract.gemspec
114
115
  - test/helper.rb