term-extract 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -37,6 +37,12 @@ By default, the term extractor attempts to extract both ordinary nouns and prope
37
37
 
38
38
  terms = TermExtract.extract(content, :types => :nnp)
39
39
 
40
+ ## Command Line Tool
41
+
42
+ There is a command line tool that can be used for testing the term extractor. It is best used in conjunction with another tool to extract the relevant content (e.g. pismo):
43
+
44
+ pismo http://www.bbc.co.uk/news/uk-politics-12085506 body | ruby -rubygems -e 'puts YAML.parse($stdin.read)[:body].value' | ./term-extract nnp | ruby -rubygems -e 'puts YAML.load($stdin.read)'
45
+
40
46
  ## Note on Patches/Pull Requests
41
47
 
42
48
  * Fork the project.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.4.0
1
+ 0.5.0
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'yaml'
4
+ require 'rubygems'
5
+ $:.unshift(File.dirname(__FILE__) + "/../lib")
6
+ require 'term-extract'
7
+
8
+ types = :nnp
9
+ types = ARGV.shift.to_sym if ARGV.length > 0
10
+
11
+ pp TermExtract.extract($stdin.readlines.join(" "), :types => types).to_yaml
@@ -74,7 +74,7 @@ class TermExtract
74
74
  # Allow preposition : "Secretary of State"
75
75
  # Only use when in NNP mode
76
76
  multiterm << [term,tag]
77
- elsif state == @@NOUN and tag =~ /#{pos}/
77
+ elsif state == @@NOUN and tag =~ /^NN/
78
78
  # In noun mode, found a noun, add a multiterm noun
79
79
  add_term(term, tag, multiterm, terms)
80
80
  elsif state == @@NOUN and tag !=~ /#{pos}/
@@ -96,11 +96,14 @@ class TermExtract
96
96
  terms.each_key do |term|
97
97
  occur = terms[term][:occurances]
98
98
  strength = term.split(/ /).length
99
+ terms.delete(term) if occur < 1
99
100
  terms.delete(term) unless ((strength == 1 and occur >= @min_occurance) or (strength >= @min_terms))
100
101
  end
101
102
 
102
103
  # Remove shorter terms that form part of larger terms
103
104
  # This typically removes surname references when we already have a full name
105
+ # This doesn't test that the larger term has more occurrences than the smaller
106
+ # term as testing has shown issues with this approach
104
107
  if @collapse_terms
105
108
  terms.each_key do |term1|
106
109
  terms.each_key do |term2|
@@ -151,12 +154,16 @@ class TermExtract
151
154
  multiterm.each_with_index do |term, index|
152
155
  if (multiterm[index] == multiterm.last && term[1] == 'POS')
153
156
  # Don't add a final 's if it's the last term
157
+ elsif (multiterm[index] == multiterm.last && term[1] == 'IN' ||
158
+ multiterm[index] == multiterm.last && term[1] == 'JJ')
159
+ # Don't add a final preposition if it's the last term
154
160
  else
155
161
  # Don't require a space for POS type concats
156
162
  word+= term[1] == 'POS' ? term[0] : " #{term[0]}"
157
163
  end
158
164
  end
159
165
  word.lstrip!
166
+ # Add the term
160
167
  increment_term(word, 'NNP', terms)
161
168
  end
162
169
 
@@ -5,12 +5,14 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{term-extract}
8
- s.version = "0.4.0"
8
+ s.version = "0.5.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["rattle"]
12
- s.date = %q{2010-12-24}
12
+ s.date = %q{2010-12-29}
13
+ s.default_executable = %q{term-extract}
13
14
  s.email = %q{robl@rjlee.net}
15
+ s.executables = ["term-extract"]
14
16
  s.extra_rdoc_files = [
15
17
  "LICENSE.txt",
16
18
  "README.markdown"
@@ -23,6 +25,7 @@ Gem::Specification.new do |s|
23
25
  "README.markdown",
24
26
  "Rakefile",
25
27
  "VERSION",
28
+ "bin/term-extract",
26
29
  "lib/term-extract.rb",
27
30
  "term-extract.gemspec",
28
31
  "test/helper.rb",
@@ -1,5 +1,4 @@
1
1
  require 'helper'
2
- require 'pp'
3
2
 
4
3
  class TestTermExtract < Test::Unit::TestCase
5
4
 
@@ -14,7 +13,10 @@ DOC1
14
13
 
15
14
  @@DOC2 = <<DOC2
16
15
  Secretary of State Owen Paterson has appointed Peter Osborne as Chair of the
17
- Parades Commission for Northern Ireland and six new Commission members.
16
+ Parades Commission for Northern Ireland and six new Commission members. Owen Paterson
17
+ said Osborne was an excellent choice for the position. Osborne was reported as
18
+ saying he was delighted at the appointment and thanked Owen Paterson for the honour.
19
+ Osborne then went on to outline his plans for the position.
18
20
  DOC2
19
21
 
20
22
  @@DOCUMENT = <<SOURCE
@@ -174,5 +176,4 @@ SOURCE
174
176
 
175
177
  end
176
178
 
177
- end
178
-
179
+ end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: term-extract
3
3
  version: !ruby/object:Gem::Version
4
- hash: 15
4
+ hash: 11
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 4
8
+ - 5
9
9
  - 0
10
- version: 0.4.0
10
+ version: 0.5.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - rattle
@@ -15,8 +15,8 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-12-24 00:00:00 +00:00
19
- default_executable:
18
+ date: 2010-12-29 00:00:00 +00:00
19
+ default_executable: term-extract
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
22
22
  prerelease: false
@@ -94,8 +94,8 @@ dependencies:
94
94
  type: :development
95
95
  description:
96
96
  email: robl@rjlee.net
97
- executables: []
98
-
97
+ executables:
98
+ - term-extract
99
99
  extensions: []
100
100
 
101
101
  extra_rdoc_files:
@@ -109,6 +109,7 @@ files:
109
109
  - README.markdown
110
110
  - Rakefile
111
111
  - VERSION
112
+ - bin/term-extract
112
113
  - lib/term-extract.rb
113
114
  - term-extract.gemspec
114
115
  - test/helper.rb