term-extract 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +22 -0
- data/LICENSE.txt +5 -0
- data/README.markdown +57 -0
- data/Rakefile +53 -0
- data/VERSION +1 -0
- data/lib/term-extract.rb +148 -0
- data/term-extract.gemspec +69 -0
- data/test/helper.rb +18 -0
- data/test/test_term-extract.rb +174 -0
- metadata +168 -0
data/.document
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
source "http://rubygems.org"
|
2
|
+
# Add dependencies required to use your gem here.
|
3
|
+
# Example:
|
4
|
+
# gem "activesupport", ">= 2.3.5"
|
5
|
+
|
6
|
+
gem 'rbtagger', ">=0"
|
7
|
+
|
8
|
+
# Add dependencies to develop your gem here.
|
9
|
+
# Include everything needed to run rake, tests, features, etc.
|
10
|
+
group :development do
|
11
|
+
gem "shoulda", ">= 0"
|
12
|
+
gem "bundler", "~> 1.0.0"
|
13
|
+
gem "jeweler", "~> 1.5.2"
|
14
|
+
gem "rcov", ">= 0"
|
15
|
+
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
git (1.2.5)
|
5
|
+
jeweler (1.5.2)
|
6
|
+
bundler (~> 1.0.0)
|
7
|
+
git (>= 1.2.5)
|
8
|
+
rake
|
9
|
+
rake (0.8.7)
|
10
|
+
rbtagger (0.4.6)
|
11
|
+
rcov (0.9.9)
|
12
|
+
shoulda (2.11.3)
|
13
|
+
|
14
|
+
PLATFORMS
|
15
|
+
ruby
|
16
|
+
|
17
|
+
DEPENDENCIES
|
18
|
+
bundler (~> 1.0.0)
|
19
|
+
jeweler (~> 1.5.2)
|
20
|
+
rbtagger
|
21
|
+
rcov
|
22
|
+
shoulda
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,5 @@
|
|
1
|
+
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
|
2
|
+
|
3
|
+
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
4
|
+
|
5
|
+
You should have received a copy of the GNU General Public License along with this program. If not, see <www.gnu.org/licenses/>
|
data/README.markdown
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
# term-extract - Term Extract
|
2
|
+
|
3
|
+
## Description:
|
4
|
+
|
5
|
+
term-extract extracts proper nouns (named things like 'Manchester United') and ordinary nouns (like 'event') from text documents.
|
6
|
+
|
7
|
+
## Usage:
|
8
|
+
|
9
|
+
An example extracting terms from a piece of content:
|
10
|
+
|
11
|
+
require 'term-extract'
|
12
|
+
|
13
|
+
content = <<DOC
|
14
|
+
Business Secretary Vince Cable will stay in cabinet despite
|
15
|
+
"declaring war" on Rupert Murdoch, says Downing Street.
|
16
|
+
DOC
|
17
|
+
|
18
|
+
terms = TermExtract.extract(content)
|
19
|
+
|
20
|
+
## Options
|
21
|
+
|
22
|
+
The TermExtract.extract class method (and TermExtract.new) takes an optional options hash that allows the term extractor behaviour to be modified. The following options are available:
|
23
|
+
|
24
|
+
* min_occurance - The minimum number of times a single word term must occur to be included in the results, default 3
|
25
|
+
* min_terms - Always include multiword terms that comprise at least min_terms words, default 2
|
26
|
+
* types - Extract proper nouns (:nnp) or nouns (:nn) or both (:all), default :all
|
27
|
+
* include_tags - Include the extracted POS tags in the results, default false
|
28
|
+
|
29
|
+
Sample usage:
|
30
|
+
|
31
|
+
terms = TermExtract.extract(content, :types => :nnp, :include_tags => true)
|
32
|
+
|
33
|
+
## Term Extraction Types
|
34
|
+
|
35
|
+
By default, the term extractor attempts to extract both ordinary nouns and proper nouns. This behaviour can be configured using the :types option and specifying :all (for both), :nn (for ordinary nouns) or :nnp (for proper nouns). These codes correspond to the relevant POS tags used during the term extraction process. Sample usage is shown below:
|
36
|
+
|
37
|
+
terms = TermExtract.extract(content, :types => :nnp)
|
38
|
+
|
39
|
+
## Note on Patches/Pull Requests
|
40
|
+
|
41
|
+
* Fork the project.
|
42
|
+
* Make your feature addition or bug fix.
|
43
|
+
* Add tests for it. This is important so I don't break it in a future version unintentionally.
|
44
|
+
* Commit, do not mess with Rakefile, version, or history as it's handled by Jeweler.
|
45
|
+
* Send me a pull request. I may or may not accept it.
|
46
|
+
|
47
|
+
## Acknowledgements
|
48
|
+
|
49
|
+
The algorithm and extraction code is based on the original python code at:
|
50
|
+
|
51
|
+
http://pypi.python.org/pypi/topia.termextract/
|
52
|
+
|
53
|
+
## Copyright and License
|
54
|
+
|
55
|
+
GPL v3 - See LICENSE.txt for details.
|
56
|
+
Copyright (c) 2010, Rob Lee
|
57
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'rake'
|
11
|
+
|
12
|
+
require 'jeweler'
|
13
|
+
Jeweler::Tasks.new do |gem|
|
14
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
15
|
+
gem.name = "term-extract"
|
16
|
+
gem.homepage = "http://github.com/rattle/term-extract"
|
17
|
+
gem.license = "GPLv3"
|
18
|
+
gem.summary = %Q{Provides term extraction functionality}
|
19
|
+
gem.email = "robl@rjlee.net"
|
20
|
+
gem.authors = ["rattle"]
|
21
|
+
# Include your dependencies below. Runtime dependencies are required when using your gem,
|
22
|
+
# and development dependencies are only needed for development (ie running rake tasks, tests, etc)
|
23
|
+
# gem.add_runtime_dependency 'jabber4r', '> 0.1'
|
24
|
+
# gem.add_development_dependency 'rspec', '> 1.2.3'
|
25
|
+
gem.add_dependency('rbtagger', '>= 0.0.0')
|
26
|
+
end
|
27
|
+
Jeweler::RubygemsDotOrgTasks.new
|
28
|
+
|
29
|
+
require 'rake/testtask'
|
30
|
+
Rake::TestTask.new(:test) do |test|
|
31
|
+
test.libs << 'lib' << 'test'
|
32
|
+
test.pattern = 'test/**/test_*.rb'
|
33
|
+
test.verbose = true
|
34
|
+
end
|
35
|
+
|
36
|
+
require 'rcov/rcovtask'
|
37
|
+
Rcov::RcovTask.new do |test|
|
38
|
+
test.libs << 'test'
|
39
|
+
test.pattern = 'test/**/test_*.rb'
|
40
|
+
test.verbose = true
|
41
|
+
end
|
42
|
+
|
43
|
+
task :default => :test
|
44
|
+
|
45
|
+
require 'rake/rdoctask'
|
46
|
+
Rake::RDocTask.new do |rdoc|
|
47
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
48
|
+
|
49
|
+
rdoc.rdoc_dir = 'rdoc'
|
50
|
+
rdoc.title = "term-extract #{version}"
|
51
|
+
rdoc.rdoc_files.include('README*')
|
52
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
53
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.2.0
|
data/lib/term-extract.rb
ADDED
@@ -0,0 +1,148 @@
|
|
1
|
+
require 'rbtagger'
|
2
|
+
|
3
|
+
# Based on :
|
4
|
+
# http://pypi.python.org/pypi/topia.termextract/
|
5
|
+
|
6
|
+
# Extracts noun terms (single nouns and multi-word noun phrases) from a piece
# of text, using the part-of-speech tags produced by a Brill tagger.
#
# The result is a Hash keyed by term. Values are occurrence counts, or
# { :occurances => Integer, :tag => String } hashes when include_tags is set.
class TermExtract

  # Scanner states: looking for the start of a term / inside a run of nouns.
  @@SEARCH = 0
  @@NOUN = 1

  # Shared tagger instance. It is built at load time when rbtagger is already
  # loaded; otherwise #extract builds it on first use.
  @@TAGGER = defined?(Brill::Tagger) ? Brill::Tagger.new : nil

  # POS tags of proper nouns; only these may be followed by a joining word.
  PROPER_NOUN = /^(NNP|NNPS)$/

  # Joining words allowed inside a proper-noun phrase ("Secretary of State").
  # Anchored so that only the whole token matches; an unanchored pattern also
  # accepts words such as "under" or "before" that merely contain "de"/"for".
  JOINING_WORD = /\A(of|for|on|of\sthe|&|d'|du|de)\z/i

  attr_accessor :min_occurance, :min_terms, :types, :include_tags, :lazy

  # Provide a class method for syntactic sugar.
  #
  # content - String of text to analyse (not modified).
  # options - Hash of extractor options, see #initialize (not modified).
  #
  # Returns the Hash of extracted terms, see #extract.
  def self.extract(content, options = {})
    new(options).extract(content)
  end

  # options - Hash of optional settings. The hash is only read, never changed.
  #           :min_occurance - minimum number of times a single word term
  #                            must occur to be included (default 3)
  #           :min_terms     - always include multiword terms made of at
  #                            least this many words (default 2)
  #           :types         - :nnp (proper nouns), :nn (nouns) or :all
  #                            (default :all)
  #           :include_tags  - include the POS tag of each term in the
  #                            results (default false)
  def initialize(options = {})
    @min_occurance = options.fetch(:min_occurance, 3)
    @min_terms     = options.fetch(:min_terms, 2)
    @types         = options.fetch(:types, :all)
    @include_tags  = options.fetch(:include_tags, false)
  end

  # Extracts the terms found in content.
  #
  # content - String of text to analyse. It is not modified, so frozen
  #           strings and shared fixtures are safe to pass.
  #
  # Returns a Hash of term => occurrence count, or of
  # term => { :occurances => count, :tag => tag } when include_tags is set.
  def extract(content)
    tagger = (@@TAGGER ||= Brill::Tagger.new)
    noun = noun_pattern

    terms = {}
    multiterm = []
    last_tag = ''
    state = @@SEARCH

    # Walk the tagged tokens, collecting runs of nouns into multiterms.
    tagged_tokens(tagger, content).each do |term, tag|
      is_noun = tag =~ noun

      if state == @@SEARCH
        # A noun starts a term; so does a capitalised adjective, which allows
        # things like 'Good' at the start of sentences.
        if is_noun || (tag == 'JJ' && term =~ /^[A-Z]/)
          state = @@NOUN
          add_term(term, tag, multiterm, terms)
        end
      elsif tag == 'POS'
        # Allow nouns with apostrophes : St Paul's Cathedral
        multiterm << [term, tag]
      elsif last_tag =~ PROPER_NOUN && tag == 'IN' && term =~ JOINING_WORD
        # Allow a joining preposition after a proper noun : "Secretary of State"
        # Doesn't support "Chair of the Parades Commission"
        multiterm << [term, tag]
      elsif is_noun
        # In noun mode, found a noun: extend the current multiterm
        add_term(term, tag, multiterm, terms)
      else
        # In noun mode, found a non-noun: the run is over, record it if it
        # spans more than one token
        state = @@SEARCH
        add_multiterm(multiterm, terms) if multiterm.length > 1
        multiterm = []
      end

      last_tag = tag
    end

    # The text may end in the middle of a run of nouns. Apply the same
    # "more than one token" rule as inside the loop, so a lone trailing noun
    # keeps its own tag instead of being re-recorded as a multiterm.
    add_multiterm(multiterm, terms) if last_tag =~ noun && multiterm.length > 1

    # Filter out terms that don't meet minimum requirements. A term with
    # multiple words is kept even if it doesn't meet min_occurance (a
    # multiterm noun is very likely to be correct).
    terms.delete_if do |term, data|
      strength = term.split(/ /).length
      !((strength == 1 && data[:occurances] >= @min_occurance) || strength >= @min_terms)
    end

    # Reduce each entry to its count unless tags were requested
    unless @include_tags
      terms.keys.each { |term| terms[term] = terms[term][:occurances] }
    end
    terms
  end

  protected

  # Returns the Regexp matching the POS tags treated as nouns for @types.
  def noun_pattern
    case @types
    when :nn  then /^(NN|NNS)$/
    when :nnp then /^(NNP|NNPS)$/
    else /^NN/
    end
  end

  # Tags content and returns an Array of [word, tag] pairs. A space is added
  # after full stops that follow a letter or digit before tagging, and a word
  # ending in '.' is split into the bare word plus a ['.', '.'] marker so that
  # multiterms stop at sentence ends.
  def tagged_tokens(tagger, content)
    spaced = content.gsub(/([A-Za-z0-9])\./, '\1. ')
    tokens = []
    tagger.tag(spaced).each do |word, tag|
      if word =~ /\.$/
        tokens << [word.chop, tag]
        tokens << ['.', '.']
      else
        tokens << [word, tag]
      end
    end
    tokens
  end

  # Appends term to the current multiterm and counts it as a single term.
  def add_term(term, tag, multiterm, terms)
    multiterm << [term, tag]
    increment_term(term, tag, terms)
  end

  # Records the words collected in multiterm as one term (tagged 'NNP') and
  # takes one occurrence back from each of its already-counted words.
  def add_multiterm(multiterm, terms)
    multiterm.each do |word, _tag|
      entry = terms[word]
      entry[:occurances] -= 1 if entry && entry[:occurances] > 0
    end

    last_index = multiterm.length - 1
    phrase = String.new
    multiterm.each_with_index do |(word, tag), index|
      if tag == 'POS'
        # Possessives attach without a space; a final 's is dropped. The
        # position is compared, so an identical 's earlier in the phrase
        # is kept.
        phrase << word unless index == last_index
      else
        phrase << " #{word}"
      end
    end

    increment_term(phrase.lstrip, 'NNP', terms)
  end

  # Adds one occurrence of term to terms and stores tag as its latest tag.
  def increment_term(term, tag, terms)
    entry = (terms[term] ||= { :occurances => 0 })
    entry[:occurances] += 1
    entry[:tag] = tag
  end

end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{term-extract}
|
8
|
+
s.version = "0.2.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["rattle"]
|
12
|
+
s.date = %q{2010-12-23}
|
13
|
+
s.email = %q{robl@rjlee.net}
|
14
|
+
s.extra_rdoc_files = [
|
15
|
+
"LICENSE.txt",
|
16
|
+
"README.markdown"
|
17
|
+
]
|
18
|
+
s.files = [
|
19
|
+
".document",
|
20
|
+
"Gemfile",
|
21
|
+
"Gemfile.lock",
|
22
|
+
"LICENSE.txt",
|
23
|
+
"README.markdown",
|
24
|
+
"Rakefile",
|
25
|
+
"VERSION",
|
26
|
+
"lib/term-extract.rb",
|
27
|
+
"term-extract.gemspec",
|
28
|
+
"test/helper.rb",
|
29
|
+
"test/test_term-extract.rb"
|
30
|
+
]
|
31
|
+
s.homepage = %q{http://github.com/rattle/term-extract}
|
32
|
+
s.licenses = ["GPLv3"]
|
33
|
+
s.require_paths = ["lib"]
|
34
|
+
s.rubygems_version = %q{1.3.7}
|
35
|
+
s.summary = %q{Provides term extraction functionality}
|
36
|
+
s.test_files = [
|
37
|
+
"test/helper.rb",
|
38
|
+
"test/test_term-extract.rb"
|
39
|
+
]
|
40
|
+
|
41
|
+
if s.respond_to? :specification_version then
|
42
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
43
|
+
s.specification_version = 3
|
44
|
+
|
45
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
46
|
+
s.add_runtime_dependency(%q<rbtagger>, [">= 0"])
|
47
|
+
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
48
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
49
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
|
50
|
+
s.add_development_dependency(%q<rcov>, [">= 0"])
|
51
|
+
s.add_runtime_dependency(%q<rbtagger>, [">= 0.0.0"])
|
52
|
+
else
|
53
|
+
s.add_dependency(%q<rbtagger>, [">= 0"])
|
54
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
55
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
56
|
+
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
57
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
58
|
+
s.add_dependency(%q<rbtagger>, [">= 0.0.0"])
|
59
|
+
end
|
60
|
+
else
|
61
|
+
s.add_dependency(%q<rbtagger>, [">= 0"])
|
62
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
63
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
64
|
+
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
65
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
66
|
+
s.add_dependency(%q<rbtagger>, [">= 0.0.0"])
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
data/test/helper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'test/unit'
|
11
|
+
require 'shoulda'
|
12
|
+
|
13
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
14
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
15
|
+
require 'term-extract'
|
16
|
+
|
17
|
+
class Test::Unit::TestCase
|
18
|
+
end
|
@@ -0,0 +1,174 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
# Exercises TermExtract against three fixture texts. The fixtures are frozen
# constants and every test hands the extractor its own copy, so one test's
# call cannot change what another test sees.
class TestTermExtract < Test::Unit::TestCase

  # Text containing a possessive proper noun ("St Paul's Cathedral").
  DOC1 = <<DOC1.freeze
The London Stock Exchange is a stock exchange located in London, United Kingdom.
Founded in 1801, it is one of the largest stock exchanges in the world, with many
overseas listings as well as British companies. The exchange is part of the
London Stock Exchange Group and so sometimes referred to by the ticker symbol
for the group, LSE. Its current premises are situated in Paternoster Square
close to St Paul's Cathedral in the City of London
DOC1

  # Text containing proper nouns linked by a joining word ("Secretary of State").
  DOC2 = <<DOC2.freeze
Secretary of State Owen Paterson has appointed Peter Osborne as Chair of the
Parades Commission for Northern Ireland and six new Commission members.
DOC2

  # Longer news article used by most tests.
  DOCUMENT = <<SOURCE.freeze
Police shut Palestinian theatre in Jerusalem.

Israeli police have shut down a Palestinian theatre in East Jerusalem.

The action, on Thursday, prevented the closing event of an international
literature festival from taking place.

Police said they were acting on a court order, issued after intelligence
indicated that the Palestinian Authority was involved in the event.

Israel has occupied East Jerusalem since 1967 and has annexed the
area. This is not recognised by the international community.

The British consul-general in Jerusalem , Richard Makepeace, was
attending the event.

"I think all lovers of literature would regard this as a very
regrettable moment and regrettable decision," he added.

Mr Makepeace said the festival's closing event would be reorganised to
take place at the British Council in Jerusalem.

The Israeli authorities often take action against events in East
Jerusalem they see as connected to the Palestinian Authority.

Saturday's opening event at the same theatre was also shut down.

A police notice said the closure was on the orders of Israel's internal
security minister on the grounds of a breach of interim peace accords
from the 1990s.

These laid the framework for talks on establishing a Palestinian state
alongside Israel, but left the status of Jerusalem to be determined by
further negotiation.

Israel has annexed East Jerusalem and declares it part of its eternal
capital.

Palestinians hope to establish their capital in the area.
SOURCE

  # Terms expected from DOCUMENT with the default options.
  TERMS = [
    'British Council',
    'British consul-general',
    'East Jerusalem',
    'Israel',
    'Israeli authorities',
    'Israeli police',
    'Mr Makepeace',
    'Palestinian Authority',
    'Palestinian state',
    'Palestinian theatre',
    'Palestinians hope',
    'Richard Makepeace',
    'court order',
    'event',
    'literature festival',
    'peace accords',
    'police notice',
    'security minister'
  ].freeze

  context "Without a default term extractor" do

    should "extract terms from a document" do
      terms = TermExtract.extract(DOCUMENT.dup)
      TERMS.each do |term|
        assert terms.key?(term), "#{term} not found"
      end
    end

  end

  context "With a default term extractor" do

    setup do
      @te = TermExtract.new
    end

    should "extract terms from a document" do
      terms = @te.extract(DOCUMENT.dup)
      TERMS.each do |term|
        assert terms.key?(term), "#{term} not found"
      end
    end

    should "extract terms with apostrophes in" do
      terms = @te.extract(DOC1.dup)
      assert terms.key?("St Paul's Cathedral")
    end

    should "extract terms with joining words" do
      terms = @te.extract(DOC2.dup)
      assert terms.key?("Secretary of State Owen Paterson")
    end

    should "extract terms and include pos tags when configured to" do
      @te.include_tags = true
      terms = @te.extract(DOCUMENT.dup)
      term = terms.keys.first
      assert terms[term].key?(:tag)
      assert terms[term][:tag]
    end

    should "extract common nouns when configured to" do
      @te.types = :nn
      terms = @te.extract(DOCUMENT.dup)
      assert_equal 11, terms.length
    end

    context "with min_occurance set to 2" do

      setup do
        @te.min_occurance = 2
      end

      should "extract terms that occur equal to or more than min_occurance" do
        terms = @te.extract(DOCUMENT.dup)
        assert terms.key?("Police")
        assert_equal @te.min_occurance, terms['Police']
      end

    end

    context "with min_terms set to 3" do

      setup do
        @te.min_terms = 3
      end

      should "extract terms that have the same number of words as min_terms" do
        terms = @te.extract(DOCUMENT.dup)
        assert terms.key?("Saturday's opening event")
      end

    end

    context "with include_tags set to true" do

      setup do
        @te.include_tags = true
      end

      should "include pos tags in the results" do
        terms = @te.extract(DOCUMENT.dup)
        assert terms.key?("Jerusalem")
        assert_equal 'NNP', terms['Jerusalem'][:tag]
      end

    end

  end

end
|
174
|
+
|
metadata
ADDED
@@ -0,0 +1,168 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: term-extract
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 23
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
- 0
|
10
|
+
version: 0.2.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- rattle
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-12-23 00:00:00 +00:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
prerelease: false
|
23
|
+
name: rbtagger
|
24
|
+
version_requirements: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
33
|
+
requirement: *id001
|
34
|
+
type: :runtime
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
prerelease: false
|
37
|
+
name: shoulda
|
38
|
+
version_requirements: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
hash: 3
|
44
|
+
segments:
|
45
|
+
- 0
|
46
|
+
version: "0"
|
47
|
+
requirement: *id002
|
48
|
+
type: :development
|
49
|
+
- !ruby/object:Gem::Dependency
|
50
|
+
prerelease: false
|
51
|
+
name: bundler
|
52
|
+
version_requirements: &id003 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ~>
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
hash: 23
|
58
|
+
segments:
|
59
|
+
- 1
|
60
|
+
- 0
|
61
|
+
- 0
|
62
|
+
version: 1.0.0
|
63
|
+
requirement: *id003
|
64
|
+
type: :development
|
65
|
+
- !ruby/object:Gem::Dependency
|
66
|
+
prerelease: false
|
67
|
+
name: jeweler
|
68
|
+
version_requirements: &id004 !ruby/object:Gem::Requirement
|
69
|
+
none: false
|
70
|
+
requirements:
|
71
|
+
- - ~>
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
hash: 7
|
74
|
+
segments:
|
75
|
+
- 1
|
76
|
+
- 5
|
77
|
+
- 2
|
78
|
+
version: 1.5.2
|
79
|
+
requirement: *id004
|
80
|
+
type: :development
|
81
|
+
- !ruby/object:Gem::Dependency
|
82
|
+
prerelease: false
|
83
|
+
name: rcov
|
84
|
+
version_requirements: &id005 !ruby/object:Gem::Requirement
|
85
|
+
none: false
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
hash: 3
|
90
|
+
segments:
|
91
|
+
- 0
|
92
|
+
version: "0"
|
93
|
+
requirement: *id005
|
94
|
+
type: :development
|
95
|
+
- !ruby/object:Gem::Dependency
|
96
|
+
prerelease: false
|
97
|
+
name: rbtagger
|
98
|
+
version_requirements: &id006 !ruby/object:Gem::Requirement
|
99
|
+
none: false
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
hash: 31
|
104
|
+
segments:
|
105
|
+
- 0
|
106
|
+
- 0
|
107
|
+
- 0
|
108
|
+
version: 0.0.0
|
109
|
+
requirement: *id006
|
110
|
+
type: :runtime
|
111
|
+
description:
|
112
|
+
email: robl@rjlee.net
|
113
|
+
executables: []
|
114
|
+
|
115
|
+
extensions: []
|
116
|
+
|
117
|
+
extra_rdoc_files:
|
118
|
+
- LICENSE.txt
|
119
|
+
- README.markdown
|
120
|
+
files:
|
121
|
+
- .document
|
122
|
+
- Gemfile
|
123
|
+
- Gemfile.lock
|
124
|
+
- LICENSE.txt
|
125
|
+
- README.markdown
|
126
|
+
- Rakefile
|
127
|
+
- VERSION
|
128
|
+
- lib/term-extract.rb
|
129
|
+
- term-extract.gemspec
|
130
|
+
- test/helper.rb
|
131
|
+
- test/test_term-extract.rb
|
132
|
+
has_rdoc: true
|
133
|
+
homepage: http://github.com/rattle/term-extract
|
134
|
+
licenses:
|
135
|
+
- GPLv3
|
136
|
+
post_install_message:
|
137
|
+
rdoc_options: []
|
138
|
+
|
139
|
+
require_paths:
|
140
|
+
- lib
|
141
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
142
|
+
none: false
|
143
|
+
requirements:
|
144
|
+
- - ">="
|
145
|
+
- !ruby/object:Gem::Version
|
146
|
+
hash: 3
|
147
|
+
segments:
|
148
|
+
- 0
|
149
|
+
version: "0"
|
150
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
151
|
+
none: false
|
152
|
+
requirements:
|
153
|
+
- - ">="
|
154
|
+
- !ruby/object:Gem::Version
|
155
|
+
hash: 3
|
156
|
+
segments:
|
157
|
+
- 0
|
158
|
+
version: "0"
|
159
|
+
requirements: []
|
160
|
+
|
161
|
+
rubyforge_project:
|
162
|
+
rubygems_version: 1.3.7
|
163
|
+
signing_key:
|
164
|
+
specification_version: 3
|
165
|
+
summary: Provides term extraction functionality
|
166
|
+
test_files:
|
167
|
+
- test/helper.rb
|
168
|
+
- test/test_term-extract.rb
|