DRMacIver-term-extractor 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +25 -0
- data/README.markdown +40 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bin/terms.rb +8 -0
- data/lib/term-extractor.rb +195 -0
- data/lib/term-extractor/maxent-2.5.2.jar +0 -0
- data/lib/term-extractor/nlp.rb +262 -0
- data/lib/term-extractor/opennlp-tools.jar +0 -0
- data/lib/term-extractor/snowball.jar +0 -0
- data/lib/term-extractor/trove.jar +0 -0
- data/licenses/Maxent +421 -0
- data/licenses/OpenNLP +421 -0
- data/licenses/Trove +504 -0
- data/licenses/snowball.php +33 -0
- data/models/chunk.bin.gz +0 -0
- data/models/sd.bin.gz +0 -0
- data/models/stopwords +567 -0
- data/models/tag.bin.gz +0 -0
- data/models/tagdict +16204 -0
- data/models/tok.bin.gz +0 -0
- data/term-extractor.gemspec +66 -0
- data/test/examples_spec.rb +131 -0
- data/test/files/1.email +37 -0
- data/test/files/juries_seg_8_v1 +20 -0
- data/test/nlp_spec.rb +231 -0
- data/test/term_extractor_spec.rb +141 -0
- metadata +83 -0
@@ -0,0 +1,141 @@
|
|
1
|
+
require "term-extractor"
|
2
|
+
require "rubygems"
|
3
|
+
require "rake"
|
4
|
+
|
5
|
+
PE = TermExtractor.new
|
6
|
+
|
7
|
+
# Yields each term that the shared PE extractor finds in the text of +file+.
#
# file - path of the text file to read.
# blk  - block called once per extracted term.
#
# Returns whatever +each+ returns on the extractor's result.
#
# The file is read with File.read rather than IO.read: IO.read interprets a
# path beginning with "|" as a command to run, whereas File.read only ever
# opens a file on disk.
def each_tag_in(file, &blk)
  PE.extract_terms_from_text(File.read(file)).each(&blk)
end
|
12
|
+
|
13
|
+
# Yields every term extracted from every fixture file under the "files"
# directory that sits next to this spec file.
#
# blk - block called once per extracted term (forwarded to each_tag_in).
#
# The glob is anchored to this file's directory instead of the process's
# current working directory. With a bare relative pattern, running the specs
# from anywhere other than the project root matches no files, and every
# example built on each_tag then passes without checking anything.
def each_tag(&blk)
  fixtures = File.expand_path("files/*", File.dirname(__FILE__))
  FileList[fixtures].each { |path| each_tag_in(path, &blk) }
end
|
16
|
+
|
17
|
+
# Specs for TermExtractor. Every example runs against the shared PE instance.
describe TermExtractor do
  # Returns the string form of each term PE extracts from +text+.
  def term_strings_from_text(text)
    PE.extract_terms_from_text(text).map { |term| term.to_s }
  end

  # Returns the string form of each term PE extracts from +sentence+.
  def term_strings_from_sentence(sentence)
    PE.extract_terms_from_sentence(sentence).map { |term| term.to_s }
  end

  # Expects the terms extracted from +text+ to be attributed to exactly the
  # sentence indices 0...n, with at least one term for each index.
  #
  # text - the text to extract terms from.
  # n    - the number of sentences the text is expected to contain.
  #
  # Tallying into a Hash (instead of indexing a fixed-size Array) means an
  # out-of-range index shows up as a failed expectation listing the indices
  # seen, rather than a NoMethodError on nil, and a negative index can no
  # longer wrap around and be counted against another sentence.
  def number_of_sentences(text, n)
    counts = Hash.new(0)
    PE.extract_terms_from_text(text).each { |term| counts[term.sentence] += 1 }
    counts.keys.sort.should == (0...n).to_a
  end

  it "should only return themes ending in nouns" do
    each_tag do |tag|
      tag.pos.should =~ /(^|-)(#{PE.required_ending})$/
    end
  end

  it "must not return themes starting with proscribed parts of speech" do
    each_tag do |tag|
      tag.pos.should_not =~ /^(#{PE.proscribed_start})($|-)/
    end
  end

  it "should produce at least as many tags as words" do
    each_tag do |tag|
      tag.pos.split("-").length.should be >= tag.to_s.split.length
    end
  end

  it "should correctly identify the subterms of a known term" do
    term_strings_from_text("I am a big fan of kitties").sort.should == ["big fan", "big fan of kitties", "kitties"]
  end

  it "should allow terms ending in numerals" do
    term_strings_from_text("I think Enterprise 2.0 is neato").sort.should == ["Enterprise 2.0"]
  end

  it "should not concatenate words" do
    # Heredoc body and terminator stay at column 0 so the text is unchanged.
    internalconfig = <<PC
knowing their
internal network config
PC

    joined = PE.extract_terms_from_text(internalconfig).join(" ")
    (joined =~ /theirinternal/).should be(nil)
  end

  it "should not concatenate words, even after ellipses" do
    oukc = "Oracle University Knowledge Center... http://www.oracle.com/education/oln"

    joined = PE.extract_terms_from_text(oukc).join(" ")
    (joined =~ /Centerhttp/).should be(nil)
  end

  it "should not split contractions" do
    terms = term_strings_from_sentence("It is my considered opinion that Jon should've liked the puppies")

    terms.should_not include("ve")
    terms.should_not include("ve liked the puppies")
  end

  it "shouldn 't leave spaces in terms containing contractions" do
    terms = term_strings_from_sentence("Kittens aren't villains, they're cute")

    terms.should include("Kittens aren't villains")
    terms.should_not include("Kittens aren 't villains")
  end

  it "should correctly attribute terms to sentences" do
    number_of_sentences("I like kitties", 1)
    number_of_sentences("I like kitties. They are cute creatures", 2)
  end

  it "should not start terms with contractions" do
    terms = term_strings_from_sentence("But I don't have time for such a drastic rewrite right now, I'm thinking it would take at least two weeks for someone who is experienced with Eclipe editors")

    terms.should_not include("don't have time")
  end

  it "should not produce terms which consist entirely of numbers" do
    # Heredoc body and terminator stay at column 0 so the text is unchanged.
    text = <<BINARYSOLO
Binary solo
0000001
00000011
0000001
00000011
0000001
0000001
0000001
0000001
BINARYSOLO

    PE.extract_terms_from_text(text).each { |p| p.to_s.should_not match(/^[\s\d]*$/) }
  end

  it "should pick out interesting nouns which follow a possessive" do
    term_strings_from_sentence("You know, you could always have asked me to change your password...").should include("password")
  end

  it "should never generate stopwords" do
    term_strings_from_sentence('A "Today Only" or "Sneak Preview" special tied to a specific day or time frame will encourage many recipients to open the message right away instead of passing it over for another one in the inbox.').should_not include("A")
  end

  it "should never generate URLs" do
    term_strings_from_text("I like http://www.google.com for searching").should_not include("http://www.google.com")
  end

  it "should not generate verb terms" do
    term_strings_from_text("Do you think it makes sense to be the very model of a modern major general?").should_not include("makes sense")
  end

  it "should not allow verb terms internally" do
    term_strings_from_text("Please consider the environment before printing this email").should_not include("environment before printing this email")
  end

  it "should not start terms with comparison adjectives" do
    terms = term_strings_from_sentence("European policymakers urged the U.S. Senate on Wednesday to approve a revised $700 billion financial rescue plan aimed at tackling the worst financial crisis since the 1930s.")

    terms.should_not include("worst financial crisis")
    terms.should include("financial crisis")
  end

  it "should not be confused by smart apostrophes" do
    PE.extract_terms_from_sentence("By training I’m a mathematician, but I seem to have drifted away from that and become a programmer.").each do |term|
      term.to_s.should_not =~ /’|'/
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: DRMacIver-term-extractor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- David R. MacIver
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-08-06 00:00:00 -07:00
|
13
|
+
default_executable: terms.rb
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description:
|
17
|
+
email: david.maciver@gmail.com
|
18
|
+
executables:
|
19
|
+
- terms.rb
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files:
|
23
|
+
- LICENSE
|
24
|
+
- README.markdown
|
25
|
+
files:
|
26
|
+
- LICENSE
|
27
|
+
- README.markdown
|
28
|
+
- Rakefile
|
29
|
+
- VERSION
|
30
|
+
- bin/terms.rb
|
31
|
+
- lib/term-extractor.rb
|
32
|
+
- lib/term-extractor/maxent-2.5.2.jar
|
33
|
+
- lib/term-extractor/nlp.rb
|
34
|
+
- lib/term-extractor/opennlp-tools.jar
|
35
|
+
- lib/term-extractor/snowball.jar
|
36
|
+
- lib/term-extractor/trove.jar
|
37
|
+
- licenses/Maxent
|
38
|
+
- licenses/OpenNLP
|
39
|
+
- licenses/Trove
|
40
|
+
- licenses/snowball.php
|
41
|
+
- models/chunk.bin.gz
|
42
|
+
- models/sd.bin.gz
|
43
|
+
- models/stopwords
|
44
|
+
- models/tag.bin.gz
|
45
|
+
- models/tagdict
|
46
|
+
- models/tok.bin.gz
|
47
|
+
- term-extractor.gemspec
|
48
|
+
- test/examples_spec.rb
|
49
|
+
- test/files/1.email
|
50
|
+
- test/files/juries_seg_8_v1
|
51
|
+
- test/nlp_spec.rb
|
52
|
+
- test/term_extractor_spec.rb
|
53
|
+
has_rdoc: false
|
54
|
+
homepage: http://github.com/david.maciver@gmail.com/term-extractor
|
55
|
+
licenses:
|
56
|
+
post_install_message:
|
57
|
+
rdoc_options:
|
58
|
+
- --charset=UTF-8
|
59
|
+
require_paths:
|
60
|
+
- lib
|
61
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: "0"
|
66
|
+
version:
|
67
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: "0"
|
72
|
+
version:
|
73
|
+
requirements: []
|
74
|
+
|
75
|
+
rubyforge_project:
|
76
|
+
rubygems_version: 1.3.5
|
77
|
+
signing_key:
|
78
|
+
specification_version: 3
|
79
|
+
summary: A library for extracting useful terms from text
|
80
|
+
test_files:
|
81
|
+
- test/term_extractor_spec.rb
|
82
|
+
- test/nlp_spec.rb
|
83
|
+
- test/examples_spec.rb
|