DRMacIver-term-extractor 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,141 @@
1
# RubyGems is loaded first so that the gem-installed libraries required below
# can be located on interpreters that do not activate RubyGems automatically.
require "rubygems"
require "rake"
require "term-extractor"

# Extractor instance shared by the helpers and examples in this spec.
PE = TermExtractor.new
6
+
7
# Yields each term extracted from the contents of +file+ to the caller's
# block.
#
# file - path of the text file that is read in full and passed to
#        PE.extract_terms_from_text.
#
# Returns whatever +each+ returns for the extracted term collection.
def each_tag_in(file)
  contents = IO.read(file)
  extracted = PE.extract_terms_from_text(contents)
  extracted.each { |term| yield term }
end
12
+
13
# Runs +blk+ once for every term extracted from every file matching the
# relative glob test/files/*.
#
# blk - block that receives one extracted term per call.
def each_tag(&blk)
  fixtures = FileList["test/files/*"]
  fixtures.each do |path|
    each_tag_in(path, &blk)
  end
end
16
+
17
# Behavioural examples for TermExtractor. The first three examples check
# every term produced by each_tag; the rest check hand-picked inputs.
describe TermExtractor do
  # Terms extracted from a (possibly multi-sentence) text, as strings.
  def terms_in_text(text)
    PE.extract_terms_from_text(text).map { |term| term.to_s }
  end

  # Terms extracted from a single sentence, as strings.
  def terms_in_sentence(sentence)
    PE.extract_terms_from_sentence(sentence).map { |term| term.to_s }
  end

  # Asserts that the terms extracted from +text+ are attributed to exactly the
  # sentence indices 0...n, each at least once.
  #
  # Comparing the set of indices (rather than incrementing slots of a
  # fixed-size array) turns an out-of-range index into an ordinary spec
  # failure instead of a NoMethodError on nil, and stops a negative index
  # from silently counting towards another sentence.
  def number_of_sentences(text, n)
    seen = PE.extract_terms_from_text(text).map { |term| term.sentence }.uniq.sort
    seen.should == (0...n).to_a
  end

  it "should only return themes ending in nouns" do
    each_tag do |tag|
      tag.pos.should =~ /(^|-)(#{PE.required_ending})$/
    end
  end

  it "must not return themes starting with proscribed parts of speech" do
    each_tag do |tag|
      tag.pos.should_not =~ /^(#{PE.proscribed_start})($|-)/
    end
  end

  it "should produce at least as many tags as words" do
    each_tag do |tag|
      tag.pos.split("-").length.should be >= tag.to_s.split.length
    end
  end

  it "should correctly identify the subterms of a known term" do
    terms_in_text("I am a big fan of kitties").sort.should == ["big fan", "big fan of kitties", "kitties"]
  end

  it "should allow terms ending in numerals" do
    terms_in_text("I think Enterprise 2.0 is neato").sort.should == ["Enterprise 2.0"]
  end

  it "should not concatenate words" do
    # The heredoc body and terminator stay at column 0: the line break
    # between "their" and "internal" is the input under test.
    internalconfig = <<PC
knowing their
internal network config
PC

    PE.extract_terms_from_text(internalconfig).join(" ").should_not =~ /theirinternal/
  end

  it "should not concatenate words, even after ellipses" do
    oukc = "Oracle University Knowledge Center... http://www.oracle.com/education/oln"

    PE.extract_terms_from_text(oukc).join(" ").should_not =~ /Centerhttp/
  end

  it "should not split contractions" do
    terms = terms_in_sentence("It is my considered opinion that Jon should've liked the puppies")

    terms.should_not include("ve")
    terms.should_not include("ve liked the puppies")
  end

  it "shouldn 't leave spaces in terms containing contractions" do
    terms = terms_in_sentence("Kittens aren't villains, they're cute")

    terms.should include("Kittens aren't villains")
    terms.should_not include("Kittens aren 't villains")
  end

  it "should correctly attribute terms to sentences" do
    number_of_sentences("I like kitties", 1)
    number_of_sentences("I like kitties. They are cute creatures", 2)
  end

  it "should not start terms with contractions" do
    terms = terms_in_sentence("But I don't have time for such a drastic rewrite right now, I'm thinking it would take at least two weeks for someone who is experienced with Eclipe editors")

    terms.should_not include("don't have time")
  end

  it "should not produce terms which consist entirely of numbers" do
    text = <<BINARYSOLO
Binary solo
0000001
00000011
0000001
00000011
0000001
0000001
0000001
0000001
BINARYSOLO

    PE.extract_terms_from_text(text).each do |term|
      term.to_s.should_not match(/^[\s\d]*$/)
    end
  end

  it "should pick out interesting nouns which follow a possessive" do
    terms_in_sentence("You know, you could always have asked me to change your password...").should include("password")
  end

  it "should never generate stopwords" do
    terms_in_sentence('A "Today Only" or "Sneak Preview" special tied to a specific day or time frame will encourage many recipients to open the message right away instead of passing it over for another one in the inbox.').should_not include("A")
  end

  it "should never generate URLs" do
    terms_in_text("I like http://www.google.com for searching").should_not include("http://www.google.com")
  end

  it "should not generate verb terms" do
    terms_in_text("Do you think it makes sense to be the very model of a modern major general?").should_not include("makes sense")
  end

  it "should not allow verb terms internally" do
    terms_in_text("Please consider the environment before printing this email").should_not include("environment before printing this email")
  end

  it "should not start terms with comparison adjectives" do
    terms = terms_in_sentence("European policymakers urged the U.S. Senate on Wednesday to approve a revised $700 billion financial rescue plan aimed at tackling the worst financial crisis since the 1930s.")

    terms.should_not include("worst financial crisis")
    terms.should include("financial crisis")
  end

  it "should not be confused by smart apostrophes" do
    # Neither the typographic nor the ASCII apostrophe may appear in any term.
    PE.extract_terms_from_sentence("By training I’m a mathematician, but I seem to have drifted away from that and become a programmer.").each do |term|
      term.to_s.should_not =~ /’|'/
    end
  end
end
metadata ADDED
@@ -0,0 +1,83 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: DRMacIver-term-extractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - David R. MacIver
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-08-06 00:00:00 -07:00
13
+ default_executable: terms.rb
14
+ dependencies: []
15
+
16
+ description:
17
+ email: david.maciver@gmail.com
18
+ executables:
19
+ - terms.rb
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - LICENSE
24
+ - README.markdown
25
+ files:
26
+ - LICENSE
27
+ - README.markdown
28
+ - Rakefile
29
+ - VERSION
30
+ - bin/terms.rb
31
+ - lib/term-extractor.rb
32
+ - lib/term-extractor/maxent-2.5.2.jar
33
+ - lib/term-extractor/nlp.rb
34
+ - lib/term-extractor/opennlp-tools.jar
35
+ - lib/term-extractor/snowball.jar
36
+ - lib/term-extractor/trove.jar
37
+ - licenses/Maxent
38
+ - licenses/OpenNLP
39
+ - licenses/Trove
40
+ - licenses/snowball.php
41
+ - models/chunk.bin.gz
42
+ - models/sd.bin.gz
43
+ - models/stopwords
44
+ - models/tag.bin.gz
45
+ - models/tagdict
46
+ - models/tok.bin.gz
47
+ - term-extractor.gemspec
48
+ - test/examples_spec.rb
49
+ - test/files/1.email
50
+ - test/files/juries_seg_8_v1
51
+ - test/nlp_spec.rb
52
+ - test/term_extractor_spec.rb
53
+ has_rdoc: false
54
+ homepage: http://github.com/david.maciver@gmail.com/term-extractor
55
+ licenses:
56
+ post_install_message:
57
+ rdoc_options:
58
+ - --charset=UTF-8
59
+ require_paths:
60
+ - lib
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ version: "0"
66
+ version:
67
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: "0"
72
+ version:
73
+ requirements: []
74
+
75
+ rubyforge_project:
76
+ rubygems_version: 1.3.5
77
+ signing_key:
78
+ specification_version: 3
79
+ summary: A library for extracting useful terms from text
80
+ test_files:
81
+ - test/term_extractor_spec.rb
82
+ - test/nlp_spec.rb
83
+ - test/examples_spec.rb