DRMacIver-term-extractor 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/models/tok.bin.gz ADDED
Binary file
@@ -0,0 +1,66 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ Gem::Specification.new do |s|
4
+ s.name = %q{term-extractor}
5
+ s.version = "0.0.0"
6
+
7
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
+ s.authors = ["David R. MacIver"]
9
+ s.date = %q{2009-08-06}
10
+ s.default_executable = %q{terms.rb}
11
+ s.email = %q{david.maciver@gmail.com}
12
+ s.executables = ["terms.rb"]
13
+ s.extra_rdoc_files = [
14
+ "LICENSE",
15
+ "README.markdown"
16
+ ]
17
+ s.files = [
18
+ "LICENSE",
19
+ "README.markdown",
20
+ "Rakefile",
21
+ "VERSION",
22
+ "bin/terms.rb",
23
+ "lib/term-extractor.rb",
24
+ "lib/term-extractor/maxent-2.5.2.jar",
25
+ "lib/term-extractor/nlp.rb",
26
+ "lib/term-extractor/opennlp-tools.jar",
27
+ "lib/term-extractor/snowball.jar",
28
+ "lib/term-extractor/trove.jar",
29
+ "licenses/Maxent",
30
+ "licenses/OpenNLP",
31
+ "licenses/Trove",
32
+ "licenses/snowball.php",
33
+ "models/chunk.bin.gz",
34
+ "models/sd.bin.gz",
35
+ "models/stopwords",
36
+ "models/tag.bin.gz",
37
+ "models/tagdict",
38
+ "models/tok.bin.gz",
39
+ "term-extractor.gemspec",
40
+ "test/examples_spec.rb",
41
+ "test/files/1.email",
42
+ "test/files/juries_seg_8_v1",
43
+ "test/nlp_spec.rb",
44
+ "test/term_extractor_spec.rb"
45
+ ]
46
+ s.homepage = %q{http://github.com/DRMacIver/term-extractor}
47
+ s.rdoc_options = ["--charset=UTF-8"]
48
+ s.require_paths = ["lib"]
49
+ s.rubygems_version = %q{1.3.4}
50
+ s.summary = %q{A library for extracting useful terms from text}
51
+ s.test_files = [
52
+ "test/term_extractor_spec.rb",
53
+ "test/nlp_spec.rb",
54
+ "test/examples_spec.rb"
55
+ ]
56
+
57
+ if s.respond_to? :specification_version then
58
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
59
+ s.specification_version = 3
60
+
61
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
62
+ else
63
+ end
64
+ else
65
+ end
66
+ end
@@ -0,0 +1,131 @@
1
+ require "term-extractor"
2
+
3
+ PE = TermExtractor.new
4
+
5
+ Diagrams = <<DIAGRAMS
6
+ I think having nice standardised diagrams of stuff like that is REALLY
7
+ useful. One OO architect drops dead and your replacement walks in and
8
+ can pick up the documents and read them because they already speak
9
+ that language. That's a great thing. I sort of wish it had been pushed
10
+ as being that -- a lingua franca for documenting designs.
11
+ DIAGRAMS
12
+
13
+
14
+ describe "Diagram terms" do
15
+
16
+
17
+ end
18
+
19
+ Murray = <<MURRAY
20
+ The MCHS Department of Music is one of the most distinguished music programs in the State, having an award-winning choral and band program. The Marching Indians, under the direction of Mr. Mike Weaver, have performed all over the country, most recently at Universal Studios in Orlando, Disney World and the St. Patrick's Day Parade in New York City. Since 1958, the Marching Indians have been entreating fans with exciting, visually stimulating shows and their trademark deep, loud sound. Recently the Marching Indians received the Grand Championship at the 2008 Golden River Music Festival and won the first ever US101 radio battle of the bands receiving a concert by the Eli Young Band. Many students from MCHS Department of Bands have been involved with All District and All State bands as well as various summer clinics, orchestras and even the Georgia Lions All State Band.
21
+ MURRAY
22
+ MurrayTerms = PE.extract_terms_from_text(Murray).map{|x| x.to_s}
23
+
24
+ describe "Murray terms" do
25
+ it "should get Mike's name right" do
26
+ MurrayTerms.should_not include("Mr . Mike Weaver")
27
+ MurrayTerms.should include("Mr. Mike Weaver")
28
+ end
29
+ end
30
+
31
+ Chromosome = <<CHROM
32
+ Humans have 23 pairs of chromosomes packed with genes that dictate every aspect of our biological functioning. Of these pairs, the sex chromosomes are different; women have two X chromosomes and men have an X and a Y chromosome. The Y chromosome contains essential blueprints for the male reproductive system, in particular those for sperm development.
33
+
34
+ But the Y chromosome, which once contained as many genes as the X chromosome, has deteriorated over time and now contains less than 80 functional genes compared to its partner, which contains more than 1,000 genes. Geneticists and evolutionary biologists determined that the Y chromosome's deterioration is due to accumulated mutations, deletions and anomalies that have nowhere to go because the chromosome doesn't swap genes with the X chromosome like every other chromosomal pair in our cells do.
35
+ CHROM
36
+
37
+ ChromosomeTerms = PE.extract_terms_from_text(Chromosome).map{|x| x.to_s}
38
+
39
+ describe "Chromosome terms" do
40
+ it "should say nothing about what humans have" do
41
+ ChromosomeTerms.should_not include("Humans have 23 pairs")
42
+ end
43
+
44
+ it "knows about the male reproductive system, if you know what I mean" do
45
+ ChromosomeTerms.should include("male reproductive system")
46
+ ChromosomeTerms.should include("sperm development")
47
+ end
48
+
49
+ it "is about humans" do
50
+ ChromosomeTerms.should include("Humans")
51
+ end
52
+ end
53
+
54
+ Environment = "Please consider the environment before printing this e-mail"
55
+ EnvironmentTerms = PE.extract_terms_from_text(Environment).map{|x| x.to_s}.sort
56
+
57
+ describe "Environment terms" do
58
+ it "is about email" do
59
+ EnvironmentTerms.should include("e-mail")
60
+ end
61
+ end
62
+
63
+ Apollo = <<APOLLO
64
+ Fate has ordained that the men who went to the moon to explore in peace will stay on the moon to rest in peace.
65
+
66
+ These brave men, Neil Armstrong and Edwin Aldrin, know that there is no hope for their recovery. But they also know that there is hope for mankind in their sacrifice.
67
+
68
+ These two men are laying down their lives in mankind's most noble goal: the search for truth and understanding.
69
+
70
+ They will be mourned by their families and friends; they will be mourned by their nation; they will be mourned by the people of the world; they will be mourned by a Mother Earth that dared send two of her sons into the unknown.
71
+
72
+ In their exploration, they stirred the people of the world to feel as one; in their sacrifice, they bind more tightly the brotherhood of man.
73
+
74
+ In ancient days, men looked at stars and saw their heroes in the constellations. In modern times, we do much the same, but our heroes are epic men of flesh and blood.
75
+
76
+ Others will follow, and surely find their way home. Man's search will not be denied. But these men were the first, and they will remain the foremost in our hearts.
77
+ APOLLO
78
+
79
+ ApolloTerms = PE.extract_terms_from_text(Apollo).map{|x| x.to_s}.sort.uniq
80
+
81
+ describe "Apollo terms" do
82
+ it "knows of Neil and Buzz" do
83
+ ApolloTerms.should include("Neil Armstrong")
84
+ ApolloTerms.should include("Edwin Aldrin")
85
+ end
86
+
87
+ it "knows of where they've been" do
88
+ ApolloTerms.should include("moon")
89
+ end
90
+
91
+ it "knows of times past and present" do
92
+ ApolloTerms.should include("ancient days")
93
+ ApolloTerms.should include("modern times")
94
+ end
95
+
96
+ it "knows of destiny" do
97
+ ApolloTerms.should include("Fate")
98
+ end
99
+
100
+ it "knows of searching" do
101
+ ApolloTerms.should include("exploration")
102
+ ApolloTerms.should include("search")
103
+ ApolloTerms.should include("Man's search")
104
+ end
105
+
106
+ it "knows not of mourning, but of courage and sacrifice" do
107
+ ApolloTerms.should_not include("mourned")
108
+ ApolloTerms.should include("brave men")
109
+ ApolloTerms.should include("sacrifice")
110
+ end
111
+
112
+ it "knows of brotherhood" do
113
+ ApolloTerms.should include("brotherhood of man")
114
+ end
115
+
116
+ it "knows of mankind, and of its heroes" do
117
+ ApolloTerms.should include("man")
118
+ ApolloTerms.should include("men")
119
+ ApolloTerms.should include("mankind")
120
+ ApolloTerms.should include("heroes")
121
+ ApolloTerms.should include("epic men")
122
+ end
123
+
124
+ it "looks to the stars from the earth" do
125
+ ApolloTerms.should include("stars")
126
+ ApolloTerms.should include("constellations")
127
+ ApolloTerms.should include("Mother Earth")
128
+ ApolloTerms.should include("world")
129
+ end
130
+
131
+ end
@@ -0,0 +1,37 @@
1
+ what's the point of children if you can't cut down on rickshaw charges ?
2
+
3
+
4
+ On 8 Aug 2008, at 10:48, Roderick Parks wrote:
5
+
6
+ > Now there’s a horror movie in the making. You wake up in the morning
7
+ > and find a little wire growing out of your nose.....
8
+ >
9
+ > I was quite impressed by those tricycles, given that they have two
10
+ > seats each.
11
+ >
12
+ > It would seem that rickshaw drivers start their training at an early
13
+ > age :D
14
+ >
15
+ > ~R
16
+ >
17
+ > From: lab-bounces@trampolinesystems.com [mailto:lab-bounces@trampolinesystems.com
18
+ > ] On Behalf Of Peter Biddle
19
+ > Sent: 08 August 2008 10:32
20
+ > To: lab@trampolinesystems.com
21
+ > Subject: RE: The Case for Wireless
22
+ >
23
+ > It looks like cables just reached down and grabbed those tricycles.
24
+ >
25
+ > Procreation and then sentience cannot be far off…
26
+ >
27
+ > P
28
+ >
29
+ >
30
+ > _______________________________________________
31
+ > Lab mailing list
32
+ > Lab@trampolinesystems.com
33
+ > http://zimbra.trampolinesystems.com/mailman/listinfo/lab
34
+
35
+
36
+
37
+
@@ -0,0 +1,20 @@
1
+ OLATI JOHNSON [01:54:39]
2
+ THERE ARE LAWS IN THIS COUNTRY THAT WE HAVE ALL VOTED ON EITHER DIRECTLY OR THROUGH OUR REPRESENTATIVES. AND THE PROSECUTOR REPRESENTS THE PEOPLE OR THE UNITED STATES GOVERNMENT AND IS CHARGED WITH ENFORCING THOSE LAWS. AND SO THAT IS THE PROSECUTOR'S DUTY IS TO CHARGE PEOPLE AND BRING WORTHY CASES TO TRIAL WHEN PEOPLE HAVE VIOLATED THESE LAWS THAT WE'VE ALL AGREED SHOULD BE IN -- IN PLACE.
3
+ Here are some things prosecutors would like jurors to know.
4
+ JAMES TIERNEY [01:28:57]:
5
+ PROSECUTOR REALLY HAS TWO JOBS. ONE JOB A JUROR WILL SEE IN THE COURTROOM AND THAT IS TO MAKE THE ARGUMENT THAT THE PERSON WHO IS ACCUSED OF CRIME IS -- IS GUILTY AND SHOULD BE APPROPRIATELY PUNISHED. PROSECUTOR HAS ANOTHER IMPORTANT JOB, WHICH OCCURS OUTSIDE THE COURTROOM, AND THAT IS TO MAKE SURE THAT EVERYONE PLAYS BY THE RULES. // [03:51:18] BECAUSE IT'S NOT A PROSECUTOR'S JOB JUST TO SECURE GUILT. A PROSECUTOR HAS TO STAND UP FOR JUSTICE
6
+ PAUL RADVANY [4:40:15]
7
+ THE ROLE OF THE PROSECUTOR, UM, IN OUR CRIMINAL JUSTICE SYSTEM BEGINS WITH THE INVESTIGATION OFTEN. AND SO THEY WORK WITH LAW ENFORCEMENT, UM, TO TRY AND INVESTIGATE THE FACTS. AND DETERMINE WHETHER OR NOT SOMEONE SHOULD BE CHARGED WITH A CRIME.
8
+ So during the trial, just like the judge and the defense, the prosecution wants a juror's undivided attention.
9
+ JAMES TIERNEY [4:24:40]
10
+ WHEN THE PROSECUTOR PRESENTS EVIDENCE, UH, HE OR SHE WANTS THE JURORS TO PAY ATTENTION AND TO LISTEN.
11
+ RADVANY [4:25:53]
12
+ AS YOU PRESENT EVIDENCE YOU WANNA MAKE SURE THAT THE JURORS ARE LISTENING VERY CAREFULLY. THAT THEY KEEP AN OPEN MIND. THAT THEY'RE JUDGING WITNESSES' CREDIBILITY. AND THAT, UM, ULTIMATELY THEY'LL BE DE-- DETERMINE, UM, INFERENCES THAT CAN BE DRAWN THROUGH THE -- FROM THE EVIDENCE.
13
+ Once jurors are ready to deliberate, prosecutors would hope that jurors have committed their full attention to the evidence presented during the trial.
14
+ JAMES TIERNEY [04:04:44]
15
+ SO THE FIRST THING A JUROR HAS TO DO IS LISTEN VERY, VERY CLOSELY TO THE JUDGE BEFORE THEY RETIRE// BUT, UH, EITHER A PROSECUTOR OR A DEFENSE COUNSEL I'M SURE WOULD WANT THE JURORS TO CAREFULLY GO OVER THE EVIDENCE, TO DISCUSS THE ISSUE AMONG THEMSELVES, IF THEY HAVE QUESTIONS, TO SUBMIT QUESTIONS BACK TO THE JUDGE DURING THE COURSE OF THEIR DELIBERATIONS, AND THEN TO WORK HARD TO COME UP WITH THE RIGHT ANSWER. IT'S -- IT'S IMPORTANT THAT THE JURORS FEEL COMFORTABLE WITH THEIR DECISION.
16
+ And just like the judge and the defense attorney, the prosecutor, would also like to remind jurors that every defendant must be presumed innocent until proven guilty.
17
+ JAMES TIERNEY [04:00:05]
18
+ OUR CONSTITUTION IS BASED ON A BILL OF RIGHTS, AND IT'S BASED ON -- ON A PRESUMPTION, THAT A GOVERNMENT SHOULD NOT DEPRIVE SOMEONE OF THEIR LIBERTY UNLESS THEY CAN TRULY PROVE WHAT, UH, WHAT OCCURRED, AND //ONE OF THE ELEMENTS OF THAT IS TO PRESUME THAT THAT PERSON WHO SITS THERE IN THE COURTROOM ON THE FIRST DAY OF TRIAL IS JUST AS INNOCENT AS ANY JUROR OR THE JUDGE OR THE PROSECUTOR OR DEFENSE COUNSEL. SO YOU PRESUME THE INNOCENCE, AND THEN IT'S UP TO THE STATE, THE GOVERNMENT T-- Y-- WORKING THROUGH THE PROSECUTOR TO PROVE THAT THAT IS NOT TRUE.
19
+ RADVANY [4:11:15]
20
+ THE ROLE OF THE PROSECUTOR IS VERY IMPORTANT IN SOCIETY OBVIOUSLY. // AND YOUR GOAL IS TO SEEK JUSTICE -- NOT NECESSARILY CONVICTIONS. BECAUSE YOU LEAVE IT UP TO THE JURY UNDER OUR SYSTEM TO DETERMINE WHETHER SOMEONE // IS GUILTY BEYOND A REASONABLE DOUBT.
data/test/nlp_spec.rb ADDED
@@ -0,0 +1,231 @@
1
+ require "term-extractor/nlp"
2
+ require "rubygems"
3
+ require "rake"
4
+
5
+ NLP = TermExtractor::NLP
6
+ MyNLP = NLP.new("#{File.dirname(__FILE__)}/../models")
7
+
8
+ def dont_split(text)
9
+ # This might not be quite right. We're currently allowing stripping punctuation
10
+ # from the beginning and end. Maybe we shouldn't?
11
+ NLP.tokenize_sentence(text).map{|x| x.to_s}.should == [text]
12
+ end
13
+
14
+
15
+ def one_sentence(text)
16
+ MyNLP.sentences(text).should == [text]
17
+ end
18
+
19
+ def n_sentences(n, text)
20
+ s = MyNLP.sentences(text)
21
+ s.should have_exactly(n).sentences
22
+ end
23
+
24
+ describe "sentence splitting" do
25
+ it "should not split sentences around URLs" do
26
+ one_sentence("If you go to http://www.google.com and type 'kitties' you will get lots of kitties")
27
+ end
28
+
29
+ it "should split sentences with abbreviations sensibly" do
30
+ one_sentence("Dr. Smith likes kitties")
31
+ end
32
+
33
+ it "should produce two sentences when there are line breaks" do
34
+
35
+ n_sentences 2, "Posting Date: November 8, 2008 \r\n Release Date: January, 1998\r\n \r\n Language: English"
36
+
37
+ n_sentences 2, <<KITTIES
38
+ I like kitties
39
+
40
+ I like puppies
41
+ KITTIES
42
+
43
+
44
+ end
45
+ end
46
+
47
+ describe "url removal" do
48
+ it "should replace URLs in the middle of sentences" do
49
+ NLP.remove_urls("I like the links you find at http://www.google.com when searching for kitties").should == "I like the links you find at <URL> when searching for kitties"
50
+ end
51
+
52
+ it "should replace URLs at the beginning of sentences" do
53
+ NLP.remove_urls("http://www.google.com is your number one source for kitties").should == "<URL> is your number one source for kitties"
54
+ end
55
+
56
+ it "should replace URLs at the end of sentences" do
57
+ NLP.remove_urls("When I want kitties I go to http://www.google.com").should == "When I want kitties I go to <URL>"
58
+ end
59
+
60
+ it "should replace URLs between sentences" do
61
+ NLP.remove_urls("The number one kitty finding service is http://www.google.com. Accept no substitutes").should == "The number one kitty finding service is <URL>. Accept no substitutes"
62
+ end
63
+ end
64
+
65
+ describe "path removal" do
66
+ it "should remove windows style paths" do
67
+ path ="C:\\Home\\Windows\\Nonsense\\Kitties is where windows people store kitty related material"
68
+
69
+ NLP.remove_paths(path).should == "<PATH> is where windows people store kitty related material"
70
+ end
71
+
72
+ it "should remove windows style paths with spaces in them" do
73
+ path = "C:\\Documents and Settings\\Kitty is the kitty's home directory"
74
+
75
+ NLP.remove_paths(path).should == "<PATH> is the kitty's home directory"
76
+ end
77
+
78
+ it "should remove unix style paths" do
79
+ NLP.remove_paths("/home/david/kitties is where *I* store kitty related material").should == "<PATH> is where *I* store kitty related material"
80
+ end
81
+ end
82
+
83
+ describe "extracting embedded terms" do
84
+ it "should replace quotes with <QUOTE>" do
85
+ quote = "\"I like kitties\", she declared"
86
+
87
+ main, embedded = NLP.extract_embedded_sentences(quote)
88
+
89
+ main.should == "<QUOTE>, she declared"
90
+ embedded.should == "I like kitties"
91
+ end
92
+
93
+
94
+ it "should replace parenthetical comments with an empty string" do
95
+ main, embedded = NLP.extract_embedded_sentences("I like kitties (especially fuzzy ones)")
96
+
97
+ main.should == "I like kitties "
98
+ embedded.should == "especially fuzzy ones"
99
+ end
100
+
101
+ it "should correctly deal with multiple nested parenthetical comments" do
102
+ main, e1, e2 = NLP.extract_embedded_sentences("I like kitties (especially fuzzy ones (but the long haired ones are kinda ugly))")
103
+
104
+ main.should == "I like kitties "
105
+ e1.should == "but the long haired ones are kinda ugly"
106
+ e2.should == "especially fuzzy ones "
107
+ end
108
+
109
+ it "should correctly deal with multiple non nested parenthetical comments" do
110
+ main, e1, e2 = NLP.extract_embedded_sentences("I like kitties (especially fuzzy ones)(but the long haired ones are kinda ugly)")
111
+
112
+ main.should == "I like kitties "
113
+ e1.should == "especially fuzzy ones"
114
+ e2.should == "but the long haired ones are kinda ugly"
115
+ end
116
+
117
+ it "should not extract a subterm when it is not matched" do
118
+ NLP.extract_embedded_sentences("She declared \" I like kitties").should have(1).fragment
119
+ end
120
+
121
+ it "should not extract a subterm when it would have to span multiple lines to do so " do
122
+ kitties = <<KITTIES
123
+ I like kitties (they are
124
+ the best)
125
+ KITTIES
126
+
127
+ NLP.extract_embedded_sentences(kitties).should == [kitties]
128
+ end
129
+
130
+ end
131
+
132
+ describe "tokenization" do
133
+
134
+ it "should not split up URLs" do
135
+ dont_split("http://www.theonion.com/content/news/female_serial_killer_has_to_work")
136
+ end
137
+
138
+ it "should not split up URLs with -s in them" do
139
+ dont_split("http://www.amazon.com/Fierce-Conversations-Acheiving-Success-Conversation/dp/0670031240")
140
+ end
141
+
142
+ it "should not split up emails" do
143
+ dont_split("david.maciver@trampolinesystems.com")
144
+ end
145
+
146
+ it "should split up contractions" do
147
+ NLP.tokenize_sentence("I'm the very model of a modern major general").should == ["I", "'m", "the", "very", "model", "of", "a", "modern", "major", "general"]
148
+ end
149
+
150
+ it "should split sentences around ellipses" do
151
+ NLP.tokenize_sentence("I like kitties...puppies are ok too").should == ["I", "like", "kitties", ",", "puppies", "are", "ok", "too"]
152
+ end
153
+
154
+ it "shouldn't split paths containing .." do
155
+ dont_split("/home/david/cute/puppies/../kitties/pictures")
156
+ end
157
+
158
+ it "should pull a sentence terminator into its own token" do
159
+ NLP.tokenize_sentence("You don't like kitties?!?")[-1].should == "?!?"
160
+ end
161
+
162
+ it "should detach punctuation as a separate token" do
163
+ NLP.tokenize_sentence("babies... the other white meat")[1].should == "..."
164
+ end
165
+
166
+ def dont_produce_token(text, term)
167
+ tokens = NLP.tokenize_sentence(text)
168
+ tokens.should_not include(term)
169
+ end
170
+
171
+ it "should not split numbers around commas" do
172
+ dont_produce_token("the reasons for selecting opengl rather than prefuse were to visualise >10,000 nodes and do 3d", "000")
173
+ end
174
+
175
+ it "should pull commas off the ends of tokens" do
176
+ dont_produce_token("kitties, puppies and birdies are all cute", "kitties,")
177
+ end
178
+
179
+ end
180
+
181
+ describe "cleaning" do
182
+ it "should remove stars trailing or leading a word" do
183
+ NLP.clean_sentence("Should that really take 5 minutes *over a network*").should == "Should that really take 5 minutes , over a network"
184
+ end
185
+
186
+
187
+ it "should turn quotes into commas" do
188
+ NLP.clean_sentence("I read \"Why kitties are cute\" over the summer").should == "I read , Why kitties are cute , over the summer"
189
+ end
190
+
191
+ it "should remove all new lines" do
192
+ (NLP.clean_sentence("
193
+ This sentence
194
+ has lots of
195
+ line breaks
196
+ in it
197
+ ") =~ /\n|\./).should == nil
198
+ end
199
+ end
200
+
201
+ def equate(foo, bar)
202
+ MyNLP.canonicalize(foo).should == MyNLP.canonicalize(bar)
203
+ end
204
+
205
+ describe "canonicalization" do
206
+ it "should identify plurals" do
207
+ equate("kitties", "kitty")
208
+ end
209
+
210
+ it "should identify strings that differ only in non alphanumeric characters" do
211
+ equate("foo/bar/baz", "foo bar baz")
212
+ end
213
+
214
+ it "should be insensitive to order" do
215
+ equate("foo bar baz", "bar foo baz")
216
+ end
217
+
218
+ it "should ignore stopwords" do
219
+ equate("programming in java", "java programming")
220
+ end
221
+ end
222
+
223
+ describe "stopword detection" do
224
+ it "should mark a as a stopword" do
225
+ MyNLP.stopword?("a").should be(true)
226
+ end
227
+
228
+ it "should not be fooled by capitalisation" do
229
+ MyNLP.stopword?("A").should be(true)
230
+ end
231
+ end