DRMacIver-term-extractor 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +25 -0
- data/README.markdown +40 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bin/terms.rb +8 -0
- data/lib/term-extractor.rb +195 -0
- data/lib/term-extractor/maxent-2.5.2.jar +0 -0
- data/lib/term-extractor/nlp.rb +262 -0
- data/lib/term-extractor/opennlp-tools.jar +0 -0
- data/lib/term-extractor/snowball.jar +0 -0
- data/lib/term-extractor/trove.jar +0 -0
- data/licenses/Maxent +421 -0
- data/licenses/OpenNLP +421 -0
- data/licenses/Trove +504 -0
- data/licenses/snowball.php +33 -0
- data/models/chunk.bin.gz +0 -0
- data/models/sd.bin.gz +0 -0
- data/models/stopwords +567 -0
- data/models/tag.bin.gz +0 -0
- data/models/tagdict +16204 -0
- data/models/tok.bin.gz +0 -0
- data/term-extractor.gemspec +66 -0
- data/test/examples_spec.rb +131 -0
- data/test/files/1.email +37 -0
- data/test/files/juries_seg_8_v1 +20 -0
- data/test/nlp_spec.rb +231 -0
- data/test/term_extractor_spec.rb +141 -0
- metadata +83 -0
data/models/tok.bin.gz
ADDED
Binary file
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# Gem specification for term-extractor: a library for extracting useful terms
# from text. Lists the packaged files (Ruby sources, bundled jars, model data,
# licenses and specs) and the gem's descriptive metadata.
Gem::Specification.new do |s|
  s.name = %q{term-extractor}
  s.version = "0.0.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["David R. MacIver"]
  s.date = %q{2009-08-06}
  # default_executable= is deprecated in newer RubyGems releases, so only set
  # it when the running RubyGems still provides the writer.
  s.default_executable = %q{terms.rb} if s.respond_to? :default_executable=
  s.email = %q{david.maciver@gmail.com}
  s.executables = ["terms.rb"]
  s.extra_rdoc_files = [
    "LICENSE",
    "README.markdown"
  ]
  s.files = [
    "LICENSE",
    "README.markdown",
    "Rakefile",
    "VERSION",
    "bin/terms.rb",
    "lib/term-extractor.rb",
    "lib/term-extractor/maxent-2.5.2.jar",
    "lib/term-extractor/nlp.rb",
    "lib/term-extractor/opennlp-tools.jar",
    "lib/term-extractor/snowball.jar",
    "lib/term-extractor/trove.jar",
    "licenses/Maxent",
    "licenses/OpenNLP",
    "licenses/Trove",
    "licenses/snowball.php",
    "models/chunk.bin.gz",
    "models/sd.bin.gz",
    "models/stopwords",
    "models/tag.bin.gz",
    "models/tagdict",
    "models/tok.bin.gz",
    "term-extractor.gemspec",
    "test/examples_spec.rb",
    "test/files/1.email",
    "test/files/juries_seg_8_v1",
    "test/nlp_spec.rb",
    "test/term_extractor_spec.rb"
  ]
  # The homepage previously used the author's e-mail address as the GitHub
  # account segment, which is not a valid account path; the account name is
  # the one the published gem name is prefixed with.
  s.homepage = %q{http://github.com/DRMacIver/term-extractor}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.4}
  s.summary = %q{A library for extracting useful terms from text}
  s.test_files = [
    "test/term_extractor_spec.rb",
    "test/nlp_spec.rb",
    "test/examples_spec.rb"
  ]

  # The gem declares no dependencies, so the generated RubyGems-version
  # conditional (which referenced the legacy Gem::RubyGemsVersion constant only
  # to pick between two empty branches) is dropped; just record the
  # specification version where the attribute is supported.
  s.specification_version = 3 if s.respond_to? :specification_version
end
|
@@ -0,0 +1,131 @@
|
|
1
|
+
require "term-extractor"
|
2
|
+
|
3
|
+
PE = TermExtractor.new
|
4
|
+
|
5
|
+
Diagrams = <<DIAGRAMS
|
6
|
+
I think having nice standardised diagrams of stuff like that is REALLY
|
7
|
+
useful. One OO architect drops dead and your replacement walks in and
|
8
|
+
can pick up the documents and read them because they already speak
|
9
|
+
that language. That's a great thing. I sort of wish it had been pushed
|
10
|
+
as being that -- a lingua franca for documenting designs.
|
11
|
+
DIAGRAMS
|
12
|
+
|
13
|
+
|
14
|
+
# Example group reserved for the Diagrams text; it currently contains no
# examples.
describe "Diagram terms" do
end
|
18
|
+
|
19
|
+
Murray = <<MURRAY
|
20
|
+
The MCHS Department of Music is one of the most distinguished music programs in the State, having an award-winning choral and band program. The Marching Indians, under the direction of Mr. Mike Weaver, have performed all over the country, most recently at Universal Studios in Orlando, Disney World and the St. Patrick's Day Parade in New York City. Since 1958, the Marching Indians have been entreating fans with exciting, visually stimulating shows and their trademark deep, loud sound. Recently the Marching Indians received the Grand Championship at the 2008 Golden River Music Festival and won the first ever US101 radio battle of the bands receiving a concert by the Eli Young Band. Many students from MCHS Department of Bands have been involved with All District and All State bands as well as various summer clinics, orchestras and even the Georgia Lions All State Band.
|
21
|
+
MURRAY
|
22
|
+
# String forms of every term extracted from the Murray text, computed once
# when the spec file is loaded.
MurrayTerms = PE.extract_terms_from_text(Murray).map { |term| term.to_s }

describe "Murray terms" do
  # The abbreviation "Mr." must stay attached to the name rather than being
  # emitted with a detached period.
  it "should get Mike's name right" do
    mangled_name = "Mr . Mike Weaver"
    proper_name = "Mr. Mike Weaver"

    MurrayTerms.should_not include(mangled_name)
    MurrayTerms.should include(proper_name)
  end
end
|
30
|
+
|
31
|
+
Chromosome = <<CHROM
|
32
|
+
Humans have 23 pairs of chromosomes packed with genes that dictate every aspect of our biological functioning. Of these pairs, the sex chromosomes are different; women have two X chromosomes and men have an X and a Y chromosome. The Y chromosome contains essential blueprints for the male reproductive system, in particular those for sperm development.
|
33
|
+
|
34
|
+
But the Y chromosome, which once contained as many genes as the X chromosome, has deteriorated over time and now contains less than 80 functional genes compared to its partner, which contains more than 1,000 genes. Geneticists and evolutionary biologists determined that the Y chromosome's deterioration is due to accumulated mutations, deletions and anomalies that have nowhere to go because the chromosome doesn't swap genes with the X chromosome like every other chromosomal pair in our cells do.
|
35
|
+
CHROM
|
36
|
+
|
37
|
+
# String forms of every term extracted from the Chromosome text, computed once
# when the spec file is loaded.
ChromosomeTerms = PE.extract_terms_from_text(Chromosome).map { |term| term.to_s }

describe "Chromosome terms" do
  # A clause containing a verb should not be reported as a term.
  it "should say nothing about what humans have" do
    clause = "Humans have 23 pairs"
    ChromosomeTerms.should_not include(clause)
  end

  it "knows about the male reproductive system, if you know what I mean" do
    ["male reproductive system", "sperm development"].each do |expected|
      ChromosomeTerms.should include(expected)
    end
  end

  it "is about humans" do
    subject_term = "Humans"
    ChromosomeTerms.should include(subject_term)
  end
end
|
53
|
+
|
54
|
+
Environment = "Please consider the environment before printing this e-mail"
|
55
|
+
# Sorted string forms of the terms extracted from the Environment sentence.
environment_term_strings = PE.extract_terms_from_text(Environment).map { |term| term.to_s }
EnvironmentTerms = environment_term_strings.sort

describe "Environment terms" do
  # The hyphenated word must come through as a single term.
  it "is about email" do
    hyphenated = "e-mail"
    EnvironmentTerms.should include(hyphenated)
  end
end
|
62
|
+
|
63
|
+
Apollo = <<APOLLO
|
64
|
+
Fate has ordained that the men who went to the moon to explore in peace will stay on the moon to rest in peace.
|
65
|
+
|
66
|
+
These brave men, Neil Armstrong and Edwin Aldrin, know that there is no hope for their recovery. But they also know that there is hope for mankind in their sacrifice.
|
67
|
+
|
68
|
+
These two men are laying down their lives in mankind's most noble goal: the search for truth and understanding.
|
69
|
+
|
70
|
+
They will be mourned by their families and friends; they will be mourned by their nation; they will be mourned by the people of the world; they will be mourned by a Mother Earth that dared send two of her sons into the unknown.
|
71
|
+
|
72
|
+
In their exploration, they stirred the people of the world to feel as one; in their sacrifice, they bind more tightly the brotherhood of man.
|
73
|
+
|
74
|
+
In ancient days, men looked at stars and saw their heroes in the constellations. In modern times, we do much the same, but our heroes are epic men of flesh and blood.
|
75
|
+
|
76
|
+
Others will follow, and surely find their way home. Man's search will not be denied. But these men were the first, and they will remain the foremost in our hearts.
|
77
|
+
APOLLO
|
78
|
+
|
79
|
+
# Sorted, de-duplicated string forms of the terms extracted from the Apollo
# text, computed once when the spec file is loaded.
apollo_term_strings = PE.extract_terms_from_text(Apollo).map { |term| term.to_s }
ApolloTerms = apollo_term_strings.sort.uniq

describe "Apollo terms" do
  it "knows of Neil and Buzz" do
    ["Neil Armstrong", "Edwin Aldrin"].each do |astronaut|
      ApolloTerms.should include(astronaut)
    end
  end

  it "knows of where they've been" do
    destination = "moon"
    ApolloTerms.should include(destination)
  end

  it "knows of times past and present" do
    ["ancient days", "modern times"].each do |era|
      ApolloTerms.should include(era)
    end
  end

  it "knows of destiny" do
    destiny = "Fate"
    ApolloTerms.should include(destiny)
  end

  it "knows of searching" do
    ["exploration", "search", "Man's search"].each do |quest|
      ApolloTerms.should include(quest)
    end
  end

  it "knows not of mourning, but of courage and sacrifice" do
    # A bare verb form must not be reported as a term.
    ApolloTerms.should_not include("mourned")

    ["brave men", "sacrifice"].each do |virtue|
      ApolloTerms.should include(virtue)
    end
  end

  it "knows of brotherhood" do
    bond = "brotherhood of man"
    ApolloTerms.should include(bond)
  end

  it "knows of mankind, and of its heroes" do
    ["man", "men", "mankind", "heroes", "epic men"].each do |people|
      ApolloTerms.should include(people)
    end
  end

  it "looks to the stars from the earth" do
    ["stars", "constellations", "Mother Earth", "world"].each do |place|
      ApolloTerms.should include(place)
    end
  end
end
|
data/test/files/1.email
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
what's the point of children if you can't cut down on rickshaw charges ?
|
2
|
+
|
3
|
+
|
4
|
+
On 8 Aug 2008, at 10:48, Roderick Parks wrote:
|
5
|
+
|
6
|
+
> Now there’s a horror movie in the making. You wake up in the morning
|
7
|
+
> and find a little wire growing out of your nose.....
|
8
|
+
>
|
9
|
+
> I was quite impressed by those tricycles, given that they have two
|
10
|
+
> seats each.
|
11
|
+
>
|
12
|
+
> It would seem that rickshaw drivers start their training at an early
|
13
|
+
> age :D
|
14
|
+
>
|
15
|
+
> ~R
|
16
|
+
>
|
17
|
+
> From: lab-bounces@trampolinesystems.com [mailto:lab-bounces@trampolinesystems.com
|
18
|
+
> ] On Behalf Of Peter Biddle
|
19
|
+
> Sent: 08 August 2008 10:32
|
20
|
+
> To: lab@trampolinesystems.com
|
21
|
+
> Subject: RE: The Case for Wireless
|
22
|
+
>
|
23
|
+
> It looks like cables just reached down and grabbed those tricycles.
|
24
|
+
>
|
25
|
+
> Procreation and then sentience cannot be far off…
|
26
|
+
>
|
27
|
+
> P
|
28
|
+
>
|
29
|
+
>
|
30
|
+
> _______________________________________________
|
31
|
+
> Lab mailing list
|
32
|
+
> Lab@trampolinesystems.com
|
33
|
+
> http://zimbra.trampolinesystems.com/mailman/listinfo/lab
|
34
|
+
|
35
|
+
|
36
|
+
|
37
|
+
|
@@ -0,0 +1,20 @@
|
|
1
|
+
OLATI JOHNSON [01:54:39]
|
2
|
+
THERE ARE LAWS IN THIS COUNTRY THAT WE HAVE ALL VOTED ON EITHER DIRECTLY OR THROUGH OUR REPRESENTATIVES. AND THE PROSECUTOR REPRESENTS THE PEOPLE OR THE UNITED STATES GOVERNMENT AND IS CHARGED WITH ENFORCING THOSE LAWS. AND SO THAT IS THE PROSECUTOR�S DUTY IS TO CHARGE PEOPLE AND BRING WORTHY CASES TO TRIAL WHEN PEOPLE HAVE VIOLATED THESE LAWS THAT WE�VE ALL AGREED SHOULD BE IN -- IN PLACE.
|
3
|
+
Here are some things prosecutors would like jurors to know.
|
4
|
+
JAMES TIERNEY [01:28:57]:
|
5
|
+
PROSECUTOR REALLY HAS TWO JOBS. ONE JOB A JUROR WILL SEE IN THE COURTROOM AND THAT IS TO MAKE THE ARGUMENT THAT THE PERSON WHO IS ACCUSED OF CRIME IS -- IS GUILTY AND SHOULD BE APPROPRIATELY PUNISHED. PROSECUTOR HAS ANOTHER IMPORTANT JOB, WHICH OCCURS OUTSIDE THE COURTROOM, AND THAT IS TO MAKE SURE THAT EVERYONE PLAYS BY THE RULES. // [03:51:18] BECAUSE IT'S NOT A PROSECUTOR'S JOB JUST TO SECURE GUILT. A PROSECUTOR HAS TO STAND UP FOR JUSTICE
|
6
|
+
PAUL RADVANY [4:40:15]
|
7
|
+
THE ROLE OF THE PROSECUTOR, UM, IN OUR CRIMINAL JUSTICE SYSTEM BEGINS WITH THE INVESTIGATION OFTEN. AND SO THEY WORK WITH LAW ENFORCEMENT, UM, TO TRY AND INVESTIGATE THE FACTS. AND DETERMINE WHETHER OR NOT SOMEONE SHOULD BE CHARGED WITH A CRIME.
|
8
|
+
So during the trial, just like the judge and the defense, the prosecution wants a juror�s undivided attention.
|
9
|
+
JAMES TIERNEY [4:24:40]
|
10
|
+
WHEN THE PROSECUTOR PRESENTS EVIDENCE, UH, HE OR SHE WANTS THE JURORS TO PAY ATTENTION AND TO LISTEN.
|
11
|
+
RADVANY [4:25:53]
|
12
|
+
AS YOU PRESENT EVIDENCE YOU WANNA MAKE SURE THAT THE JURORS ARE LISTENING VERY CAREFULLY. THAT THEY KEEP AN OPEN MIND. THAT THEY�RE JUDGING WITNESSES� CREDIBILITY. AND THAT, UM, ULTIMATELY THEY�LL BE DE-- DETERMINE, UM, INFERENCES THAT CAN BE DRAWN THROUGH THE -- FROM THE EVIDENCE.
|
13
|
+
Once jurors are ready to deliberate, prosecutors would hope that jurors have committed their full attention to the evidence presented during the trial.
|
14
|
+
JAMES TIERNEY [04:04:44]
|
15
|
+
SO THE FIRST THING A JUROR HAS TO DO IS LISTEN VERY, VERY CLOSELY TO THE JUDGE BEFORE THEY RETIRE// BUT, UH, EITHER A PROSECUTOR OR A DEFENSE COUNSEL I'M SURE WOULD WANT THE JURORS TO CAREFULLY GO OVER THE EVIDENCE, TO DISCUSS THE ISSUE AMONG THEMSELVES, IF THEY HAVE QUESTIONS, TO SUBMIT QUESTIONS BACK TO THE JUDGE DURING THE COURSE OF THEIR DELIBERATIONS, AND THEN TO WORK HARD TO COME UP WITH THE RIGHT ANSWER. IT'S -- IT'S IMPORTANT THAT THE JURORS FEEL COMFORTABLE WITH THEIR DECISION.
|
16
|
+
And just like the judge and the defense attorney, the prosecutor, would also like to remind jurors that every defendant must be presumed innocent until proven guilty.
|
17
|
+
JAMES TIERNEY [04:00:05]
|
18
|
+
OUR CONSTITUTION IS BASED ON A BILL OF RIGHTS, AND IT'S BASED ON -- ON A PRESUMPTION, THAT A GOVERNMENT SHOULD NOT DEPRIVE SOMEONE OF THEIR LIBERTY UNLESS THEY CAN TRULY PROVE WHAT, UH, WHAT OCCURRED, AND //ONE OF THE ELEMENTS OF THAT IS TO PRESUME THAT THAT PERSON WHO SITS THERE IN THE COURTROOM ON THE FIRST DAY OF TRIAL IS JUST AS INNOCENT AS ANY JUROR OR THE JUDGE OR THE PROSECUTOR OR DEFENSE COUNSEL. SO YOU PRESUME THE INNOCENCE, AND THEN IT'S UP TO THE STATE, THE GOVERNMENT T-- Y-- WORKING THROUGH THE PROSECUTOR TO PROVE THAT THAT IS NOT TRUE.
|
19
|
+
RADVANY [4:11:15]
|
20
|
+
THE ROLE OF THE PROSECUTOR IS VERY IMPORTANT IN SOCIETY OBVIOUSLY. // AND YOUR GOAL IS TO SEEK JUSTICE -- NOT NECESSARILY CONVICTIONS. BECAUSE YOU LEAVE IT UP TO THE JURY UNDER OUR SYSTEM TO DETERMINE WHETHER SOMEONE // IS GUILTY BEYOND A REASONABLE DOUBT.
|
data/test/nlp_spec.rb
ADDED
@@ -0,0 +1,231 @@
|
|
1
|
+
require "term-extractor/nlp"
|
2
|
+
require "rubygems"
|
3
|
+
require "rake"
|
4
|
+
|
5
|
+
NLP = TermExtractor::NLP
|
6
|
+
MyNLP = NLP.new("#{File.dirname(__FILE__)}/../models")
|
7
|
+
|
8
|
+
# Expects the tokenizer to hand +text+ back as one single token.
#
# Caveat carried over from the original author: this may not be quite the
# right check, because punctuation is currently allowed to be stripped from
# the beginning and end of the text -- perhaps it should not be.
def dont_split(text)
  tokens = NLP.tokenize_sentence(text)
  tokens.map { |token| token.to_s }.should == [text]
end
|
13
|
+
|
14
|
+
|
15
|
+
# Expects the sentence splitter to return +text+, unchanged, as its only
# sentence.
def one_sentence(text)
  found = MyNLP.sentences(text)
  found.should == [text]
end
|
18
|
+
|
19
|
+
# Expects the sentence splitter to find exactly +n+ sentences in +text+.
def n_sentences(n, text)
  MyNLP.sentences(text).should have_exactly(n).sentences
end
|
23
|
+
|
24
|
+
# Examples for MyNLP.sentences, driven through the one_sentence and
# n_sentences helpers.
describe "sentence splitting" do
  it "should not split sentences around URLs" do
    with_url = "If you go to http://www.google.com and type 'kitties' you will get lots of kitties"
    one_sentence(with_url)
  end

  it "should split sentences with abbreviations sensibly" do
    with_abbreviation = "Dr. Smith likes kitties"
    one_sentence(with_abbreviation)
  end

  it "should produce two sentences when there are line breaks" do
    # Header-like text separated by CRLF line breaks.
    header_text = "Posting Date: November 8, 2008 \r\n Release Date: January, 1998\r\n \r\n Language: English"
    n_sentences(2, header_text)

    # Two statements separated by a blank line.
    blank_line_text = <<KITTIES
I like kitties

I like puppies
KITTIES
    n_sentences(2, blank_line_text)
  end
end
|
46
|
+
|
47
|
+
# Examples for NLP.remove_urls: each URL in the input is expected to be
# replaced by the literal placeholder "<URL>" wherever it appears.
describe "url removal" do
  it "should replace URLs in the middle of sentences" do
    NLP.remove_urls("I like the links you find at http://www.google.com when searching for kitties").should == "I like the links you find at <URL> when searching for kitties"
  end

  it "should replace URLs at the beginning of sentences" do
    NLP.remove_urls("http://www.google.com is your number one source for kitties").should == "<URL> is your number one source for kitties"
  end

  it "should replace URLs at the end of sentences" do
    NLP.remove_urls("When I want kitties I go to http://www.google.com").should == "When I want kitties I go to <URL>"
  end

  # The sentence-ending period directly after the URL must survive the
  # replacement. (Description previously misspelled "should".)
  it "should replace URLs between sentences" do
    NLP.remove_urls("The number one kitty finding service is http://www.google.com. Accept no substitutes").should == "The number one kitty finding service is <URL>. Accept no substitutes"
  end
end
|
64
|
+
|
65
|
+
# Examples for NLP.remove_paths: a filesystem path in the input is expected to
# be replaced by the literal placeholder "<PATH>".
describe "path removal" do
  it "should remove windows style paths" do
    sentence = "C:\\Home\\Windows\\Nonsense\\Kitties is where windows people store kitty related material"
    cleaned = NLP.remove_paths(sentence)

    cleaned.should == "<PATH> is where windows people store kitty related material"
  end

  it "should remove windows style paths with spaces in them" do
    sentence = "C:\\Documents and Settings\\Kitty is the kitty's home directory"
    cleaned = NLP.remove_paths(sentence)

    cleaned.should == "<PATH> is the kitty's home directory"
  end

  it "should remove unix style paths" do
    sentence = "/home/david/kitties is where *I* store kitty related material"
    cleaned = NLP.remove_paths(sentence)

    cleaned.should == "<PATH> is where *I* store kitty related material"
  end
end
|
82
|
+
|
83
|
+
# Examples for NLP.extract_embedded_sentences. The result is destructured as
# the main sentence followed by the embedded fragments (quotations and
# parenthetical comments).
describe "extracting embedded terms" do
  it "should replace quotes with <QUOTE>" do
    quoted = "\"I like kitties\", she declared"

    outer, inner = NLP.extract_embedded_sentences(quoted)

    outer.should == "<QUOTE>, she declared"
    inner.should == "I like kitties"
  end

  it "should replace parenthetical comments with an empty string" do
    outer, inner = NLP.extract_embedded_sentences("I like kitties (especially fuzzy ones)")

    outer.should == "I like kitties "
    inner.should == "especially fuzzy ones"
  end

  it "should correctly deal with multiple nested parenthetical comments" do
    nested = "I like kitties (especially fuzzy ones (but the long haired ones are kinda ugly))"

    outer, first, second = NLP.extract_embedded_sentences(nested)

    # The innermost comment is expected before the one that encloses it.
    outer.should == "I like kitties "
    first.should == "but the long haired ones are kinda ugly"
    second.should == "especially fuzzy ones "
  end

  it "should correctly deal with multiple non nested parenthetical comments" do
    adjacent = "I like kitties (especially fuzzy ones)(but the long haired ones are kinda ugly)"

    outer, first, second = NLP.extract_embedded_sentences(adjacent)

    outer.should == "I like kitties "
    first.should == "especially fuzzy ones"
    second.should == "but the long haired ones are kinda ugly"
  end

  it "should not extract a subterm when it is not matched" do
    unbalanced = "She declared \" I like kitties"
    NLP.extract_embedded_sentences(unbalanced).should have(1).fragment
  end

  it "should not extract a subterm when it would have to span multiple lines to do so " do
    multiline = <<KITTIES
I like kitties (they are
the best)
KITTIES

    NLP.extract_embedded_sentences(multiline).should == [multiline]
  end
end
|
131
|
+
|
132
|
+
# Examples for NLP.tokenize_sentence.
describe "tokenization" do
  it "should not split up URLs" do
    dont_split("http://www.theonion.com/content/news/female_serial_killer_has_to_work")
  end

  it "should not split up URLs with -s in them" do
    dont_split("http://www.amazon.com/Fierce-Conversations-Acheiving-Success-Conversation/dp/0670031240")
  end

  it "should not split up emails" do
    dont_split("david.maciver@trampolinesystems.com")
  end

  it "should split up contractions" do
    tokens = NLP.tokenize_sentence("I'm the very model of a modern major general")
    tokens.should == ["I", "'m", "the", "very", "model", "of", "a", "modern", "major", "general"]
  end

  it "should split sentences around ellipses" do
    # An ellipsis joining two words is expected to come back as a comma token.
    tokens = NLP.tokenize_sentence("I like kitties...puppies are ok too")
    tokens.should == ["I", "like", "kitties", ",", "puppies", "are", "ok", "too"]
  end

  it "shouldn't split paths containing .." do
    dont_split("/home/david/cute/puppies/../kitties/pictures")
  end

  it "should pull a sentence terminator into its own token" do
    tokens = NLP.tokenize_sentence("You don't like kitties?!?")
    tokens[-1].should == "?!?"
  end

  it "should detach punctuation as a separate token" do
    tokens = NLP.tokenize_sentence("babies... the other white meat")
    tokens[1].should == "..."
  end

  # Expects that tokenizing +text+ never yields +term+ as one of its tokens.
  def dont_produce_token(text, term)
    NLP.tokenize_sentence(text).should_not include(term)
  end

  it "should not split numbers around commas" do
    dont_produce_token("the reasons for selecting opengl rather than prefuse were to visualise >10,000 nodes and do 3d", "000")
  end

  it "should pull commas off the ends of tokens" do
    dont_produce_token("kitties, puppies and birdies are all cute", "kitties,")
  end
end
|
180
|
+
|
181
|
+
# Examples for NLP.clean_sentence.
describe "cleaning" do
  it "should remove stars trailing or leading a word" do
    starred = "Should that really take 5 minutes *over a network*"
    NLP.clean_sentence(starred).should == "Should that really take 5 minutes , over a network"
  end

  it "should turn quotes into commas" do
    quoted = "I read \"Why kitties are cute\" over the summer"
    NLP.clean_sentence(quoted).should == "I read , Why kitties are cute , over the summer"
  end

  it "should remove all new lines" do
    broken_up = "
This sentence
has lots of
line breaks
in it
"
    cleaned = NLP.clean_sentence(broken_up)

    # The pattern matches a newline or a literal period; neither may remain.
    (cleaned =~ /\n|\./).should == nil
  end
end
|
200
|
+
|
201
|
+
# Expects +foo+ and +bar+ to have equal canonical forms according to
# MyNLP.canonicalize.
def equate(foo, bar)
  canonical_foo = MyNLP.canonicalize(foo)
  canonical_bar = MyNLP.canonicalize(bar)
  canonical_foo.should == canonical_bar
end
|
204
|
+
|
205
|
+
# Examples for MyNLP.canonicalize, each asserting via the equate helper that
# two phrases share one canonical form.
describe "canonicalization" do
  it "should identify plurals" do
    plural, singular = "kitties", "kitty"
    equate(plural, singular)
  end

  it "should identify strings that differ only in non alphanumeric characters" do
    slashed, spaced = "foo/bar/baz", "foo bar baz"
    equate(slashed, spaced)
  end

  it "should be insensitive to order" do
    original, reordered = "foo bar baz", "bar foo baz"
    equate(original, reordered)
  end

  it "should ignore stopwords" do
    with_stopword, without_stopword = "programming in java", "java programming"
    equate(with_stopword, without_stopword)
  end
end
|
222
|
+
|
223
|
+
# Examples for MyNLP.stopword?, which must answer exactly +true+ for the
# article "a" in either letter case.
describe "stopword detection" do
  it "should mark a as a stopword" do
    lowercase = "a"
    MyNLP.stopword?(lowercase).should be(true)
  end

  it "should not be fooled by capitalisation" do
    uppercase = "A"
    MyNLP.stopword?(uppercase).should be(true)
  end
end
|