raingrams 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,111 @@
1
+ require 'raingrams/bigram_model'
2
+
3
+ require 'spec_helper'
4
+ require 'model_examples'
5
+
6
# Specs for BigramModel. The model is trained once, in before(:all), on the
# :snowcrash training text and shared by every example below.
describe BigramModel do
  before(:all) do
    @model = BigramModel.train_with_text(Training.text_for(:snowcrash))
  end

  it_should_behave_like "Model"

  it "should return ngrams from specified words" do
    words = %w{Why is the Deliverator so equipped}
    ngrams = [
      Ngram[:Why, :is],
      Ngram[:is, :the],
      Ngram[:the, :Deliverator],
      Ngram[:Deliverator, :so],
      Ngram[:so, :equipped]
    ]

    @model.ngrams_from_words(words).should == ngrams
  end

  it "should return common ngrams from words" do
    words = %w{The Deliverator is a future Archetype}
    ngrams = [
      Ngram[:The, :Deliverator],
      Ngram[:Deliverator, :is],
      Ngram[:is, :a]
    ]

    @model.common_ngrams_from_words(words).should == ngrams
  end

  it "should return common ngrams from a specified fragment of text" do
    fragment = %{The Deliverator is a future Archetype}
    ngrams = [
      Ngram[:The, :Deliverator],
      Ngram[:Deliverator, :is],
      Ngram[:is, :a]
    ]

    @model.common_ngrams_from_fragment(fragment).should == ngrams
  end

  it "should return common ngrams from a specified sentence" do
    sentence = %{The Deliverator is a future Archetype.}
    ngrams = [
      Ngram[Tokens.start, Tokens.start],
      Ngram[Tokens.start, :The],
      Ngram[:The, :Deliverator],
      Ngram[:Deliverator, :is],
      Ngram[:is, :a],
      Ngram[Tokens.stop, Tokens.stop]
    ]

    @model.common_ngrams_from_sentence(sentence).should == ngrams
  end

  it "should have a frequency for a specified ngram" do
    ngram = Ngram[:teensy, :darts]

    @model.frequency_of_ngram(ngram).should == 1
  end

  it "should have a probability for a specified ngram" do
    ngram = Ngram[:teensy, :darts]

    @model.probability_of_ngram(ngram).should == 1.0
  end

  it "should have a frequency for specified ngrams" do
    ngrams = NgramSet[
      Ngram[:but, :excess],
      Ngram[:freshly, :napalmed],
      Ngram[:sintered, :armorgel]
    ]

    @model.frequency_of_ngrams(ngrams).should == 3
  end

  # The probability examples below compare numerically rather than against
  # Float#to_s output, because the number of digits Float#to_s prints depends
  # on the Ruby version. Each tolerance keeps the 15 significant digits of the
  # expected value.

  it "should have a probability of specified ngrams" do
    ngrams = NgramSet[
      Ngram[:The, :Deliverator],
      Ngram[:Deliverator, :belongs],
      Ngram[:belongs, :to]
    ]

    @model.probability_of_ngrams(ngrams).should be_close(0.0112293144208038, 1e-16)
  end

  it "should have a probability for a specified fragment of text" do
    fragment = %{The Deliverator belongs to}

    @model.fragment_probability(fragment).should be_close(0.0112293144208038, 1e-16)
  end

  it "should have a probability for a specified sentence" do
    sentence = %{The Deliverator used to make software.}

    @model.sentence_probability(sentence).should be_close(4.10042780102381e-07, 1e-21)
  end

  it "should have a probability for specified text" do
    text = %{The Deliverator used to make software. Still does, sometimes.}

    @model.text_probability(text).should be_close(2.40635434332383e-10, 1e-24)
  end
end
@@ -0,0 +1,8 @@
1
# Spec helper for loading the training texts used to build models.
module Training
  # Reads the training text stored as "<name>.txt" inside the "training"
  # directory located one level above the directory containing this file.
  #
  # The name is only interpolated into the file name, so any object with a
  # sensible #to_s (Symbol, String, ...) is accepted.
  #
  # @param name [#to_s] base name of the training file, e.g. :snowcrash
  # @return [String] the entire contents of the training file
  # @raise [SystemCallError] e.g. Errno::ENOENT when the file does not exist
  def self.text_for(name)
    training_dir = File.expand_path(File.join('..', 'training'), File.dirname(__FILE__))

    File.read(File.join(training_dir, "#{name}.txt"))
  end
end
data/spec/helpers.rb ADDED
@@ -0,0 +1 @@
1
+ require 'helpers/training'
@@ -0,0 +1,83 @@
1
+ require 'spec_helper'
2
+
3
# Shared examples for a trained model. The including example group must
# assign the model under test to @model before these examples run.
shared_examples_for "Model" do
  it "should have ngrams" do
    @model.ngrams.each { |known| @model.has_ngram?(known).should == true }
  end

  it "should be able to iterate through all ngrams" do
    @model.each_ngram { |visited| @model.has_ngram?(visited).should == true }
  end

  it "should be able to select ngrams with certain properties" do
    # Select with a predicate, then confirm every result satisfies it.
    selected = @model.ngrams_with { |candidate| candidate.include?(:the) }

    selected.each { |match| match.include?(:the).should == true }
  end

  it "should be able to select ngrams starting with a specified gram" do
    matches = @model.ngrams_starting_with(:filtering)

    matches.each { |match| match.starts_with?(:filtering).should == true }
  end

  it "should be able to select ngrams ending with a specified gram" do
    matches = @model.ngrams_ending_with(:sword)

    matches.each { |match| match.ends_with?(:sword).should == true }
  end

  it "should be able to select ngrams including any of the specified grams" do
    matches = @model.ngrams_including_any(:The, :Deliverator)

    matches.each { |match| match.includes_any?(:The, :Deliverator).should == true }
  end

  it "should be able to select ngrams including all of the specified grams" do
    matches = @model.ngrams_including_all(:activated, :charcoal)

    matches.each { |match| match.includes_all?(:activated, :charcoal).should == true }
  end

  it "should have grams" do
    @model.grams.each { |known| @model.has_gram?(known).should == true }
  end

  it "should provide a random ngram" do
    picked = @model.random_ngram

    @model.has_ngram?(picked).should == true
  end

  # The three examples below generate random output and check that every
  # ngram parsed back out of it is one the model already has.

  it "should generate a random sentence" do
    generated = @model.random_sentence

    @model.ngrams_from_sentence(generated).each do |parsed|
      @model.has_ngram?(parsed).should == true
    end
  end

  it "should generate a random paragraph" do
    generated = @model.random_paragraph

    @model.ngrams_from_paragraph(generated).each do |parsed|
      @model.has_ngram?(parsed).should == true
    end
  end

  it "should generate a random text" do
    generated = @model.random_text

    @model.ngrams_from_text(generated).each do |parsed|
      @model.has_ngram?(parsed).should == true
    end
  end
end
@@ -0,0 +1,118 @@
1
+ require 'spec_helper'
2
+
3
+ require 'raingrams/model'
4
+
5
# Specs for the text-parsing behaviour of Model. Several models with an ngram
# size of 2 are built once, each with a different parsing option set.
describe Model do
  before(:all) do
    # Model using only the default parsing options.
    @model = Model.new(:ngram_size => 2)

    # One model per non-default parsing option exercised below.
    @phone_number_model = Model.new(:ngram_size => 2, :ignore_phone_numbers => true)
    @references_model = Model.new(:ngram_size => 2, :ignore_references => true)
    @case_model = Model.new(:ngram_size => 2, :ignore_case => true)
    @punctuation_model = Model.new(:ngram_size => 2, :ignore_punctuation => false)
  end

  it "should parse text into sentences" do
    input = %{The Deliverator belongs to an elite order, a hallowed sub-category. He's got esprit up to here.}
    expected = [
      "The Deliverator belongs to an elite order, a hallowed sub-category.",
      "He's got esprit up to here."
    ]

    @model.parse_text(input).should == expected
  end

  it "should parse words from a sentence" do
    input = %{The Deliverator is in touch with the road, starts like a bad day, stops on a peseta.}
    expected = %w{The Deliverator is in touch with the road starts like a bad day stops on a peseta}

    @model.parse_sentence(input).should == expected
  end

  it "should ignore URLs by default while parsing a sentence" do
    input = %{Click on the following link: http://www.example.com/}
    expected = %w{Click on the following link}

    @model.parse_sentence(input).should == expected
  end

  it "should ignore short URIs by default while parsing a sentence" do
    input = %{Click on the following link: jabber://}
    expected = %w{Click on the following link}

    @model.parse_sentence(input).should == expected
  end

  it "should ignore complex HTTP URLs by default while parsing a sentence" do
    input = %{Click on the following link: http://www.google.com/search?hl=en&client=firefox-a&rls=org.mozilla:en-US:official&hs=jU&q=ruby+datamapper&start=20&sa=N}
    expected = %w{Click on the following link}

    @model.parse_sentence(input).should == expected
  end

  it "may ignore phone numbers while parsing a sentence" do
    input = %{Call me before 12, 1-888-444-2222.}
    expected = %w{Call me before 12}

    @phone_number_model.parse_sentence(input).should == expected
  end

  it "may ignore long-distance phone numbers while parsing a sentence" do
    input = %{Call me before 12, 1-444-2222.}
    expected = %w{Call me before 12}

    @phone_number_model.parse_sentence(input).should == expected
  end

  it "may ignore short phone numbers while parsing a sentence" do
    input = %{Call me before 12, 444-2222.}
    expected = %w{Call me before 12}

    @phone_number_model.parse_sentence(input).should == expected
  end

  it "may ignore RFC style references while parsing a sentence" do
    input = %{As one can see, it has failed [1].}
    expected = %w{As one can see it has failed}

    @references_model.parse_sentence(input).should == expected
  end

  it "should ignore punctuation by default while parsing a sentence" do
    input = %{Oh, they used to argue over times, many corporate driver-years lost to it: homeowners, red-faced and sweaty with their own lies, stinking of Old Spice and job-related stress, standing in their glowing yellow doorways brandishing their Seikos and waving at the clock over the kitchen sink, I swear, can't you guys tell time?}
    expected = %w{Oh they used to argue over times many corporate driver-years lost to it homeowners red-faced and sweaty with their own lies stinking of Old Spice and job-related stress standing in their glowing yellow doorways brandishing their Seikos and waving at the clock over the kitchen sink I swear can't you guys tell time}

    @model.parse_sentence(input).should == expected
  end

  it "may ignore case while parsing a sentence" do
    input = %{The Deliverator is in touch with the road, starts like a bad day, stops on a peseta.}
    expected = %w{the deliverator is in touch with the road starts like a bad day stops on a peseta}

    @case_model.parse_sentence(input).should == expected
  end

  it "may preserve punctuation while parsing a sentence" do
    input = %{Oh, they used to argue over times, many corporate driver-years lost to it: homeowners, red-faced and sweaty with their own lies, stinking of Old Spice and job-related stress, standing in their glowing yellow doorways brandishing their Seikos and waving at the clock over the kitchen sink, I swear, can't you guys tell time?}
    expected = %w{Oh, they used to argue over times, many corporate driver-years lost to it: homeowners, red-faced and sweaty with their own lies, stinking of Old Spice and job-related stress, standing in their glowing yellow doorways brandishing their Seikos and waving at the clock over the kitchen sink, I swear, can't you guys tell time?}

    @punctuation_model.parse_sentence(input).should == expected
  end
end
@@ -46,9 +46,18 @@ describe NgramSet do
46
46
  ]
47
47
  end
48
48
 
49
- it "should select ngrams which includes specified grams" do
50
- @ngrams.includes(:the, :dog).should == NgramSet[
49
+ it "should select ngrams which include any of the specified grams" do
50
+ @ngrams.including_any(:the, :dog).should == NgramSet[
51
51
  Ngram[:the, :dog],
52
+ Ngram[:dog, :jumped],
53
+ Ngram[:through, :the],
54
+ Ngram[:the, :hoop]
55
+ ]
56
+ end
57
+
58
+ it "should select ngrams which include all of the specified grams" do
59
+ @ngrams.including_all(:the, :dog).should == NgramSet[
60
+ Ngram[:the, :dog]
52
61
  ]
53
62
  end
54
63
  end
data/spec/ngram_spec.rb CHANGED
@@ -24,6 +24,6 @@ describe Ngram do
24
24
  end
25
25
 
26
26
  it "should include certain grams" do
27
- @ngram.includes?(:one, :three).should == true
27
+ @ngram.includes_all?(:one, :three).should == true
28
28
  end
29
29
  end
@@ -0,0 +1,101 @@
1
+ require 'raingrams/pentagram_model'
2
+
3
+ require 'spec_helper'
4
+ require 'model_examples'
5
+
6
# Specs for PentagramModel. The model is built once, in before(:all), and
# trained on the :snowcrash training text; every example below shares it.
describe PentagramModel do
  before(:all) do
    @model = PentagramModel.build do |model|
      model.train_with_text(Training.text_for(:snowcrash))
    end
  end

  it_should_behave_like "Model"

  it "should return ngrams from specified words" do
    words = %w{Why is the Deliverator so equipped}
    ngrams = [
      Ngram[:Why, :is, :the, :Deliverator, :so],
      Ngram[:is, :the, :Deliverator, :so, :equipped]
    ]

    @model.ngrams_from_words(words).should == ngrams
  end

  it "should return common ngrams from words" do
    words = %w{The Deliverator is a future Archetype}
    ngrams = []

    @model.common_ngrams_from_words(words).should == ngrams
  end

  it "should return common ngrams from a specified fragment of text" do
    fragment = %{The Deliverator is a future Archetype}
    ngrams = []

    @model.common_ngrams_from_fragment(fragment).should == ngrams
  end

  it "should return common ngrams from a specified sentence" do
    sentence = %{The Deliverator is a future Archetype.}
    ngrams = [
      Ngram[Tokens.start, Tokens.start, Tokens.start, Tokens.start, Tokens.start],
      Ngram[Tokens.start, Tokens.start, Tokens.start, Tokens.start, :The],
      Ngram[Tokens.start, Tokens.start, Tokens.start, :The, :Deliverator],
      Ngram[Tokens.start, Tokens.start, :The, :Deliverator, :is],
      Ngram[Tokens.start, :The, :Deliverator, :is, :a],
      Ngram[Tokens.stop, Tokens.stop, Tokens.stop, Tokens.stop, Tokens.stop]
    ]

    @model.common_ngrams_from_sentence(sentence).should == ngrams
  end

  it "should have a frequency for a specified ngram" do
    ngram = Ngram[:it, :fires, :teensy, :darts, :that]

    @model.frequency_of_ngram(ngram).should == 1
  end

  it "should have a probability for a specified ngram" do
    ngram = Ngram[:it, :fires, :teensy, :darts, :that]

    @model.probability_of_ngram(ngram).should == 1.0
  end

  it "should have a frequency for specified ngrams" do
    ngrams = NgramSet[
      Ngram[:but, :excess, :perspiration, :wafts, :through],
      Ngram[:through, :a, :freshly, :napalmed, :forest],
      Ngram[:the, :suit, :has, :sintered, :armorgel]
    ]

    @model.frequency_of_ngrams(ngrams).should == 3
  end

  # The probability examples below compare numerically rather than against
  # Float#to_s output, because the number of digits Float#to_s prints depends
  # on the Ruby version. Each tolerance keeps the 15 significant digits of the
  # expected value.

  it "should have a probability of specified ngrams" do
    ngrams = NgramSet[
      Ngram[:The, :Deliverator, :belongs, :to, :an],
      Ngram[:Deliverator, :belongs, :to, :an, :elite]
    ]

    @model.probability_of_ngrams(ngrams).should be_close(1.0, 1e-15)
  end

  it "should have a probability for a specified fragment of text" do
    fragment = %{The Deliverator belongs to an}

    @model.fragment_probability(fragment).should be_close(1.0, 1e-15)
  end

  it "should have a probability for a specified sentence" do
    sentence = %{So now he has this other job.}

    @model.sentence_probability(sentence).should be_close(0.00117370892018779, 1e-17)
  end

  it "should have a probability for specified text" do
    text = %{So now he has this other job. No brightness or creativity involved-but no cooperation either.}

    @model.text_probability(text).should be_close(2.75518525865679e-06, 1e-20)
  end
end
@@ -0,0 +1,106 @@
1
+ require 'raingrams/quadgram_model'
2
+
3
+ require 'spec_helper'
4
+ require 'model_examples'
5
+
6
# Specs for QuadgramModel. The model is built once, in before(:all), and
# trained on the :snowcrash training text; every example below shares it.
describe QuadgramModel do
  before(:all) do
    @model = QuadgramModel.build do |model|
      model.train_with_text(Training.text_for(:snowcrash))
    end
  end

  it_should_behave_like "Model"

  it "should return ngrams from specified words" do
    words = %w{Why is the Deliverator so equipped}
    ngrams = [
      Ngram[:Why, :is, :the, :Deliverator],
      Ngram[:is, :the, :Deliverator, :so],
      Ngram[:the, :Deliverator, :so, :equipped]
    ]

    @model.ngrams_from_words(words).should == ngrams
  end

  it "should return common ngrams from words" do
    words = %w{The Deliverator is a future Archetype}
    ngrams = [
      Ngram[:The, :Deliverator, :is, :a]
    ]

    @model.common_ngrams_from_words(words).should == ngrams
  end

  it "should return common ngrams from a specified fragment of text" do
    fragment = %{The Deliverator is a future Archetype}
    ngrams = [
      Ngram[:The, :Deliverator, :is, :a]
    ]

    @model.common_ngrams_from_fragment(fragment).should == ngrams
  end

  it "should return common ngrams from a specified sentence" do
    sentence = %{The Deliverator is a future Archetype.}
    ngrams = [
      Ngram[Tokens.start, Tokens.start, Tokens.start, Tokens.start],
      Ngram[Tokens.start, Tokens.start, Tokens.start, :The],
      Ngram[Tokens.start, Tokens.start, :The, :Deliverator],
      Ngram[Tokens.start, :The, :Deliverator, :is],
      Ngram[:The, :Deliverator, :is, :a],
      Ngram[Tokens.stop, Tokens.stop, Tokens.stop, Tokens.stop]
    ]

    @model.common_ngrams_from_sentence(sentence).should == ngrams
  end

  it "should have a frequency for a specified ngram" do
    ngram = Ngram[:it, :fires, :teensy, :darts]

    @model.frequency_of_ngram(ngram).should == 1
  end

  it "should have a probability for a specified ngram" do
    ngram = Ngram[:it, :fires, :teensy, :darts]

    @model.probability_of_ngram(ngram).should == 1.0
  end

  it "should have a frequency for specified ngrams" do
    ngrams = NgramSet[
      Ngram[:but, :excess, :perspiration, :wafts],
      Ngram[:a, :freshly, :napalmed, :forest],
      Ngram[:suit, :has, :sintered, :armorgel]
    ]

    @model.frequency_of_ngrams(ngrams).should == 3
  end

  # The probability examples below compare numerically rather than against
  # Float#to_s output, because the number of digits Float#to_s prints depends
  # on the Ruby version. Each tolerance keeps the 15 significant digits of the
  # expected value.

  it "should have a probability of specified ngrams" do
    ngrams = NgramSet[
      Ngram[:The, :Deliverator, :belongs, :to],
      Ngram[:Deliverator, :belongs, :to, :an]
    ]

    @model.probability_of_ngrams(ngrams).should be_close(1.0, 1e-15)
  end

  it "should have a probability for a specified fragment of text" do
    fragment = %{The Deliverator belongs to}

    @model.fragment_probability(fragment).should be_close(1.0, 1e-15)
  end

  it "should have a probability for a specified sentence" do
    sentence = %{So now he has this other job.}

    @model.sentence_probability(sentence).should be_close(0.00117370892018779, 1e-17)
  end

  it "should have a probability for specified text" do
    text = %{So now he has this other job. No brightness or creativity involved-but no cooperation either.}

    @model.text_probability(text).should be_close(2.75518525865679e-06, 1e-20)
  end
end
data/spec/spec_helper.rb CHANGED
@@ -3,3 +3,5 @@ gem 'rspec', '>=1.1.3'
3
3
  require 'spec'
4
4
 
5
5
  include Raingrams
6
+
7
+ require 'helpers'