raingrams 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,111 @@
1
+ require 'raingrams/bigram_model'
2
+
3
+ require 'spec_helper'
4
+ require 'model_examples'
5
+
6
# Specs for BigramModel. One model is trained on the :snowcrash training
# text before the examples run and is shared by all of them via @model.
describe BigramModel do
  before(:all) do
    corpus = Training.text_for(:snowcrash)
    @model = BigramModel.train_with_text(corpus)
  end

  # Pull in the examples that every model spec shares.
  it_should_behave_like "Model"

  it "should return ngrams from specified words" do
    input = %w[Why is the Deliverator so equipped]
    # One ngram per adjacent pair of input words, in input order.
    expected = [
      Ngram[:Why, :is],
      Ngram[:is, :the],
      Ngram[:the, :Deliverator],
      Ngram[:Deliverator, :so],
      Ngram[:so, :equipped]
    ]

    @model.ngrams_from_words(input).should == expected
  end

  it "should return common ngrams from words" do
    input = %w[The Deliverator is a future Archetype]
    # Only the first three pairs are expected back; the pairs that involve
    # "future" and "Archetype" are not.
    expected = [
      Ngram[:The, :Deliverator],
      Ngram[:Deliverator, :is],
      Ngram[:is, :a]
    ]

    @model.common_ngrams_from_words(input).should == expected
  end

  it "should return common ngrams from a specified fragment of text" do
    input = "The Deliverator is a future Archetype"
    expected = [
      Ngram[:The, :Deliverator],
      Ngram[:Deliverator, :is],
      Ngram[:is, :a]
    ]

    @model.common_ngrams_from_fragment(input).should == expected
  end

  it "should return common ngrams from a specified sentence" do
    input = "The Deliverator is a future Archetype."
    # For a whole sentence the expected list also carries ngrams built
    # from the start and stop tokens.
    expected = [
      Ngram[Tokens.start, Tokens.start],
      Ngram[Tokens.start, :The],
      Ngram[:The, :Deliverator],
      Ngram[:Deliverator, :is],
      Ngram[:is, :a],
      Ngram[Tokens.stop, Tokens.stop]
    ]

    @model.common_ngrams_from_sentence(input).should == expected
  end

  it "should have a frequency for a specified ngram" do
    subject_ngram = Ngram[:teensy, :darts]

    @model.frequency_of_ngram(subject_ngram).should == 1
  end

  it "should have a probability for a specified ngram" do
    subject_ngram = Ngram[:teensy, :darts]

    @model.probability_of_ngram(subject_ngram).should == 1.0
  end

  it "should have a frequency for specified ngrams" do
    subject_ngrams = NgramSet[
      Ngram[:but, :excess],
      Ngram[:freshly, :napalmed],
      Ngram[:sintered, :armorgel]
    ]

    @model.frequency_of_ngrams(subject_ngrams).should == 3
  end

  # The probability examples compare the printed form of the result
  # rather than the numeric value itself.
  it "should have a probability of specified ngrams" do
    subject_ngrams = NgramSet[
      Ngram[:The, :Deliverator],
      Ngram[:Deliverator, :belongs],
      Ngram[:belongs, :to]
    ]

    @model.probability_of_ngrams(subject_ngrams).to_s.should == '0.0112293144208038'
  end

  it "should have a probability for a specified fragment of text" do
    input = "The Deliverator belongs to"

    @model.fragment_probability(input).to_s.should == '0.0112293144208038'
  end

  it "should have a probability for a specified sentence" do
    input = "The Deliverator used to make software."

    @model.sentence_probability(input).to_s.should == '4.10042780102381e-07'
  end

  it "should have a probability for specified text" do
    input = "The Deliverator used to make software. Still does, sometimes."

    @model.text_probability(input).to_s.should == '2.40635434332383e-10'
  end
end
@@ -0,0 +1,8 @@
1
# Spec helper for loading the training texts used to build models.
module Training
  # Default location of the training texts: the +training+ directory next
  # to the directory that contains this file, as an absolute path.
  DEFAULT_DIR = File.expand_path(
    File.join(File.dirname(__FILE__), '..', 'training')
  )

  # Reads the training text stored as "<name>.txt".
  #
  # @param name [Symbol, String] base name of the training file, without
  #   the ".txt" extension.
  # @param dir [String] directory to read from; defaults to DEFAULT_DIR so
  #   existing one-argument callers behave as before.
  # @return [String] the full contents of the training file.
  # @raise [Errno::ENOENT] if the training file does not exist.
  def self.text_for(name, dir = DEFAULT_DIR)
    File.read(File.join(dir, "#{name}.txt"))
  end
end
data/spec/helpers.rb ADDED
@@ -0,0 +1 @@
1
+ require 'helpers/training'
@@ -0,0 +1,83 @@
1
+ require 'spec_helper'
2
+
3
# Examples shared by the model specs. They read the model under test from
# @model, which the including spec is expected to assign beforehand.
shared_examples_for "Model" do
  it "should have ngrams" do
    @model.ngrams.each { |known| @model.has_ngram?(known).should == true }
  end

  it "should be able to iterate through all ngrams" do
    @model.each_ngram { |known| @model.has_ngram?(known).should == true }
  end

  it "should be able to select ngrams with certain properties" do
    selected = @model.ngrams_with { |candidate| candidate.include?(:the) }

    # Everything that was selected must satisfy the selecting condition.
    selected.each { |match| match.include?(:the).should == true }
  end

  it "should be able to select ngrams starting with a specified gram" do
    @model.ngrams_starting_with(:filtering).each do |match|
      match.starts_with?(:filtering).should == true
    end
  end

  it "should be able to select ngrams ending with a specified gram" do
    @model.ngrams_ending_with(:sword).each do |match|
      match.ends_with?(:sword).should == true
    end
  end

  it "should be able to select ngrams including any of the specified grams" do
    @model.ngrams_including_any(:The, :Deliverator).each do |match|
      match.includes_any?(:The, :Deliverator).should == true
    end
  end

  it "should be able to select ngrams including all of the specified grams" do
    @model.ngrams_including_all(:activated, :charcoal).each do |match|
      match.includes_all?(:activated, :charcoal).should == true
    end
  end

  it "should have grams" do
    @model.grams.each { |known| @model.has_gram?(known).should == true }
  end

  it "should provide a random ngram" do
    @model.has_ngram?(@model.random_ngram).should == true
  end

  # The three generator examples below re-parse the generated output with
  # the model and check that each resulting ngram is one the model has.
  it "should generate a random sentence" do
    generated = @model.random_sentence

    @model.ngrams_from_sentence(generated).each do |piece|
      @model.has_ngram?(piece).should == true
    end
  end

  it "should generate a random paragraph" do
    generated = @model.random_paragraph

    @model.ngrams_from_paragraph(generated).each do |piece|
      @model.has_ngram?(piece).should == true
    end
  end

  it "should generate a random text" do
    generated = @model.random_text

    @model.ngrams_from_text(generated).each do |piece|
      @model.has_ngram?(piece).should == true
    end
  end
end
@@ -0,0 +1,118 @@
1
+ require 'spec_helper'
2
+
3
+ require 'raingrams/model'
4
+
5
# Specs for the text and sentence parsing of Model. Five models with an
# ngram size of 2 are built once up front: one with no further options and
# one per parsing option exercised below.
describe Model do
  before(:all) do
    @model = Model.new(:ngram_size => 2)

    @phone_number_model = Model.new(:ngram_size => 2, :ignore_phone_numbers => true)
    @references_model   = Model.new(:ngram_size => 2, :ignore_references => true)
    @case_model         = Model.new(:ngram_size => 2, :ignore_case => true)
    @punctuation_model  = Model.new(:ngram_size => 2, :ignore_punctuation => false)
  end

  it "should parse text into sentences" do
    input = "The Deliverator belongs to an elite order, a hallowed sub-category. He's got esprit up to here."
    expected = [
      "The Deliverator belongs to an elite order, a hallowed sub-category.",
      "He's got esprit up to here."
    ]

    @model.parse_text(input).should == expected
  end

  it "should parse words from a sentence" do
    input = "The Deliverator is in touch with the road, starts like a bad day, stops on a peseta."
    expected = %w[The Deliverator is in touch with the road starts like a bad day stops on a peseta]

    @model.parse_sentence(input).should == expected
  end

  # URL / URI handling with the option-less model.

  it "should ignore URLs by default while parsing a sentence" do
    input = "Click on the following link: http://www.example.com/"
    expected = %w[Click on the following link]

    @model.parse_sentence(input).should == expected
  end

  it "should ignore short URIs by default while parsing a sentence" do
    input = "Click on the following link: jabber://"
    expected = %w[Click on the following link]

    @model.parse_sentence(input).should == expected
  end

  it "should ignore complex HTTP URLs by default while parsing a sentence" do
    input = "Click on the following link: http://www.google.com/search?hl=en&client=firefox-a&rls=org.mozilla:en-US:official&hs=jU&q=ruby+datamapper&start=20&sa=N"
    expected = %w[Click on the following link]

    @model.parse_sentence(input).should == expected
  end

  # Phone number handling with :ignore_phone_numbers => true.

  it "may ignore phone numbers while parsing a sentence" do
    input = "Call me before 12, 1-888-444-2222."
    expected = %w[Call me before 12]

    @phone_number_model.parse_sentence(input).should == expected
  end

  it "may ignore long-distance phone numbers while parsing a sentence" do
    input = "Call me before 12, 1-444-2222."
    expected = %w[Call me before 12]

    @phone_number_model.parse_sentence(input).should == expected
  end

  it "may ignore short phone numbers while parsing a sentence" do
    input = "Call me before 12, 444-2222."
    expected = %w[Call me before 12]

    @phone_number_model.parse_sentence(input).should == expected
  end

  # Reference handling with :ignore_references => true.

  it "may ignore RFC style references while parsing a sentence" do
    input = "As one can see, it has failed [1]."
    expected = %w[As one can see it has failed]

    @references_model.parse_sentence(input).should == expected
  end

  it "should ignore punctuation by default while parsing a sentence" do
    input = "Oh, they used to argue over times, many corporate driver-years lost to it: homeowners, red-faced and sweaty with their own lies, stinking of Old Spice and job-related stress, standing in their glowing yellow doorways brandishing their Seikos and waving at the clock over the kitchen sink, I swear, can't you guys tell time?"
    # Hyphenated words and the apostrophe in "can't" are expected to survive.
    expected = %w[
      Oh they used to argue over times many corporate driver-years lost to it homeowners red-faced and sweaty with their own lies stinking of Old Spice and job-related stress standing in their glowing yellow doorways brandishing their Seikos and waving at the clock over the kitchen sink I swear can't you guys tell time
    ]

    @model.parse_sentence(input).should == expected
  end

  it "may ignore case while parsing a sentence" do
    input = "The Deliverator is in touch with the road, starts like a bad day, stops on a peseta."
    expected = %w[the deliverator is in touch with the road starts like a bad day stops on a peseta]

    @case_model.parse_sentence(input).should == expected
  end

  it "may preserve punctuation while parsing a sentence" do
    input = "Oh, they used to argue over times, many corporate driver-years lost to it: homeowners, red-faced and sweaty with their own lies, stinking of Old Spice and job-related stress, standing in their glowing yellow doorways brandishing their Seikos and waving at the clock over the kitchen sink, I swear, can't you guys tell time?"
    # With :ignore_punctuation => false the words keep their punctuation.
    expected = %w[Oh, they used to argue over times, many corporate driver-years lost to it: homeowners, red-faced and sweaty with their own lies, stinking of Old Spice and job-related stress, standing in their glowing yellow doorways brandishing their Seikos and waving at the clock over the kitchen sink, I swear, can't you guys tell time?]

    @punctuation_model.parse_sentence(input).should == expected
  end
end
@@ -46,9 +46,18 @@ describe NgramSet do
46
46
  ]
47
47
  end
48
48
 
49
- it "should select ngrams which includes specified grams" do
50
- @ngrams.includes(:the, :dog).should == NgramSet[
49
+ it "should select ngrams which include any of the specified grams" do
50
+ @ngrams.including_any(:the, :dog).should == NgramSet[
51
51
  Ngram[:the, :dog],
52
+ Ngram[:dog, :jumped],
53
+ Ngram[:through, :the],
54
+ Ngram[:the, :hoop]
55
+ ]
56
+ end
57
+
58
+ it "should select ngrams which include all of the specified grams" do
59
+ @ngrams.including_all(:the, :dog).should == NgramSet[
60
+ Ngram[:the, :dog]
52
61
  ]
53
62
  end
54
63
  end
data/spec/ngram_spec.rb CHANGED
@@ -24,6 +24,6 @@ describe Ngram do
24
24
  end
25
25
 
26
26
  it "should include certain grams" do
27
- @ngram.includes?(:one, :three).should == true
27
+ @ngram.includes_all?(:one, :three).should == true
28
28
  end
29
29
  end
@@ -0,0 +1,101 @@
1
+ require 'raingrams/pentagram_model'
2
+
3
+ require 'spec_helper'
4
+ require 'model_examples'
5
+
6
# Specs for PentagramModel. One model is built and trained on the
# :snowcrash training text before the examples run; all examples share it
# via @model.
describe PentagramModel do
  before(:all) do
    @model = PentagramModel.build do |pentagrams|
      pentagrams.train_with_text(Training.text_for(:snowcrash))
    end
  end

  # Pull in the examples that every model spec shares.
  it_should_behave_like "Model"

  it "should return ngrams from specified words" do
    input = %w[Why is the Deliverator so equipped]
    # One ngram per run of five consecutive input words, in input order.
    expected = [
      Ngram[:Why, :is, :the, :Deliverator, :so],
      Ngram[:is, :the, :Deliverator, :so, :equipped]
    ]

    @model.ngrams_from_words(input).should == expected
  end

  it "should return common ngrams from words" do
    input = %w[The Deliverator is a future Archetype]
    # No common ngrams are expected for these words.
    expected = []

    @model.common_ngrams_from_words(input).should == expected
  end

  it "should return common ngrams from a specified fragment of text" do
    input = "The Deliverator is a future Archetype"
    expected = []

    @model.common_ngrams_from_fragment(input).should == expected
  end

  it "should return common ngrams from a specified sentence" do
    input = "The Deliverator is a future Archetype."
    # For a whole sentence the expected list carries ngrams that are
    # wholly or partly made of start tokens, plus one made of stop tokens.
    expected = [
      Ngram[Tokens.start, Tokens.start, Tokens.start, Tokens.start, Tokens.start],
      Ngram[Tokens.start, Tokens.start, Tokens.start, Tokens.start, :The],
      Ngram[Tokens.start, Tokens.start, Tokens.start, :The, :Deliverator],
      Ngram[Tokens.start, Tokens.start, :The, :Deliverator, :is],
      Ngram[Tokens.start, :The, :Deliverator, :is, :a],
      Ngram[Tokens.stop, Tokens.stop, Tokens.stop, Tokens.stop, Tokens.stop]
    ]

    @model.common_ngrams_from_sentence(input).should == expected
  end

  it "should have a frequency for a specified ngram" do
    subject_ngram = Ngram[:it, :fires, :teensy, :darts, :that]

    @model.frequency_of_ngram(subject_ngram).should == 1
  end

  it "should have a probability for a specified ngram" do
    subject_ngram = Ngram[:it, :fires, :teensy, :darts, :that]

    @model.probability_of_ngram(subject_ngram).should == 1.0
  end

  it "should have a frequency for specified ngrams" do
    subject_ngrams = NgramSet[
      Ngram[:but, :excess, :perspiration, :wafts, :through],
      Ngram[:through, :a, :freshly, :napalmed, :forest],
      Ngram[:the, :suit, :has, :sintered, :armorgel]
    ]

    @model.frequency_of_ngrams(subject_ngrams).should == 3
  end

  # The probability examples compare the printed form of the result
  # rather than the numeric value itself.
  it "should have a probability of specified ngrams" do
    subject_ngrams = NgramSet[
      Ngram[:The, :Deliverator, :belongs, :to, :an],
      Ngram[:Deliverator, :belongs, :to, :an, :elite]
    ]

    @model.probability_of_ngrams(subject_ngrams).to_s.should == '1.0'
  end

  it "should have a probability for a specified fragment of text" do
    input = "The Deliverator belongs to an"

    @model.fragment_probability(input).to_s.should == '1.0'
  end

  it "should have a probability for a specified sentence" do
    input = "So now he has this other job."

    @model.sentence_probability(input).to_s.should == '0.00117370892018779'
  end

  it "should have a probability for specified text" do
    input = "So now he has this other job. No brightness or creativity involved-but no cooperation either."

    @model.text_probability(input).to_s.should == '2.75518525865679e-06'
  end
end
@@ -0,0 +1,106 @@
1
+ require 'raingrams/quadgram_model'
2
+
3
+ require 'spec_helper'
4
+ require 'model_examples'
5
+
6
# Specs for QuadgramModel. One model is built and trained on the
# :snowcrash training text before the examples run; all examples share it
# via @model.
describe QuadgramModel do
  before(:all) do
    @model = QuadgramModel.build do |quadgrams|
      quadgrams.train_with_text(Training.text_for(:snowcrash))
    end
  end

  # Pull in the examples that every model spec shares.
  it_should_behave_like "Model"

  it "should return ngrams from specified words" do
    input = %w[Why is the Deliverator so equipped]
    # One ngram per run of four consecutive input words, in input order.
    expected = [
      Ngram[:Why, :is, :the, :Deliverator],
      Ngram[:is, :the, :Deliverator, :so],
      Ngram[:the, :Deliverator, :so, :equipped]
    ]

    @model.ngrams_from_words(input).should == expected
  end

  it "should return common ngrams from words" do
    input = %w[The Deliverator is a future Archetype]
    # Only the leading four-word run is expected back.
    expected = [
      Ngram[:The, :Deliverator, :is, :a]
    ]

    @model.common_ngrams_from_words(input).should == expected
  end

  it "should return common ngrams from a specified fragment of text" do
    input = "The Deliverator is a future Archetype"
    expected = [
      Ngram[:The, :Deliverator, :is, :a]
    ]

    @model.common_ngrams_from_fragment(input).should == expected
  end

  it "should return common ngrams from a specified sentence" do
    input = "The Deliverator is a future Archetype."
    # For a whole sentence the expected list carries ngrams that are
    # wholly or partly made of start tokens, plus one made of stop tokens.
    expected = [
      Ngram[Tokens.start, Tokens.start, Tokens.start, Tokens.start],
      Ngram[Tokens.start, Tokens.start, Tokens.start, :The],
      Ngram[Tokens.start, Tokens.start, :The, :Deliverator],
      Ngram[Tokens.start, :The, :Deliverator, :is],
      Ngram[:The, :Deliverator, :is, :a],
      Ngram[Tokens.stop, Tokens.stop, Tokens.stop, Tokens.stop]
    ]

    @model.common_ngrams_from_sentence(input).should == expected
  end

  it "should have a frequency for a specified ngram" do
    subject_ngram = Ngram[:it, :fires, :teensy, :darts]

    @model.frequency_of_ngram(subject_ngram).should == 1
  end

  it "should have a probability for a specified ngram" do
    subject_ngram = Ngram[:it, :fires, :teensy, :darts]

    @model.probability_of_ngram(subject_ngram).should == 1.0
  end

  it "should have a frequency for specified ngrams" do
    subject_ngrams = NgramSet[
      Ngram[:but, :excess, :perspiration, :wafts],
      Ngram[:a, :freshly, :napalmed, :forest],
      Ngram[:suit, :has, :sintered, :armorgel]
    ]

    @model.frequency_of_ngrams(subject_ngrams).should == 3
  end

  # The probability examples compare the printed form of the result
  # rather than the numeric value itself.
  it "should have a probability of specified ngrams" do
    subject_ngrams = NgramSet[
      Ngram[:The, :Deliverator, :belongs, :to],
      Ngram[:Deliverator, :belongs, :to, :an]
    ]

    @model.probability_of_ngrams(subject_ngrams).to_s.should == '1.0'
  end

  it "should have a probability for a specified fragment of text" do
    input = "The Deliverator belongs to"

    @model.fragment_probability(input).to_s.should == '1.0'
  end

  it "should have a probability for a specified sentence" do
    input = "So now he has this other job."

    @model.sentence_probability(input).to_s.should == '0.00117370892018779'
  end

  it "should have a probability for specified text" do
    input = "So now he has this other job. No brightness or creativity involved-but no cooperation either."

    @model.text_probability(input).to_s.should == '2.75518525865679e-06'
  end
end
data/spec/spec_helper.rb CHANGED
@@ -3,3 +3,5 @@ gem 'rspec', '>=1.1.3'
3
3
  require 'spec'
4
4
 
5
5
  include Raingrams
6
+
7
+ require 'helpers'