raingrams 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +22 -3
- data/LICENSE.txt +1 -1
- data/Manifest.txt +13 -6
- data/README.txt +27 -25
- data/Rakefile +2 -2
- data/lib/raingrams/helpers.rb +5 -0
- data/lib/raingrams/helpers/commonality.rb +67 -0
- data/lib/raingrams/helpers/frequency.rb +43 -0
- data/lib/raingrams/helpers/probability.rb +67 -0
- data/lib/raingrams/helpers/random.rb +122 -0
- data/lib/raingrams/helpers/similarity.rb +38 -0
- data/lib/raingrams/model.rb +30 -304
- data/lib/raingrams/probability_table.rb +9 -0
- data/lib/raingrams/tokens/tokens.rb +35 -0
- data/lib/raingrams/version.rb +1 -1
- data/tasks/spec.rb +2 -0
- metadata +20 -14
data/History.txt
CHANGED
@@ -1,4 +1,23 @@
|
|
1
|
-
|
1
|
+
=== 0.1.2 / 2009-04-23
|
2
|
+
|
3
|
+
* Require nokogiri >= 1.2.0.
|
4
|
+
* No longer require hpricot.
|
5
|
+
* Added missing 'lib/raingrams/tokens/tokens.rb' file to the Manifest.
|
6
|
+
* Added Raingrams::Helpers:
|
7
|
+
* Moved text commonality calculating methods into
|
8
|
+
Raingrams::Helpers::Commonality.
|
9
|
+
* Moved text frequency calculating methods into
|
10
|
+
Raingrams::Helpers::Frequency.
|
11
|
+
* Moved text probability calculating methods into
|
12
|
+
Raingrams::Helpers::Probability.
|
13
|
+
* Moved random text generating methods into
|
14
|
+
Raingrams::Helpers::Random.
|
15
|
+
* Moved text similarity calculating methods into
|
16
|
+
Raingrams::Helpers::Similarity.
|
17
|
+
* Added Model#to_hash.
|
18
|
+
* Capitalize randomly generated sentences if case is ignored.
|
19
|
+
|
20
|
+
=== 0.1.1 / 2008-10-12
|
2
21
|
|
3
22
|
* Improved the parsing abilities of Model#parse_sentence and
|
4
23
|
Model#parse_text.
|
@@ -26,7 +45,7 @@
|
|
26
45
|
* Model#frequencies_of_ngrams.
|
27
46
|
* Model#save.
|
28
47
|
|
29
|
-
|
48
|
+
=== 0.1.0 / 2008-10-06
|
30
49
|
|
31
50
|
* Various bug fixes.
|
32
51
|
* Added NgramSet and ProbabilityTable classes.
|
@@ -35,7 +54,7 @@
|
|
35
54
|
* Added random_gram_sentence, random_sentence, random_paragraph and
|
36
55
|
random_text methods to the Model class.
|
37
56
|
|
38
|
-
|
57
|
+
=== 0.0.9 / 2008-01-09
|
39
58
|
|
40
59
|
* Initial release.
|
41
60
|
* Supports all non-zero ngram sizes.
|
data/LICENSE.txt
CHANGED
data/Manifest.txt
CHANGED
@@ -5,27 +5,33 @@ README.txt
|
|
5
5
|
TODO.txt
|
6
6
|
Rakefile
|
7
7
|
lib/raingrams.rb
|
8
|
-
lib/raingrams/version.rb
|
9
|
-
lib/raingrams/raingrams.rb
|
10
|
-
lib/raingrams/exceptions/prefix_frequency_missing.rb
|
11
8
|
lib/raingrams/exceptions.rb
|
9
|
+
lib/raingrams/exceptions/prefix_frequency_missing.rb
|
10
|
+
lib/raingrams/extensions.rb
|
12
11
|
lib/raingrams/extensions/object.rb
|
13
12
|
lib/raingrams/extensions/string.rb
|
14
|
-
lib/raingrams/
|
13
|
+
lib/raingrams/tokens.rb
|
15
14
|
lib/raingrams/tokens/token.rb
|
16
15
|
lib/raingrams/tokens/start_sentence.rb
|
17
16
|
lib/raingrams/tokens/stop_sentence.rb
|
18
17
|
lib/raingrams/tokens/unknown.rb
|
19
|
-
lib/raingrams/tokens.rb
|
18
|
+
lib/raingrams/tokens/tokens.rb
|
20
19
|
lib/raingrams/ngram.rb
|
21
20
|
lib/raingrams/ngram_set.rb
|
22
21
|
lib/raingrams/probability_table.rb
|
22
|
+
lib/raingrams/helpers.rb
|
23
|
+
lib/raingrams/helpers/frequency.rb
|
24
|
+
lib/raingrams/helpers/probability.rb
|
25
|
+
lib/raingrams/helpers/similarity.rb
|
26
|
+
lib/raingrams/helpers/commonality.rb
|
27
|
+
lib/raingrams/helpers/random.rb
|
23
28
|
lib/raingrams/model.rb
|
24
29
|
lib/raingrams/bigram_model.rb
|
25
30
|
lib/raingrams/trigram_model.rb
|
26
31
|
lib/raingrams/quadgram_model.rb
|
27
32
|
lib/raingrams/pentagram_model.rb
|
28
33
|
lib/raingrams/hexagram_model.rb
|
34
|
+
lib/raingrams/open_vocabulary.rb
|
29
35
|
lib/raingrams/open_vocabulary/open_model.rb
|
30
36
|
lib/raingrams/open_vocabulary/model.rb
|
31
37
|
lib/raingrams/open_vocabulary/bigram_model.rb
|
@@ -33,7 +39,8 @@ lib/raingrams/open_vocabulary/trigram_model.rb
|
|
33
39
|
lib/raingrams/open_vocabulary/quadgram_model.rb
|
34
40
|
lib/raingrams/open_vocabulary/pentagram_model.rb
|
35
41
|
lib/raingrams/open_vocabulary/hexagram_model.rb
|
36
|
-
lib/raingrams/
|
42
|
+
lib/raingrams/version.rb
|
43
|
+
lib/raingrams/raingrams.rb
|
37
44
|
tasks/spec.rb
|
38
45
|
spec/training/snowcrash.txt
|
39
46
|
spec/helpers/training.rb
|
data/README.txt
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
= Raingrams
|
2
2
|
|
3
3
|
* http://raingrams.rubyforge.org/
|
4
|
-
*
|
4
|
+
* http://github.com/postmodern/raingrams/
|
5
|
+
* Postmodern (postmodern.mod3 at gmail.com)
|
5
6
|
|
6
7
|
== DESCRIPTION:
|
7
8
|
|
@@ -20,7 +21,7 @@ parsing styles and open/closed vocabulary models.
|
|
20
21
|
|
21
22
|
== REQUIREMENTS:
|
22
23
|
|
23
|
-
*
|
24
|
+
* {nokogiri}[http://nokogiri.rubyforge.org/] >= 1.2.0
|
24
25
|
|
25
26
|
== INSTALL:
|
26
27
|
|
@@ -30,47 +31,48 @@ parsing styles and open/closed vocabulary models.
|
|
30
31
|
|
31
32
|
* Train a model with ycombinator comments:
|
32
33
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
34
|
+
require 'raingrams'
|
35
|
+
require 'nokogiri'
|
36
|
+
require 'open-uri'
|
37
|
+
|
38
|
+
include Raingrams
|
39
|
+
|
40
|
+
model = BigramModel.build do |model|
|
41
|
+
doc = Nokogiri::HTML(open('http://news.ycombinator.org/newcomments'))
|
42
|
+
doc.search('span.comment') do |span|
|
43
|
+
model.train_with_text(span.inner_text)
|
44
|
+
end
|
43
45
|
end
|
44
|
-
end
|
45
46
|
|
46
47
|
* Update a trained model:
|
47
48
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
49
|
+
model.train_with_text %{Interesting videos. Anders talks about
|
50
|
+
functional support on .net, concurrency, immutability. Guy Steele
|
51
|
+
talks about Fortress on JVM. Too bad they are afraid of macros
|
52
|
+
(access to AST), though Steele does say Fortress has some support.}
|
53
|
+
|
54
|
+
model.refresh
|
54
55
|
|
55
56
|
* Generate a random sentence:
|
56
57
|
|
57
|
-
|
58
|
-
|
59
|
-
|
58
|
+
model.random_sentence
|
59
|
+
# => "OTOOH if you use slicehost even offer to bash Apple makes it will
|
60
|
+
exit and its 38 month ago based configuration of little networks
|
61
|
+
created."
|
60
62
|
|
61
63
|
* Dump a model to a file, to be marshaled later:
|
62
64
|
|
63
|
-
|
65
|
+
model.save('path/for/model')
|
64
66
|
|
65
67
|
* Load a model from a file:
|
66
68
|
|
67
|
-
|
69
|
+
Model.open('path/for/model')
|
68
70
|
|
69
71
|
== LICENSE:
|
70
72
|
|
71
73
|
The MIT License
|
72
74
|
|
73
|
-
Copyright (c) 2007-
|
75
|
+
Copyright (c) 2007-2009 Hal Brodigan
|
74
76
|
|
75
77
|
Permission is hereby granted, free of charge, to any person obtaining
|
76
78
|
a copy of this software and associated documentation files (the
|
data/Rakefile
CHANGED
@@ -7,9 +7,9 @@ require './lib/raingrams/version.rb'
|
|
7
7
|
|
8
8
|
Hoe.new('raingrams', Raingrams::VERSION) do |p|
|
9
9
|
p.rubyforge_name = 'raingrams'
|
10
|
-
p.developer('Postmodern
|
10
|
+
p.developer('Postmodern', 'postmodern.mod3@gmail.com')
|
11
11
|
p.remote_rdoc_dir = 'docs'
|
12
|
-
p.extra_deps = ['
|
12
|
+
p.extra_deps = [['nokogiri', '>=1.2.0']]
|
13
13
|
end
|
14
14
|
|
15
15
|
# vim: syntax=Ruby
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'raingrams/helpers/probability'
|
2
|
+
|
3
|
+
module Raingrams
|
4
|
+
module Helpers
|
5
|
+
module Commonality
|
6
|
+
def self.included(base)
|
7
|
+
base.module_eval { include Raingrams::Helpers::Probability }
|
8
|
+
end
|
9
|
+
|
10
|
+
#
|
11
|
+
# Returns the ngrams which occur within the specified _words_ and
|
12
|
+
# within the model.
|
13
|
+
#
|
14
|
+
def common_ngrams_from_words(words)
|
15
|
+
ngrams_from_words(words).select { |ngram| has_ngram?(ngram) }
|
16
|
+
end
|
17
|
+
|
18
|
+
#
|
19
|
+
# Returns the ngrams which occur within the specified _fragment_ and
|
20
|
+
# within the model.
|
21
|
+
#
|
22
|
+
def common_ngrams_from_fragment(fragment)
|
23
|
+
ngrams_from_fragment(fragment).select { |ngram| has_ngram?(ngram) }
|
24
|
+
end
|
25
|
+
|
26
|
+
#
|
27
|
+
# Returns the ngrams which occur within the specified _sentence_ and
|
28
|
+
# within the model.
|
29
|
+
#
|
30
|
+
def common_ngrams_from_sentence(sentence)
|
31
|
+
ngrams_from_sentence(sentence).select { |ngram| has_ngram?(ngram) }
|
32
|
+
end
|
33
|
+
|
34
|
+
#
|
35
|
+
# Returns the ngrams which occur within the specified _text_ and
|
36
|
+
# within the model.
|
37
|
+
#
|
38
|
+
def common_ngrams_from_text(text)
|
39
|
+
ngrams_from_text(text).select { |ngram| has_ngram?(ngram) }
|
40
|
+
end
|
41
|
+
|
42
|
+
#
|
43
|
+
# Returns the joint probability of the common ngrams between the
|
44
|
+
# specified _fragment_ and the model.
|
45
|
+
#
|
46
|
+
def fragment_commonality(fragment)
|
47
|
+
probability_of_ngrams(common_ngrams_from_fragment(fragment))
|
48
|
+
end
|
49
|
+
|
50
|
+
#
|
51
|
+
# Returns the joint probability of the common ngrams between the
|
52
|
+
# specified _sentence_ and the model.
|
53
|
+
#
|
54
|
+
def sentence_commonality(sentence)
|
55
|
+
probability_of_ngrams(common_ngrams_from_sentence(sentence))
|
56
|
+
end
|
57
|
+
|
58
|
+
#
|
59
|
+
# Returns the joint probability of the common ngrams between the
|
60
|
+
# specified _text_ and the model.
|
61
|
+
#
|
62
|
+
def text_commonality(text)
|
63
|
+
probability_of_ngrams(common_ngrams_from_text(text))
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Raingrams
|
2
|
+
module Helpers
|
3
|
+
module Frequency
|
4
|
+
#
|
5
|
+
# Returns the observed frequency of the specified _ngram_ within
|
6
|
+
# the training text.
|
7
|
+
#
|
8
|
+
def frequency_of_ngram(ngram)
|
9
|
+
prefix = ngram.prefix
|
10
|
+
|
11
|
+
if @prefixes.has_key?(prefix)
|
12
|
+
return @prefixes[prefix].frequency_of(ngram.last)
|
13
|
+
else
|
14
|
+
return 0
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
#
|
19
|
+
# Returns the observed frequency of the specified _ngrams_ occurring
|
20
|
+
# within the training text.
|
21
|
+
#
|
22
|
+
def frequencies_for(ngrams)
|
23
|
+
table = {}
|
24
|
+
|
25
|
+
ngrams.each do |ngram|
|
26
|
+
table[ngram] = frequency_of_ngram(ngram)
|
27
|
+
end
|
28
|
+
|
29
|
+
return table
|
30
|
+
end
|
31
|
+
|
32
|
+
#
|
33
|
+
# Returns the total observed frequency of the specified _ngrams_
|
34
|
+
# occurring within the training text.
|
35
|
+
#
|
36
|
+
def frequency_of_ngrams(ngrams)
|
37
|
+
frequencies_for(ngrams).values.inject do |total,freq|
|
38
|
+
total + freq
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module Raingrams
|
2
|
+
module Helpers
|
3
|
+
module Probability
|
4
|
+
#
|
5
|
+
# Returns the probability of the specified _ngram_ occurring within
|
6
|
+
# arbitrary text.
|
7
|
+
#
|
8
|
+
def probability_of_ngram(ngram)
|
9
|
+
prefix = ngram.prefix
|
10
|
+
|
11
|
+
if @prefixes.has_key?(prefix)
|
12
|
+
return @prefixes[prefix].probability_of(ngram.last)
|
13
|
+
else
|
14
|
+
return 0.0
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
#
|
19
|
+
# Returns the probability of the specified _ngrams_ occurring within
|
20
|
+
# arbitrary text.
|
21
|
+
#
|
22
|
+
def probabilities_for(ngrams)
|
23
|
+
table = {}
|
24
|
+
|
25
|
+
ngrams.each do |ngram|
|
26
|
+
table[ngram] = probability_of_ngram(ngram)
|
27
|
+
end
|
28
|
+
|
29
|
+
return table
|
30
|
+
end
|
31
|
+
|
32
|
+
#
|
33
|
+
# Returns the joint probability of the specified _ngrams_ occurring
|
34
|
+
# within arbitrary text.
|
35
|
+
#
|
36
|
+
def probability_of_ngrams(ngrams)
|
37
|
+
probabilities_for(ngrams).values.inject do |joint,prob|
|
38
|
+
joint * prob
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
#
|
43
|
+
# Returns the probability of the specified _fragment_ occurring within
|
44
|
+
# arbitrary text.
|
45
|
+
#
|
46
|
+
def fragment_probability(fragment)
|
47
|
+
probability_of_ngrams(ngrams_from_fragment(fragment))
|
48
|
+
end
|
49
|
+
|
50
|
+
#
|
51
|
+
# Returns the probability of the specified _sentence_ occurring within
|
52
|
+
# arbitrary text.
|
53
|
+
#
|
54
|
+
def sentence_probability(sentence)
|
55
|
+
probability_of_ngrams(ngrams_from_sentence(sentence))
|
56
|
+
end
|
57
|
+
|
58
|
+
#
|
59
|
+
# Returns the probability of the specified _text_ occurring within
|
60
|
+
# arbitrary text.
|
61
|
+
#
|
62
|
+
def text_probability(text)
|
63
|
+
probability_of_ngrams(ngrams_from_text(text))
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
module Raingrams
|
2
|
+
module Helpers
|
3
|
+
module Random
|
4
|
+
#
|
5
|
+
# Returns a random gram from the model.
|
6
|
+
#
|
7
|
+
def random_gram
|
8
|
+
prefix = @prefixes.keys[rand(@prefixes.length)]
|
9
|
+
|
10
|
+
return prefix[rand(prefix.length)]
|
11
|
+
end
|
12
|
+
|
13
|
+
#
|
14
|
+
# Returns a random ngram from the model.
|
15
|
+
#
|
16
|
+
def random_ngram
|
17
|
+
prefix_index = rand(@prefixes.length)
|
18
|
+
|
19
|
+
prefix = @prefixes.keys[prefix_index]
|
20
|
+
table = @prefixes.values[prefix_index]
|
21
|
+
|
22
|
+
gram_index = rand(table.grams.length)
|
23
|
+
|
24
|
+
return (prefix + table.grams[gram_index])
|
25
|
+
end
|
26
|
+
|
27
|
+
#
|
28
|
+
# Returns a randomly generated sentence of grams using the given
|
29
|
+
# _options_.
|
30
|
+
#
|
31
|
+
def random_gram_sentence(options={})
|
32
|
+
grams = []
|
33
|
+
last_ngram = @starting_ngram
|
34
|
+
|
35
|
+
loop do
|
36
|
+
next_ngrams = ngrams_prefixed_by(last_ngram.postfix).to_a
|
37
|
+
last_ngram = next_ngrams[rand(next_ngrams.length)]
|
38
|
+
|
39
|
+
if last_ngram.nil?
|
40
|
+
return []
|
41
|
+
else
|
42
|
+
last_gram = last_ngram.last
|
43
|
+
|
44
|
+
break if last_gram == Tokens.stop
|
45
|
+
|
46
|
+
grams << last_gram
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
return grams
|
51
|
+
end
|
52
|
+
|
53
|
+
#
|
54
|
+
# Returns a randomly generated sentence of text using the given
|
55
|
+
# _options_.
|
56
|
+
#
|
57
|
+
def random_sentence(options={})
|
58
|
+
grams = random_gram_sentence(options)
|
59
|
+
sentence = grams.delete_if { |gram|
|
60
|
+
gram == Tokens.start || gram == Tokens.stop
|
61
|
+
}.join(' ')
|
62
|
+
|
63
|
+
if @ignore_case
|
64
|
+
sentence.capitalize!
|
65
|
+
end
|
66
|
+
|
67
|
+
if @ignore_punctuation
|
68
|
+
sentence << '.'
|
69
|
+
end
|
70
|
+
|
71
|
+
return sentence
|
72
|
+
end
|
73
|
+
|
74
|
+
#
|
75
|
+
# Returns a randomly generated paragraph of text using the given
|
76
|
+
# _options_.
|
77
|
+
#
|
78
|
+
# _options_ may contain the following keys:
|
79
|
+
# <tt>:min_sentences</tt>:: Minimum number of sentences in the
|
80
|
+
# paragraph. Defaults to 3.
|
81
|
+
# <tt>:max_sentences</tt>:: Maximum number of sentences in the
|
82
|
+
# paragraph. Defaults to 6.
|
83
|
+
#
|
84
|
+
def random_paragraph(options={})
|
85
|
+
min_sentences = (options[:min_sentences] || 3)
|
86
|
+
max_sentences = (options[:max_sentences] || 6)
|
87
|
+
sentences = []
|
88
|
+
|
89
|
+
(rand(max_sentences - min_sentences) + min_sentences).times do
|
90
|
+
sentences << random_sentence(options)
|
91
|
+
end
|
92
|
+
|
93
|
+
return sentences.join(' ')
|
94
|
+
end
|
95
|
+
|
96
|
+
#
|
97
|
+
# Returns randomly generated text using the given _options_.
|
98
|
+
#
|
99
|
+
# _options_ may contain the following keys:
|
100
|
+
# <tt>:min_sentences</tt>:: Minimum number of sentences in the
|
101
|
+
# paragraph. Defaults to 3.
|
102
|
+
# <tt>:max_sentences</tt>:: Maximum number of sentences in the
|
103
|
+
# paragraph. Defaults to 6.
|
104
|
+
# <tt>:min_paragraphs</tt>:: Minimum number of paragraphs in the text.
|
105
|
+
# Defaults to 3.
|
106
|
+
# <tt>:max_paragraphs</tt>:: Maximum number of paragraphs in the text.
|
107
|
+
# Defaults to 5.
|
108
|
+
#
|
109
|
+
def random_text(options={})
|
110
|
+
min_paragraphs = (options[:min_paragraphs] || 3)
|
111
|
+
max_paragraphs = (options[:max_paragraphs] || 6)
|
112
|
+
paragraphs = []
|
113
|
+
|
114
|
+
(rand(max_paragraphs - min_paragraphs) + min_paragraphs).times do
|
115
|
+
paragraphs << random_paragraph(options)
|
116
|
+
end
|
117
|
+
|
118
|
+
return paragraphs.join("\n\n")
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'raingrams/helpers/commonality'
|
2
|
+
|
3
|
+
module Raingrams
|
4
|
+
module Helpers
|
5
|
+
module Similarity
|
6
|
+
def self.included(base)
|
7
|
+
base.module_eval { include Raingrams::Helpers::Commonality }
|
8
|
+
end
|
9
|
+
|
10
|
+
#
|
11
|
+
# Returns the conditional probability of the commonality of the
|
12
|
+
# specified _fragment_ against the _other_model_, given the
|
13
|
+
# commonality of the _fragment_ against the model.
|
14
|
+
#
|
15
|
+
def fragment_similarity(fragment,other_model)
|
16
|
+
other_model.fragment_commonality(fragment) / fragment_commonality(fragment)
|
17
|
+
end
|
18
|
+
|
19
|
+
#
|
20
|
+
# Returns the conditional probability of the commonality of the
|
21
|
+
# specified _sentence_ against the _other_model_, given the
|
22
|
+
# commonality of the _sentence_ against the model.
|
23
|
+
#
|
24
|
+
def sentence_similarity(sentence,other_model)
|
25
|
+
other_model.sentence_commonality(sentence) / sentence_commonality(sentence)
|
26
|
+
end
|
27
|
+
|
28
|
+
#
|
29
|
+
# Returns the conditional probability of the commonality of the
|
30
|
+
# specified _text_ against the _other_model_, given the commonality
|
31
|
+
# of the _text_ against the model.
|
32
|
+
#
|
33
|
+
def text_similarity(text,other_model)
|
34
|
+
other_model.text_commonality(text) / text_commonality(text)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
data/lib/raingrams/model.rb
CHANGED
@@ -1,15 +1,22 @@
|
|
1
1
|
require 'raingrams/ngram'
|
2
2
|
require 'raingrams/ngram_set'
|
3
|
-
require 'raingrams/probability_table'
|
4
3
|
require 'raingrams/tokens'
|
4
|
+
require 'raingrams/probability_table'
|
5
|
+
require 'raingrams/helpers'
|
5
6
|
|
6
7
|
require 'set'
|
7
|
-
require '
|
8
|
+
require 'nokogiri'
|
8
9
|
require 'open-uri'
|
9
10
|
|
10
11
|
module Raingrams
|
11
12
|
class Model
|
12
13
|
|
14
|
+
include Helpers::Frequency
|
15
|
+
include Helpers::Probability
|
16
|
+
include Helpers::Similarity
|
17
|
+
include Helpers::Commonality
|
18
|
+
include Helpers::Random
|
19
|
+
|
13
20
|
# Size of ngrams to use
|
14
21
|
attr_reader :ngram_size
|
15
22
|
|
@@ -161,8 +168,12 @@ module Raingrams
|
|
161
168
|
sentence.gsub!(/[\.\?!]*$/,'')
|
162
169
|
end
|
163
170
|
|
171
|
+
if @ignore_case
|
172
|
+
# downcase the sentence
|
173
|
+
sentence.downcase!
|
174
|
+
end
|
175
|
+
|
164
176
|
if @ignore_urls
|
165
|
-
# remove URLs
|
166
177
|
sentence.gsub!(/\s*\w+:\/\/[\w\/\+_\-,:%\d\.\-\?&=]*\s*/,' ')
|
167
178
|
end
|
168
179
|
|
@@ -176,11 +187,6 @@ module Raingrams
|
|
176
187
|
sentence.gsub!(/\s*[\(\{\[]\d+[\)\}\]]\s*/,' ')
|
177
188
|
end
|
178
189
|
|
179
|
-
if @ignore_case
|
180
|
-
# downcase the sentence
|
181
|
-
sentence.downcase!
|
182
|
-
end
|
183
|
-
|
184
190
|
if @ignore_punctuation
|
185
191
|
# split and ignore punctuation characters
|
186
192
|
return sentence.scan(/\w+[\-_\.:']\w+|\w+/)
|
@@ -194,7 +200,13 @@ module Raingrams
|
|
194
200
|
# Parses the specified _text_ and returns an Array of sentences.
|
195
201
|
#
|
196
202
|
def parse_text(text)
|
197
|
-
text.to_s
|
203
|
+
text = text.to_s
|
204
|
+
|
205
|
+
if @ignore_urls
|
206
|
+
text.gsub!(/\s*\w+:\/\/[\w\/\+_\-,:%\d\.\-\?&=]*\s*/,' ')
|
207
|
+
end
|
208
|
+
|
209
|
+
return text.scan(/[^\s\.\?!][^\.\?!]*[\.\?\!]/)
|
198
210
|
end
|
199
211
|
|
200
212
|
#
|
@@ -460,38 +472,6 @@ module Raingrams
|
|
460
472
|
return gram_set
|
461
473
|
end
|
462
474
|
|
463
|
-
#
|
464
|
-
# Returns the ngrams which occur within the specified _words_ and
|
465
|
-
# within the model.
|
466
|
-
#
|
467
|
-
def common_ngrams_from_words(words)
|
468
|
-
ngrams_from_words(words).select { |ngram| has_ngram?(ngram) }
|
469
|
-
end
|
470
|
-
|
471
|
-
#
|
472
|
-
# Returns the ngrams which occur within the specified _fragment_ and
|
473
|
-
# within the model.
|
474
|
-
#
|
475
|
-
def common_ngrams_from_fragment(fragment)
|
476
|
-
ngrams_from_fragment(fragment).select { |ngram| has_ngram?(ngram) }
|
477
|
-
end
|
478
|
-
|
479
|
-
#
|
480
|
-
# Returns the ngrams which occur within the specified _sentence_ and
|
481
|
-
# within the model.
|
482
|
-
#
|
483
|
-
def common_ngrams_from_sentence(sentence)
|
484
|
-
ngrams_from_sentence(sentence).select { |ngram| has_ngram?(ngram) }
|
485
|
-
end
|
486
|
-
|
487
|
-
#
|
488
|
-
# Returns the ngrams which occur within the specified _text_ and
|
489
|
-
# within the model.
|
490
|
-
#
|
491
|
-
def common_ngrams_from_text(text)
|
492
|
-
ngrams_from_text(text).select { |ngram| has_ngram?(ngram) }
|
493
|
-
end
|
494
|
-
|
495
475
|
#
|
496
476
|
# Sets the frequency of the specified _ngram_ to the specified _value_.
|
497
477
|
#
|
@@ -524,7 +504,7 @@ module Raingrams
|
|
524
504
|
# Train the model with the specified _paragraph_.
|
525
505
|
#
|
526
506
|
def train_with_paragraph(paragraph)
|
527
|
-
train_with_ngrams(ngrams_from_paragraph(
|
507
|
+
train_with_ngrams(ngrams_from_paragraph(paragraph))
|
528
508
|
end
|
529
509
|
|
530
510
|
#
|
@@ -546,274 +526,13 @@ module Raingrams
|
|
546
526
|
# specified _url_.
|
547
527
|
#
|
548
528
|
def train_with_url(url)
|
549
|
-
doc =
|
529
|
+
doc = Nokogiri::HTML(open(url))
|
550
530
|
|
551
531
|
return doc.search('p').map do |p|
|
552
532
|
train_with_paragraph(p.inner_text)
|
553
533
|
end
|
554
534
|
end
|
555
535
|
|
556
|
-
#
|
557
|
-
# Returns the observed frequency of the specified _ngram_ within
|
558
|
-
# the training text.
|
559
|
-
#
|
560
|
-
def frequency_of_ngram(ngram)
|
561
|
-
prefix = ngram.prefix
|
562
|
-
|
563
|
-
if @prefixes.has_key?(prefix)
|
564
|
-
return @prefixes[prefix].frequency_of(ngram.last)
|
565
|
-
else
|
566
|
-
return 0
|
567
|
-
end
|
568
|
-
end
|
569
|
-
|
570
|
-
#
|
571
|
-
# Returns the probability of the specified _ngram_ occurring within
|
572
|
-
# arbitrary text.
|
573
|
-
#
|
574
|
-
def probability_of_ngram(ngram)
|
575
|
-
prefix = ngram.prefix
|
576
|
-
|
577
|
-
if @prefixes.has_key?(prefix)
|
578
|
-
return @prefixes[prefix].probability_of(ngram.last)
|
579
|
-
else
|
580
|
-
return 0.0
|
581
|
-
end
|
582
|
-
end
|
583
|
-
|
584
|
-
#
|
585
|
-
# Returns the observed frequency of the specified _ngrams_ occurring
|
586
|
-
# within the training text.
|
587
|
-
#
|
588
|
-
def frequencies_for(ngrams)
|
589
|
-
table = {}
|
590
|
-
|
591
|
-
ngrams.each do |ngram|
|
592
|
-
table[ngram] = frequency_of_ngram(ngram)
|
593
|
-
end
|
594
|
-
|
595
|
-
return table
|
596
|
-
end
|
597
|
-
|
598
|
-
#
|
599
|
-
# Returns the probability of the specified _ngrams_ occurring within
|
600
|
-
# arbitrary text.
|
601
|
-
#
|
602
|
-
def probabilities_for(ngrams)
|
603
|
-
table = {}
|
604
|
-
|
605
|
-
ngrams.each do |ngram|
|
606
|
-
table[ngram] = probability_of_ngram(ngram)
|
607
|
-
end
|
608
|
-
|
609
|
-
return table
|
610
|
-
end
|
611
|
-
|
612
|
-
#
|
613
|
-
# Returns the total observed frequency of the specified _ngrams_
|
614
|
-
# occurring within the training text.
|
615
|
-
#
|
616
|
-
def frequency_of_ngrams(ngrams)
|
617
|
-
frequencies_for(ngrams).values.inject do |total,freq|
|
618
|
-
total + freq
|
619
|
-
end
|
620
|
-
end
|
621
|
-
|
622
|
-
#
|
623
|
-
# Returns the joint probability of the specified _ngrams_ occurring
|
624
|
-
# within arbitrary text.
|
625
|
-
#
|
626
|
-
def probability_of_ngrams(ngrams)
|
627
|
-
probabilities_for(ngrams).values.inject do |joint,prob|
|
628
|
-
joint * prob
|
629
|
-
end
|
630
|
-
end
|
631
|
-
|
632
|
-
#
|
633
|
-
# Returns the probability of the specified _fragment_ occuring within
|
634
|
-
# arbitrary text.
|
635
|
-
#
|
636
|
-
def fragment_probability(fragment)
|
637
|
-
probability_of_ngrams(ngrams_from_fragment(fragment))
|
638
|
-
end
|
639
|
-
|
640
|
-
#
|
641
|
-
# Returns the probability of the specified _sentence_ occuring within
|
642
|
-
# arbitrary text.
|
643
|
-
#
|
644
|
-
def sentence_probability(sentence)
|
645
|
-
probability_of_ngrams(ngrams_from_sentence(sentence))
|
646
|
-
end
|
647
|
-
|
648
|
-
#
|
649
|
-
# Returns the probability of the specified _text_ occuring within
|
650
|
-
# arbitrary text.
|
651
|
-
#
|
652
|
-
def text_probability(text)
|
653
|
-
probability_of_ngrams(ngrams_from_text(text))
|
654
|
-
end
|
655
|
-
|
656
|
-
#
|
657
|
-
# Returns the joint probability of the common ngrams between the
|
658
|
-
# specified _fragment_ and the model.
|
659
|
-
#
|
660
|
-
def fragment_commonality(fragment)
|
661
|
-
probability_of_ngrams(common_ngrams_from_fragment(fragment))
|
662
|
-
end
|
663
|
-
|
664
|
-
#
|
665
|
-
# Returns the joint probability of the common ngrams between the
|
666
|
-
# specified _sentence_ and the model.
|
667
|
-
#
|
668
|
-
def sentence_commonality(sentence)
|
669
|
-
probability_of_ngrams(common_ngrams_from_sentence(sentence))
|
670
|
-
end
|
671
|
-
|
672
|
-
#
|
673
|
-
# Returns the joint probability of the common ngrams between the
|
674
|
-
# specified _sentence_ and the model.
|
675
|
-
#
|
676
|
-
def text_commonality(text)
|
677
|
-
probability_of_ngrams(common_ngrams_from_text(text))
|
678
|
-
end
|
679
|
-
|
680
|
-
#
|
681
|
-
# Returns the conditional probability of the commonality of the
|
682
|
-
# specified _fragment_ against the _other_model_, given the commonality
|
683
|
-
# of the _fragment_ against the model.
|
684
|
-
#
|
685
|
-
def fragment_similarity(fragment,other_model)
|
686
|
-
other_model.fragment_commonality(fragment) / fragment_commonality(fragment)
|
687
|
-
end
|
688
|
-
|
689
|
-
#
|
690
|
-
# Returns the conditional probability of the commonality of the
|
691
|
-
# specified _sentence_ against the _other_model_, given the commonality
|
692
|
-
# of the _sentence_ against the model.
|
693
|
-
#
|
694
|
-
def sentence_similarity(sentence,other_model)
|
695
|
-
other_model.sentence_commonality(sentence) / sentence_commonality(sentence)
|
696
|
-
end
|
697
|
-
|
698
|
-
#
|
699
|
-
# Returns the conditional probability of the commonality of the
|
700
|
-
# specified _text_ against the _other_model_, given the commonality
|
701
|
-
# of the _text_ against the model.
|
702
|
-
#
|
703
|
-
def text_similarity(text,other_model)
|
704
|
-
other_model.text_commonality(text) / text_commonality(text)
|
705
|
-
end
|
706
|
-
|
707
|
-
#
|
708
|
-
# Returns a random gram from the model.
|
709
|
-
#
|
710
|
-
def random_gram
|
711
|
-
prefix = @prefixes.keys[rand(@prefixes.length)]
|
712
|
-
|
713
|
-
return prefix[rand(prefix.length)]
|
714
|
-
end
|
715
|
-
|
716
|
-
#
|
717
|
-
# Returns a random ngram from the model.
|
718
|
-
#
|
719
|
-
def random_ngram
|
720
|
-
prefix_index = rand(@prefixes.length)
|
721
|
-
|
722
|
-
prefix = @prefixes.keys[prefix_index]
|
723
|
-
table = @prefixes.values[prefix_index]
|
724
|
-
|
725
|
-
gram_index = rand(table.grams.length)
|
726
|
-
|
727
|
-
return (prefix + table.grams[gram_index])
|
728
|
-
end
|
729
|
-
|
730
|
-
#
|
731
|
-
# Returns a randomly generated sentence of grams using the given
|
732
|
-
# _options_.
|
733
|
-
#
|
734
|
-
def random_gram_sentence(options={})
|
735
|
-
grams = []
|
736
|
-
last_ngram = @starting_ngram
|
737
|
-
|
738
|
-
loop do
|
739
|
-
next_ngrams = ngrams_prefixed_by(last_ngram.postfix).to_a
|
740
|
-
last_ngram = next_ngrams[rand(next_ngrams.length)]
|
741
|
-
|
742
|
-
if last_ngram.nil?
|
743
|
-
return []
|
744
|
-
else
|
745
|
-
last_gram = last_ngram.last
|
746
|
-
|
747
|
-
break if last_gram == Tokens.stop
|
748
|
-
|
749
|
-
grams << last_gram
|
750
|
-
end
|
751
|
-
end
|
752
|
-
|
753
|
-
return grams
|
754
|
-
end
|
755
|
-
|
756
|
-
#
|
757
|
-
# Returns a randomly generated sentence of text using the given
|
758
|
-
# _options_.
|
759
|
-
#
|
760
|
-
def random_sentence(options={})
|
761
|
-
grams = random_gram_sentence(options)
|
762
|
-
sentence = grams.delete_if { |gram|
|
763
|
-
gram == Tokens.start || gram == Tokens.stop
|
764
|
-
}.join(' ')
|
765
|
-
|
766
|
-
sentence << '.' if @ignore_punctuation
|
767
|
-
return sentence
|
768
|
-
end
|
769
|
-
|
770
|
-
#
|
771
|
-
# Returns a randomly generated paragraph of text using the given
|
772
|
-
# _options_.
|
773
|
-
#
|
774
|
-
# _options_ may contain the following keys:
|
775
|
-
# <tt>:min_sentences</tt>:: Minimum number of sentences in the
|
776
|
-
# paragraph. Defaults to 3.
|
777
|
-
# <tt>:max_sentences</tt>:: Maximum number of sentences in the
|
778
|
-
# paragraph. Defaults to 6.
|
779
|
-
#
|
780
|
-
def random_paragraph(options={})
|
781
|
-
min_sentences = (options[:min_sentences] || 3)
|
782
|
-
max_sentences = (options[:max_sentences] || 6)
|
783
|
-
sentences = []
|
784
|
-
|
785
|
-
(rand(max_sentences - min_sentences) + min_sentences).times do
|
786
|
-
sentences << random_sentence(options)
|
787
|
-
end
|
788
|
-
|
789
|
-
return sentences.join(' ')
|
790
|
-
end
|
791
|
-
|
792
|
-
#
|
793
|
-
# Returns randomly generated text using the given _options_.
|
794
|
-
#
|
795
|
-
# _options_ may contain the following keys:
|
796
|
-
# <tt>:min_sentences</tt>:: Minimum number of sentences in the
|
797
|
-
# paragraph. Defaults to 3.
|
798
|
-
# <tt>:max_sentences</tt>:: Maximum number of sentences in the
|
799
|
-
# paragraph. Defaults to 6.
|
800
|
-
# <tt>:min_paragraphs</tt>:: Minimum number of paragraphs in the text.
|
801
|
-
# Defaults to 3.
|
802
|
-
# <tt>:max_paragraphs</tt>:: Maximum number of paragraphs in the text.
|
803
|
-
# Defaults to 5.
|
804
|
-
#
|
805
|
-
def random_text(options={})
|
806
|
-
min_paragraphs = (options[:min_paragraphs] || 3)
|
807
|
-
max_paragraphs = (options[:max_paragraphs] || 6)
|
808
|
-
paragraphs = []
|
809
|
-
|
810
|
-
(rand(max_paragraphs - min_paragraphs) + min_paragraphs).times do
|
811
|
-
paragraphs << random_paragraph(options)
|
812
|
-
end
|
813
|
-
|
814
|
-
return paragraphs.join("\n\n")
|
815
|
-
end
|
816
|
-
|
817
536
|
#
|
818
537
|
# Refreshes the probability tables of the model.
|
819
538
|
#
|
@@ -854,6 +573,13 @@ module Raingrams
|
|
854
573
|
return self
|
855
574
|
end
|
856
575
|
|
576
|
+
#
|
577
|
+
# Returns a Hash representation of the model.
|
578
|
+
#
|
579
|
+
def to_hash
|
580
|
+
@prefixes
|
581
|
+
end
|
582
|
+
|
857
583
|
protected
|
858
584
|
|
859
585
|
#
|
@@ -141,6 +141,15 @@ module Raingrams
|
|
141
141
|
return self
|
142
142
|
end
|
143
143
|
|
144
|
+
#
|
145
|
+
# Returns a Hash representation of the probability table.
|
146
|
+
#
|
147
|
+
def to_hash
|
148
|
+
build
|
149
|
+
|
150
|
+
return @probabilities
|
151
|
+
end
|
152
|
+
|
144
153
|
def inspect
|
145
154
|
if @dirty
|
146
155
|
"#<ProbabilityTable @total=#{@total} @frequencies=#{@frequencies.inspect}>"
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'raingrams/tokens/start_sentence'
|
2
|
+
require 'raingrams/tokens/stop_sentence'
|
3
|
+
require 'raingrams/tokens/unknown'
|
4
|
+
|
5
|
+
module Raingrams
|
6
|
+
module Tokens
|
7
|
+
#
|
8
|
+
# Returns all defined tokens.
|
9
|
+
#
|
10
|
+
def Tokens.all
|
11
|
+
@@raingram_tokens ||= {}
|
12
|
+
end
|
13
|
+
|
14
|
+
#
|
15
|
+
# Returns the start sentence token.
|
16
|
+
#
|
17
|
+
def Tokens.start
|
18
|
+
Tokens.all[:start] ||= StartSentence.new
|
19
|
+
end
|
20
|
+
|
21
|
+
#
|
22
|
+
# Returns the stop sentence token.
|
23
|
+
#
|
24
|
+
def Tokens.stop
|
25
|
+
Tokens.all[:stop] ||= StopSentence.new
|
26
|
+
end
|
27
|
+
|
28
|
+
#
|
29
|
+
# Returns the unknown word token.
|
30
|
+
#
|
31
|
+
def Tokens.unknown
|
32
|
+
Tokens.all[:unknown] ||= Unknown.new
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
data/lib/raingrams/version.rb
CHANGED
data/tasks/spec.rb
CHANGED
metadata
CHANGED
@@ -1,26 +1,26 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: raingrams
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
- Postmodern
|
7
|
+
- Postmodern
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2009-04-23 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
|
-
name:
|
16
|
+
name: nokogiri
|
17
17
|
type: :runtime
|
18
18
|
version_requirement:
|
19
19
|
version_requirements: !ruby/object:Gem::Requirement
|
20
20
|
requirements:
|
21
21
|
- - ">="
|
22
22
|
- !ruby/object:Gem::Version
|
23
|
-
version:
|
23
|
+
version: 1.2.0
|
24
24
|
version:
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: hoe
|
@@ -30,7 +30,7 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.
|
33
|
+
version: 1.12.2
|
34
34
|
version:
|
35
35
|
description: Raingrams is a flexible and general-purpose ngrams library written in Ruby. Raingrams supports ngram sizes greater than 1, text/non-text grams, multiple parsing styles and open/closed vocabulary models.
|
36
36
|
email:
|
@@ -45,7 +45,6 @@ extra_rdoc_files:
|
|
45
45
|
- Manifest.txt
|
46
46
|
- README.txt
|
47
47
|
- TODO.txt
|
48
|
-
- spec/training/snowcrash.txt
|
49
48
|
files:
|
50
49
|
- History.txt
|
51
50
|
- LICENSE.txt
|
@@ -54,27 +53,33 @@ files:
|
|
54
53
|
- TODO.txt
|
55
54
|
- Rakefile
|
56
55
|
- lib/raingrams.rb
|
57
|
-
- lib/raingrams/version.rb
|
58
|
-
- lib/raingrams/raingrams.rb
|
59
|
-
- lib/raingrams/exceptions/prefix_frequency_missing.rb
|
60
56
|
- lib/raingrams/exceptions.rb
|
57
|
+
- lib/raingrams/exceptions/prefix_frequency_missing.rb
|
58
|
+
- lib/raingrams/extensions.rb
|
61
59
|
- lib/raingrams/extensions/object.rb
|
62
60
|
- lib/raingrams/extensions/string.rb
|
63
|
-
- lib/raingrams/
|
61
|
+
- lib/raingrams/tokens.rb
|
64
62
|
- lib/raingrams/tokens/token.rb
|
65
63
|
- lib/raingrams/tokens/start_sentence.rb
|
66
64
|
- lib/raingrams/tokens/stop_sentence.rb
|
67
65
|
- lib/raingrams/tokens/unknown.rb
|
68
|
-
- lib/raingrams/tokens.rb
|
66
|
+
- lib/raingrams/tokens/tokens.rb
|
69
67
|
- lib/raingrams/ngram.rb
|
70
68
|
- lib/raingrams/ngram_set.rb
|
71
69
|
- lib/raingrams/probability_table.rb
|
70
|
+
- lib/raingrams/helpers.rb
|
71
|
+
- lib/raingrams/helpers/frequency.rb
|
72
|
+
- lib/raingrams/helpers/probability.rb
|
73
|
+
- lib/raingrams/helpers/similarity.rb
|
74
|
+
- lib/raingrams/helpers/commonality.rb
|
75
|
+
- lib/raingrams/helpers/random.rb
|
72
76
|
- lib/raingrams/model.rb
|
73
77
|
- lib/raingrams/bigram_model.rb
|
74
78
|
- lib/raingrams/trigram_model.rb
|
75
79
|
- lib/raingrams/quadgram_model.rb
|
76
80
|
- lib/raingrams/pentagram_model.rb
|
77
81
|
- lib/raingrams/hexagram_model.rb
|
82
|
+
- lib/raingrams/open_vocabulary.rb
|
78
83
|
- lib/raingrams/open_vocabulary/open_model.rb
|
79
84
|
- lib/raingrams/open_vocabulary/model.rb
|
80
85
|
- lib/raingrams/open_vocabulary/bigram_model.rb
|
@@ -82,7 +87,8 @@ files:
|
|
82
87
|
- lib/raingrams/open_vocabulary/quadgram_model.rb
|
83
88
|
- lib/raingrams/open_vocabulary/pentagram_model.rb
|
84
89
|
- lib/raingrams/open_vocabulary/hexagram_model.rb
|
85
|
-
- lib/raingrams/
|
90
|
+
- lib/raingrams/version.rb
|
91
|
+
- lib/raingrams/raingrams.rb
|
86
92
|
- tasks/spec.rb
|
87
93
|
- spec/training/snowcrash.txt
|
88
94
|
- spec/helpers/training.rb
|
@@ -121,7 +127,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
121
127
|
requirements: []
|
122
128
|
|
123
129
|
rubyforge_project: raingrams
|
124
|
-
rubygems_version: 1.3.
|
130
|
+
rubygems_version: 1.3.1
|
125
131
|
signing_key:
|
126
132
|
specification_version: 2
|
127
133
|
summary: Raingrams is a flexible and general-purpose ngrams library written in Ruby
|