raingrams 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +22 -3
- data/LICENSE.txt +1 -1
- data/Manifest.txt +13 -6
- data/README.txt +27 -25
- data/Rakefile +2 -2
- data/lib/raingrams/helpers.rb +5 -0
- data/lib/raingrams/helpers/commonality.rb +67 -0
- data/lib/raingrams/helpers/frequency.rb +43 -0
- data/lib/raingrams/helpers/probability.rb +67 -0
- data/lib/raingrams/helpers/random.rb +122 -0
- data/lib/raingrams/helpers/similarity.rb +38 -0
- data/lib/raingrams/model.rb +30 -304
- data/lib/raingrams/probability_table.rb +9 -0
- data/lib/raingrams/tokens/tokens.rb +35 -0
- data/lib/raingrams/version.rb +1 -1
- data/tasks/spec.rb +2 -0
- metadata +20 -14
data/History.txt
CHANGED
@@ -1,4 +1,23 @@
|
|
1
|
-
|
1
|
+
=== 0.1.2 / 2009-04-23
|
2
|
+
|
3
|
+
* Require nokogiri >= 1.2.0.
|
4
|
+
* No longer require hpricot.
|
5
|
+
* Added missing 'lib/raingrams/tokens/tokens.rb' file to the Manifest.
|
6
|
+
* Added Raingrams::Helpers:
|
7
|
+
* Moved text commonality calculating methods into
|
8
|
+
Raingrams::Helpers::Commonality.
|
9
|
+
* Moved text frequency calculating methods into
|
10
|
+
Raingrams::Helpers::Frequency.
|
11
|
+
* Moved text probability calculating methods into
|
12
|
+
Raingrams::Helpers::Probability.
|
13
|
+
* Moved random text generating methods into
|
14
|
+
Raingrams::Helpers::Random.
|
15
|
+
* Moved text similarity calculating methods into
|
16
|
+
Raingrams::Helpers::Similarity.
|
17
|
+
* Added Model#to_hash.
|
18
|
+
* Capitalize randomly generated sentences if case is ignored.
|
19
|
+
|
20
|
+
=== 0.1.1 / 2008-10-12
|
2
21
|
|
3
22
|
* Improved the parsing abilities of Model#parse_sentence and
|
4
23
|
Model#parse_text.
|
@@ -26,7 +45,7 @@
|
|
26
45
|
* Model#frequencies_of_ngrams.
|
27
46
|
* Model#save.
|
28
47
|
|
29
|
-
|
48
|
+
=== 0.1.0 / 2008-10-06
|
30
49
|
|
31
50
|
* Various bug fixes.
|
32
51
|
* Added NgramSet and ProbabilityTable classes.
|
@@ -35,7 +54,7 @@
|
|
35
54
|
* Added random_gram_sentence, random_sentence, random_paragraph and
|
36
55
|
random_text methods to the Model class.
|
37
56
|
|
38
|
-
|
57
|
+
=== 0.0.9 / 2008-01-09
|
39
58
|
|
40
59
|
* Initial release.
|
41
60
|
* Supports all non-zero ngram sizes.
|
data/LICENSE.txt
CHANGED
data/Manifest.txt
CHANGED
@@ -5,27 +5,33 @@ README.txt
|
|
5
5
|
TODO.txt
|
6
6
|
Rakefile
|
7
7
|
lib/raingrams.rb
|
8
|
-
lib/raingrams/version.rb
|
9
|
-
lib/raingrams/raingrams.rb
|
10
|
-
lib/raingrams/exceptions/prefix_frequency_missing.rb
|
11
8
|
lib/raingrams/exceptions.rb
|
9
|
+
lib/raingrams/exceptions/prefix_frequency_missing.rb
|
10
|
+
lib/raingrams/extensions.rb
|
12
11
|
lib/raingrams/extensions/object.rb
|
13
12
|
lib/raingrams/extensions/string.rb
|
14
|
-
lib/raingrams/
|
13
|
+
lib/raingrams/tokens.rb
|
15
14
|
lib/raingrams/tokens/token.rb
|
16
15
|
lib/raingrams/tokens/start_sentence.rb
|
17
16
|
lib/raingrams/tokens/stop_sentence.rb
|
18
17
|
lib/raingrams/tokens/unknown.rb
|
19
|
-
lib/raingrams/tokens.rb
|
18
|
+
lib/raingrams/tokens/tokens.rb
|
20
19
|
lib/raingrams/ngram.rb
|
21
20
|
lib/raingrams/ngram_set.rb
|
22
21
|
lib/raingrams/probability_table.rb
|
22
|
+
lib/raingrams/helpers.rb
|
23
|
+
lib/raingrams/helpers/frequency.rb
|
24
|
+
lib/raingrams/helpers/probability.rb
|
25
|
+
lib/raingrams/helpers/similarity.rb
|
26
|
+
lib/raingrams/helpers/commonality.rb
|
27
|
+
lib/raingrams/helpers/random.rb
|
23
28
|
lib/raingrams/model.rb
|
24
29
|
lib/raingrams/bigram_model.rb
|
25
30
|
lib/raingrams/trigram_model.rb
|
26
31
|
lib/raingrams/quadgram_model.rb
|
27
32
|
lib/raingrams/pentagram_model.rb
|
28
33
|
lib/raingrams/hexagram_model.rb
|
34
|
+
lib/raingrams/open_vocabulary.rb
|
29
35
|
lib/raingrams/open_vocabulary/open_model.rb
|
30
36
|
lib/raingrams/open_vocabulary/model.rb
|
31
37
|
lib/raingrams/open_vocabulary/bigram_model.rb
|
@@ -33,7 +39,8 @@ lib/raingrams/open_vocabulary/trigram_model.rb
|
|
33
39
|
lib/raingrams/open_vocabulary/quadgram_model.rb
|
34
40
|
lib/raingrams/open_vocabulary/pentagram_model.rb
|
35
41
|
lib/raingrams/open_vocabulary/hexagram_model.rb
|
36
|
-
lib/raingrams/
|
42
|
+
lib/raingrams/version.rb
|
43
|
+
lib/raingrams/raingrams.rb
|
37
44
|
tasks/spec.rb
|
38
45
|
spec/training/snowcrash.txt
|
39
46
|
spec/helpers/training.rb
|
data/README.txt
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
= Raingrams
|
2
2
|
|
3
3
|
* http://raingrams.rubyforge.org/
|
4
|
-
*
|
4
|
+
* http://github.com/postmodern/raingrams/
|
5
|
+
* Postmodern (postmodern.mod3 at gmail.com)
|
5
6
|
|
6
7
|
== DESCRIPTION:
|
7
8
|
|
@@ -20,7 +21,7 @@ parsing styles and open/closed vocabulary models.
|
|
20
21
|
|
21
22
|
== REQUIREMENTS:
|
22
23
|
|
23
|
-
*
|
24
|
+
* {nokogiri}[http://nokogiri.rubyforge.org/] >= 1.2.0
|
24
25
|
|
25
26
|
== INSTALL:
|
26
27
|
|
@@ -30,47 +31,48 @@ parsing styles and open/closed vocabulary models.
|
|
30
31
|
|
31
32
|
* Train a model with ycombinator comments:
|
32
33
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
34
|
+
require 'raingrams'
|
35
|
+
require 'nokogiri'
|
36
|
+
require 'open-uri'
|
37
|
+
|
38
|
+
include Raingrams
|
39
|
+
|
40
|
+
model = BigramModel.build do |model|
|
41
|
+
doc = Nokogiri::HTML(open('http://news.ycombinator.org/newcomments'))
|
42
|
+
doc.search('span.comment') do |span|
|
43
|
+
model.train_with_text(span.inner_text)
|
44
|
+
end
|
43
45
|
end
|
44
|
-
end
|
45
46
|
|
46
47
|
* Update a trained model:
|
47
48
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
49
|
+
model.train_with_text %{Interesting videos. Anders talks about
|
50
|
+
functional support on .net, concurrency, immutability. Guy Steele
|
51
|
+
talks about Fortress on JVM. Too bad they are afraid of macros
|
52
|
+
(access to AST), though Steele does say Fortress has some support.}
|
53
|
+
|
54
|
+
model.refresh
|
54
55
|
|
55
56
|
* Generate a random sentence:
|
56
57
|
|
57
|
-
|
58
|
-
|
59
|
-
|
58
|
+
model.random_sentence
|
59
|
+
# => "OTOOH if you use slicehost even offer to bash Apple makes it will
|
60
|
+
exit and its 38 month ago based configuration of little networks
|
61
|
+
created."
|
60
62
|
|
61
63
|
* Dump a model to a file, to be marshaled later:
|
62
64
|
|
63
|
-
|
65
|
+
model.save('path/for/model')
|
64
66
|
|
65
67
|
* Load a model from a file:
|
66
68
|
|
67
|
-
|
69
|
+
Model.open('path/for/model')
|
68
70
|
|
69
71
|
== LICENSE:
|
70
72
|
|
71
73
|
The MIT License
|
72
74
|
|
73
|
-
Copyright (c) 2007-
|
75
|
+
Copyright (c) 2007-2009 Hal Brodigan
|
74
76
|
|
75
77
|
Permission is hereby granted, free of charge, to any person obtaining
|
76
78
|
a copy of this software and associated documentation files (the
|
data/Rakefile
CHANGED
@@ -7,9 +7,9 @@ require './lib/raingrams/version.rb'
|
|
7
7
|
|
8
8
|
Hoe.new('raingrams', Raingrams::VERSION) do |p|
|
9
9
|
p.rubyforge_name = 'raingrams'
|
10
|
-
p.developer('Postmodern
|
10
|
+
p.developer('Postmodern', 'postmodern.mod3@gmail.com')
|
11
11
|
p.remote_rdoc_dir = 'docs'
|
12
|
-
p.extra_deps = ['
|
12
|
+
p.extra_deps = [['nokogiri', '>=1.2.0']]
|
13
13
|
end
|
14
14
|
|
15
15
|
# vim: syntax=Ruby
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'raingrams/helpers/probability'
|
2
|
+
|
3
|
+
module Raingrams
|
4
|
+
module Helpers
|
5
|
+
module Commonality
|
6
|
+
def self.included(base)
|
7
|
+
base.module_eval { include Raingrams::Helpers::Probability }
|
8
|
+
end
|
9
|
+
|
10
|
+
#
|
11
|
+
# Returns the ngrams which occur within the specified _words_ and
|
12
|
+
# within the model.
|
13
|
+
#
|
14
|
+
def common_ngrams_from_words(words)
|
15
|
+
ngrams_from_words(words).select { |ngram| has_ngram?(ngram) }
|
16
|
+
end
|
17
|
+
|
18
|
+
#
|
19
|
+
# Returns the ngrams which occur within the specified _fragment_ and
|
20
|
+
# within the model.
|
21
|
+
#
|
22
|
+
def common_ngrams_from_fragment(fragment)
|
23
|
+
ngrams_from_fragment(fragment).select { |ngram| has_ngram?(ngram) }
|
24
|
+
end
|
25
|
+
|
26
|
+
#
|
27
|
+
# Returns the ngrams which occur within the specified _sentence_ and
|
28
|
+
# within the model.
|
29
|
+
#
|
30
|
+
def common_ngrams_from_sentence(sentence)
|
31
|
+
ngrams_from_sentence(sentence).select { |ngram| has_ngram?(ngram) }
|
32
|
+
end
|
33
|
+
|
34
|
+
#
|
35
|
+
# Returns the ngrams which occur within the specified _text_ and
|
36
|
+
# within the model.
|
37
|
+
#
|
38
|
+
def common_ngrams_from_text(text)
|
39
|
+
ngrams_from_text(text).select { |ngram| has_ngram?(ngram) }
|
40
|
+
end
|
41
|
+
|
42
|
+
#
|
43
|
+
# Returns the joint probability of the common ngrams between the
|
44
|
+
# specified _fragment_ and the model.
|
45
|
+
#
|
46
|
+
def fragment_commonality(fragment)
|
47
|
+
probability_of_ngrams(common_ngrams_from_fragment(fragment))
|
48
|
+
end
|
49
|
+
|
50
|
+
#
|
51
|
+
# Returns the joint probability of the common ngrams between the
|
52
|
+
# specified _sentence_ and the model.
|
53
|
+
#
|
54
|
+
def sentence_commonality(sentence)
|
55
|
+
probability_of_ngrams(common_ngrams_from_sentence(sentence))
|
56
|
+
end
|
57
|
+
|
58
|
+
#
|
59
|
+
# Returns the joint probability of the common ngrams between the
|
60
|
+
# specified _sentence_ and the model.
|
61
|
+
#
|
62
|
+
def text_commonality(text)
|
63
|
+
probability_of_ngrams(common_ngrams_from_text(text))
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Raingrams
|
2
|
+
module Helpers
|
3
|
+
module Frequency
|
4
|
+
#
|
5
|
+
# Returns the observed frequency of the specified _ngram_ within
|
6
|
+
# the training text.
|
7
|
+
#
|
8
|
+
def frequency_of_ngram(ngram)
|
9
|
+
prefix = ngram.prefix
|
10
|
+
|
11
|
+
if @prefixes.has_key?(prefix)
|
12
|
+
return @prefixes[prefix].frequency_of(ngram.last)
|
13
|
+
else
|
14
|
+
return 0
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
#
|
19
|
+
# Returns the observed frequency of the specified _ngrams_ occurring
|
20
|
+
# within the training text.
|
21
|
+
#
|
22
|
+
def frequencies_for(ngrams)
|
23
|
+
table = {}
|
24
|
+
|
25
|
+
ngrams.each do |ngram|
|
26
|
+
table[ngram] = frequency_of_ngram(ngram)
|
27
|
+
end
|
28
|
+
|
29
|
+
return table
|
30
|
+
end
|
31
|
+
|
32
|
+
#
|
33
|
+
# Returns the total observed frequency of the specified _ngrams_
|
34
|
+
# occurring within the training text.
|
35
|
+
#
|
36
|
+
def frequency_of_ngrams(ngrams)
|
37
|
+
frequencies_for(ngrams).values.inject do |total,freq|
|
38
|
+
total + freq
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module Raingrams
|
2
|
+
module Helpers
|
3
|
+
module Probability
|
4
|
+
#
|
5
|
+
# Returns the probability of the specified _ngram_ occurring within
|
6
|
+
# arbitrary text.
|
7
|
+
#
|
8
|
+
def probability_of_ngram(ngram)
|
9
|
+
prefix = ngram.prefix
|
10
|
+
|
11
|
+
if @prefixes.has_key?(prefix)
|
12
|
+
return @prefixes[prefix].probability_of(ngram.last)
|
13
|
+
else
|
14
|
+
return 0.0
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
#
|
19
|
+
# Returns the probability of the specified _ngrams_ occurring within
|
20
|
+
# arbitrary text.
|
21
|
+
#
|
22
|
+
def probabilities_for(ngrams)
|
23
|
+
table = {}
|
24
|
+
|
25
|
+
ngrams.each do |ngram|
|
26
|
+
table[ngram] = probability_of_ngram(ngram)
|
27
|
+
end
|
28
|
+
|
29
|
+
return table
|
30
|
+
end
|
31
|
+
|
32
|
+
#
|
33
|
+
# Returns the joint probability of the specified _ngrams_ occurring
|
34
|
+
# within arbitrary text.
|
35
|
+
#
|
36
|
+
def probability_of_ngrams(ngrams)
|
37
|
+
probabilities_for(ngrams).values.inject do |joint,prob|
|
38
|
+
joint * prob
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
#
|
43
|
+
# Returns the probability of the specified _fragment_ occuring within
|
44
|
+
# arbitrary text.
|
45
|
+
#
|
46
|
+
def fragment_probability(fragment)
|
47
|
+
probability_of_ngrams(ngrams_from_fragment(fragment))
|
48
|
+
end
|
49
|
+
|
50
|
+
#
|
51
|
+
# Returns the probability of the specified _sentence_ occuring within
|
52
|
+
# arbitrary text.
|
53
|
+
#
|
54
|
+
def sentence_probability(sentence)
|
55
|
+
probability_of_ngrams(ngrams_from_sentence(sentence))
|
56
|
+
end
|
57
|
+
|
58
|
+
#
|
59
|
+
# Returns the probability of the specified _text_ occuring within
|
60
|
+
# arbitrary text.
|
61
|
+
#
|
62
|
+
def text_probability(text)
|
63
|
+
probability_of_ngrams(ngrams_from_text(text))
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
module Raingrams
|
2
|
+
module Helpers
|
3
|
+
module Random
|
4
|
+
#
|
5
|
+
# Returns a random gram from the model.
|
6
|
+
#
|
7
|
+
def random_gram
|
8
|
+
prefix = @prefixes.keys[rand(@prefixes.length)]
|
9
|
+
|
10
|
+
return prefix[rand(prefix.length)]
|
11
|
+
end
|
12
|
+
|
13
|
+
#
|
14
|
+
# Returns a random ngram from the model.
|
15
|
+
#
|
16
|
+
def random_ngram
|
17
|
+
prefix_index = rand(@prefixes.length)
|
18
|
+
|
19
|
+
prefix = @prefixes.keys[prefix_index]
|
20
|
+
table = @prefixes.values[prefix_index]
|
21
|
+
|
22
|
+
gram_index = rand(table.grams.length)
|
23
|
+
|
24
|
+
return (prefix + table.grams[gram_index])
|
25
|
+
end
|
26
|
+
|
27
|
+
#
|
28
|
+
# Returns a randomly generated sentence of grams using the given
|
29
|
+
# _options_.
|
30
|
+
#
|
31
|
+
def random_gram_sentence(options={})
|
32
|
+
grams = []
|
33
|
+
last_ngram = @starting_ngram
|
34
|
+
|
35
|
+
loop do
|
36
|
+
next_ngrams = ngrams_prefixed_by(last_ngram.postfix).to_a
|
37
|
+
last_ngram = next_ngrams[rand(next_ngrams.length)]
|
38
|
+
|
39
|
+
if last_ngram.nil?
|
40
|
+
return []
|
41
|
+
else
|
42
|
+
last_gram = last_ngram.last
|
43
|
+
|
44
|
+
break if last_gram == Tokens.stop
|
45
|
+
|
46
|
+
grams << last_gram
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
return grams
|
51
|
+
end
|
52
|
+
|
53
|
+
#
|
54
|
+
# Returns a randomly generated sentence of text using the given
|
55
|
+
# _options_.
|
56
|
+
#
|
57
|
+
def random_sentence(options={})
|
58
|
+
grams = random_gram_sentence(options)
|
59
|
+
sentence = grams.delete_if { |gram|
|
60
|
+
gram == Tokens.start || gram == Tokens.stop
|
61
|
+
}.join(' ')
|
62
|
+
|
63
|
+
if @ignore_case
|
64
|
+
sentence.capitalize!
|
65
|
+
end
|
66
|
+
|
67
|
+
if @ignore_punctuation
|
68
|
+
sentence << '.'
|
69
|
+
end
|
70
|
+
|
71
|
+
return sentence
|
72
|
+
end
|
73
|
+
|
74
|
+
#
|
75
|
+
# Returns a randomly generated paragraph of text using the given
|
76
|
+
# _options_.
|
77
|
+
#
|
78
|
+
# _options_ may contain the following keys:
|
79
|
+
# <tt>:min_sentences</tt>:: Minimum number of sentences in the
|
80
|
+
# paragraph. Defaults to 3.
|
81
|
+
# <tt>:max_sentences</tt>:: Maximum number of sentences in the
|
82
|
+
# paragraph. Defaults to 6.
|
83
|
+
#
|
84
|
+
def random_paragraph(options={})
|
85
|
+
min_sentences = (options[:min_sentences] || 3)
|
86
|
+
max_sentences = (options[:max_sentences] || 6)
|
87
|
+
sentences = []
|
88
|
+
|
89
|
+
(rand(max_sentences - min_sentences) + min_sentences).times do
|
90
|
+
sentences << random_sentence(options)
|
91
|
+
end
|
92
|
+
|
93
|
+
return sentences.join(' ')
|
94
|
+
end
|
95
|
+
|
96
|
+
#
|
97
|
+
# Returns randomly generated text using the given _options_.
|
98
|
+
#
|
99
|
+
# _options_ may contain the following keys:
|
100
|
+
# <tt>:min_sentences</tt>:: Minimum number of sentences in the
|
101
|
+
# paragraph. Defaults to 3.
|
102
|
+
# <tt>:max_sentences</tt>:: Maximum number of sentences in the
|
103
|
+
# paragraph. Defaults to 6.
|
104
|
+
# <tt>:min_paragraphs</tt>:: Minimum number of paragraphs in the text.
|
105
|
+
# Defaults to 3.
|
106
|
+
# <tt>:max_paragraphs</tt>:: Maximum number of paragraphs in the text.
|
107
|
+
# Defaults to 5.
|
108
|
+
#
|
109
|
+
def random_text(options={})
|
110
|
+
min_paragraphs = (options[:min_paragraphs] || 3)
|
111
|
+
max_paragraphs = (options[:max_paragraphs] || 6)
|
112
|
+
paragraphs = []
|
113
|
+
|
114
|
+
(rand(max_paragraphs - min_paragraphs) + min_paragraphs).times do
|
115
|
+
paragraphs << random_paragraph(options)
|
116
|
+
end
|
117
|
+
|
118
|
+
return paragraphs.join("\n\n")
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'raingrams/helpers/commonality'
|
2
|
+
|
3
|
+
module Raingrams
|
4
|
+
module Helpers
|
5
|
+
module Similarity
|
6
|
+
def self.included(base)
|
7
|
+
base.module_eval { include Raingrams::Helpers::Commonality }
|
8
|
+
end
|
9
|
+
|
10
|
+
#
|
11
|
+
# Returns the conditional probability of the commonality of the
|
12
|
+
# specified _fragment_ against the _other_model_, given the
|
13
|
+
# commonality of the _fragment_ against the model.
|
14
|
+
#
|
15
|
+
def fragment_similarity(fragment,other_model)
|
16
|
+
other_model.fragment_commonality(fragment) / fragment_commonality(fragment)
|
17
|
+
end
|
18
|
+
|
19
|
+
#
|
20
|
+
# Returns the conditional probability of the commonality of the
|
21
|
+
# specified _sentence_ against the _other_model_, given the
|
22
|
+
# commonality of the _sentence_ against the model.
|
23
|
+
#
|
24
|
+
def sentence_similarity(sentence,other_model)
|
25
|
+
other_model.sentence_commonality(sentence) / sentence_commonality(sentence)
|
26
|
+
end
|
27
|
+
|
28
|
+
#
|
29
|
+
# Returns the conditional probability of the commonality of the
|
30
|
+
# specified _text_ against the _other_model_, given the commonality
|
31
|
+
# of the _text_ against the model.
|
32
|
+
#
|
33
|
+
def text_similarity(text,other_model)
|
34
|
+
other_model.text_commonality(text) / text_commonality(text)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
data/lib/raingrams/model.rb
CHANGED
@@ -1,15 +1,22 @@
|
|
1
1
|
require 'raingrams/ngram'
|
2
2
|
require 'raingrams/ngram_set'
|
3
|
-
require 'raingrams/probability_table'
|
4
3
|
require 'raingrams/tokens'
|
4
|
+
require 'raingrams/probability_table'
|
5
|
+
require 'raingrams/helpers'
|
5
6
|
|
6
7
|
require 'set'
|
7
|
-
require '
|
8
|
+
require 'nokogiri'
|
8
9
|
require 'open-uri'
|
9
10
|
|
10
11
|
module Raingrams
|
11
12
|
class Model
|
12
13
|
|
14
|
+
include Helpers::Frequency
|
15
|
+
include Helpers::Probability
|
16
|
+
include Helpers::Similarity
|
17
|
+
include Helpers::Commonality
|
18
|
+
include Helpers::Random
|
19
|
+
|
13
20
|
# Size of ngrams to use
|
14
21
|
attr_reader :ngram_size
|
15
22
|
|
@@ -161,8 +168,12 @@ module Raingrams
|
|
161
168
|
sentence.gsub!(/[\.\?!]*$/,'')
|
162
169
|
end
|
163
170
|
|
171
|
+
if @ignore_case
|
172
|
+
# downcase the sentence
|
173
|
+
sentence.downcase!
|
174
|
+
end
|
175
|
+
|
164
176
|
if @ignore_urls
|
165
|
-
# remove URLs
|
166
177
|
sentence.gsub!(/\s*\w+:\/\/[\w\/\+_\-,:%\d\.\-\?&=]*\s*/,' ')
|
167
178
|
end
|
168
179
|
|
@@ -176,11 +187,6 @@ module Raingrams
|
|
176
187
|
sentence.gsub!(/\s*[\(\{\[]\d+[\)\}\]]\s*/,' ')
|
177
188
|
end
|
178
189
|
|
179
|
-
if @ignore_case
|
180
|
-
# downcase the sentence
|
181
|
-
sentence.downcase!
|
182
|
-
end
|
183
|
-
|
184
190
|
if @ignore_punctuation
|
185
191
|
# split and ignore punctuation characters
|
186
192
|
return sentence.scan(/\w+[\-_\.:']\w+|\w+/)
|
@@ -194,7 +200,13 @@ module Raingrams
|
|
194
200
|
# Parses the specified _text_ and returns an Array of sentences.
|
195
201
|
#
|
196
202
|
def parse_text(text)
|
197
|
-
text.to_s
|
203
|
+
text = text.to_s
|
204
|
+
|
205
|
+
if @ignore_urls
|
206
|
+
text.gsub!(/\s*\w+:\/\/[\w\/\+_\-,:%\d\.\-\?&=]*\s*/,' ')
|
207
|
+
end
|
208
|
+
|
209
|
+
return text.scan(/[^\s\.\?!][^\.\?!]*[\.\?\!]/)
|
198
210
|
end
|
199
211
|
|
200
212
|
#
|
@@ -460,38 +472,6 @@ module Raingrams
|
|
460
472
|
return gram_set
|
461
473
|
end
|
462
474
|
|
463
|
-
#
|
464
|
-
# Returns the ngrams which occur within the specified _words_ and
|
465
|
-
# within the model.
|
466
|
-
#
|
467
|
-
def common_ngrams_from_words(words)
|
468
|
-
ngrams_from_words(words).select { |ngram| has_ngram?(ngram) }
|
469
|
-
end
|
470
|
-
|
471
|
-
#
|
472
|
-
# Returns the ngrams which occur within the specified _fragment_ and
|
473
|
-
# within the model.
|
474
|
-
#
|
475
|
-
def common_ngrams_from_fragment(fragment)
|
476
|
-
ngrams_from_fragment(fragment).select { |ngram| has_ngram?(ngram) }
|
477
|
-
end
|
478
|
-
|
479
|
-
#
|
480
|
-
# Returns the ngrams which occur within the specified _sentence_ and
|
481
|
-
# within the model.
|
482
|
-
#
|
483
|
-
def common_ngrams_from_sentence(sentence)
|
484
|
-
ngrams_from_sentence(sentence).select { |ngram| has_ngram?(ngram) }
|
485
|
-
end
|
486
|
-
|
487
|
-
#
|
488
|
-
# Returns the ngrams which occur within the specified _text_ and
|
489
|
-
# within the model.
|
490
|
-
#
|
491
|
-
def common_ngrams_from_text(text)
|
492
|
-
ngrams_from_text(text).select { |ngram| has_ngram?(ngram) }
|
493
|
-
end
|
494
|
-
|
495
475
|
#
|
496
476
|
# Sets the frequency of the specified _ngram_ to the specified _value_.
|
497
477
|
#
|
@@ -524,7 +504,7 @@ module Raingrams
|
|
524
504
|
# Train the model with the specified _paragraphs_.
|
525
505
|
#
|
526
506
|
def train_with_paragraph(paragraph)
|
527
|
-
train_with_ngrams(ngrams_from_paragraph(
|
507
|
+
train_with_ngrams(ngrams_from_paragraph(paragraph))
|
528
508
|
end
|
529
509
|
|
530
510
|
#
|
@@ -546,274 +526,13 @@ module Raingrams
|
|
546
526
|
# specified _url_.
|
547
527
|
#
|
548
528
|
def train_with_url(url)
|
549
|
-
doc =
|
529
|
+
doc = Nokogiri::HTML(open(url))
|
550
530
|
|
551
531
|
return doc.search('p').map do |p|
|
552
532
|
train_with_paragraph(p.inner_text)
|
553
533
|
end
|
554
534
|
end
|
555
535
|
|
556
|
-
#
|
557
|
-
# Returns the observed frequency of the specified _ngram_ within
|
558
|
-
# the training text.
|
559
|
-
#
|
560
|
-
def frequency_of_ngram(ngram)
|
561
|
-
prefix = ngram.prefix
|
562
|
-
|
563
|
-
if @prefixes.has_key?(prefix)
|
564
|
-
return @prefixes[prefix].frequency_of(ngram.last)
|
565
|
-
else
|
566
|
-
return 0
|
567
|
-
end
|
568
|
-
end
|
569
|
-
|
570
|
-
#
|
571
|
-
# Returns the probability of the specified _ngram_ occurring within
|
572
|
-
# arbitrary text.
|
573
|
-
#
|
574
|
-
def probability_of_ngram(ngram)
|
575
|
-
prefix = ngram.prefix
|
576
|
-
|
577
|
-
if @prefixes.has_key?(prefix)
|
578
|
-
return @prefixes[prefix].probability_of(ngram.last)
|
579
|
-
else
|
580
|
-
return 0.0
|
581
|
-
end
|
582
|
-
end
|
583
|
-
|
584
|
-
#
|
585
|
-
# Returns the observed frequency of the specified _ngrams_ occurring
|
586
|
-
# within the training text.
|
587
|
-
#
|
588
|
-
def frequencies_for(ngrams)
|
589
|
-
table = {}
|
590
|
-
|
591
|
-
ngrams.each do |ngram|
|
592
|
-
table[ngram] = frequency_of_ngram(ngram)
|
593
|
-
end
|
594
|
-
|
595
|
-
return table
|
596
|
-
end
|
597
|
-
|
598
|
-
#
|
599
|
-
# Returns the probability of the specified _ngrams_ occurring within
|
600
|
-
# arbitrary text.
|
601
|
-
#
|
602
|
-
def probabilities_for(ngrams)
|
603
|
-
table = {}
|
604
|
-
|
605
|
-
ngrams.each do |ngram|
|
606
|
-
table[ngram] = probability_of_ngram(ngram)
|
607
|
-
end
|
608
|
-
|
609
|
-
return table
|
610
|
-
end
|
611
|
-
|
612
|
-
#
|
613
|
-
# Returns the total observed frequency of the specified _ngrams_
|
614
|
-
# occurring within the training text.
|
615
|
-
#
|
616
|
-
def frequency_of_ngrams(ngrams)
|
617
|
-
frequencies_for(ngrams).values.inject do |total,freq|
|
618
|
-
total + freq
|
619
|
-
end
|
620
|
-
end
|
621
|
-
|
622
|
-
#
|
623
|
-
# Returns the joint probability of the specified _ngrams_ occurring
|
624
|
-
# within arbitrary text.
|
625
|
-
#
|
626
|
-
def probability_of_ngrams(ngrams)
|
627
|
-
probabilities_for(ngrams).values.inject do |joint,prob|
|
628
|
-
joint * prob
|
629
|
-
end
|
630
|
-
end
|
631
|
-
|
632
|
-
#
|
633
|
-
# Returns the probability of the specified _fragment_ occuring within
|
634
|
-
# arbitrary text.
|
635
|
-
#
|
636
|
-
def fragment_probability(fragment)
|
637
|
-
probability_of_ngrams(ngrams_from_fragment(fragment))
|
638
|
-
end
|
639
|
-
|
640
|
-
#
|
641
|
-
# Returns the probability of the specified _sentence_ occuring within
|
642
|
-
# arbitrary text.
|
643
|
-
#
|
644
|
-
def sentence_probability(sentence)
|
645
|
-
probability_of_ngrams(ngrams_from_sentence(sentence))
|
646
|
-
end
|
647
|
-
|
648
|
-
#
|
649
|
-
# Returns the probability of the specified _text_ occuring within
|
650
|
-
# arbitrary text.
|
651
|
-
#
|
652
|
-
def text_probability(text)
|
653
|
-
probability_of_ngrams(ngrams_from_text(text))
|
654
|
-
end
|
655
|
-
|
656
|
-
#
|
657
|
-
# Returns the joint probability of the common ngrams between the
|
658
|
-
# specified _fragment_ and the model.
|
659
|
-
#
|
660
|
-
def fragment_commonality(fragment)
|
661
|
-
probability_of_ngrams(common_ngrams_from_fragment(fragment))
|
662
|
-
end
|
663
|
-
|
664
|
-
#
|
665
|
-
# Returns the joint probability of the common ngrams between the
|
666
|
-
# specified _sentence_ and the model.
|
667
|
-
#
|
668
|
-
def sentence_commonality(sentence)
|
669
|
-
probability_of_ngrams(common_ngrams_from_sentence(sentence))
|
670
|
-
end
|
671
|
-
|
672
|
-
#
|
673
|
-
# Returns the joint probability of the common ngrams between the
|
674
|
-
# specified _sentence_ and the model.
|
675
|
-
#
|
676
|
-
def text_commonality(text)
|
677
|
-
probability_of_ngrams(common_ngrams_from_text(text))
|
678
|
-
end
|
679
|
-
|
680
|
-
#
|
681
|
-
# Returns the conditional probability of the commonality of the
|
682
|
-
# specified _fragment_ against the _other_model_, given the commonality
|
683
|
-
# of the _fragment_ against the model.
|
684
|
-
#
|
685
|
-
def fragment_similarity(fragment,other_model)
|
686
|
-
other_model.fragment_commonality(fragment) / fragment_commonality(fragment)
|
687
|
-
end
|
688
|
-
|
689
|
-
#
|
690
|
-
# Returns the conditional probability of the commonality of the
|
691
|
-
# specified _sentence_ against the _other_model_, given the commonality
|
692
|
-
# of the _sentence_ against the model.
|
693
|
-
#
|
694
|
-
def sentence_similarity(sentence,other_model)
|
695
|
-
other_model.sentence_commonality(sentence) / sentence_commonality(sentence)
|
696
|
-
end
|
697
|
-
|
698
|
-
#
|
699
|
-
# Returns the conditional probability of the commonality of the
|
700
|
-
# specified _text_ against the _other_model_, given the commonality
|
701
|
-
# of the _text_ against the model.
|
702
|
-
#
|
703
|
-
def text_similarity(text,other_model)
|
704
|
-
other_model.text_commonality(text) / text_commonality(text)
|
705
|
-
end
|
706
|
-
|
707
|
-
#
|
708
|
-
# Returns a random gram from the model.
|
709
|
-
#
|
710
|
-
def random_gram
|
711
|
-
prefix = @prefixes.keys[rand(@prefixes.length)]
|
712
|
-
|
713
|
-
return prefix[rand(prefix.length)]
|
714
|
-
end
|
715
|
-
|
716
|
-
#
|
717
|
-
# Returns a random ngram from the model.
|
718
|
-
#
|
719
|
-
def random_ngram
|
720
|
-
prefix_index = rand(@prefixes.length)
|
721
|
-
|
722
|
-
prefix = @prefixes.keys[prefix_index]
|
723
|
-
table = @prefixes.values[prefix_index]
|
724
|
-
|
725
|
-
gram_index = rand(table.grams.length)
|
726
|
-
|
727
|
-
return (prefix + table.grams[gram_index])
|
728
|
-
end
|
729
|
-
|
730
|
-
#
|
731
|
-
# Returns a randomly generated sentence of grams using the given
|
732
|
-
# _options_.
|
733
|
-
#
|
734
|
-
def random_gram_sentence(options={})
|
735
|
-
grams = []
|
736
|
-
last_ngram = @starting_ngram
|
737
|
-
|
738
|
-
loop do
|
739
|
-
next_ngrams = ngrams_prefixed_by(last_ngram.postfix).to_a
|
740
|
-
last_ngram = next_ngrams[rand(next_ngrams.length)]
|
741
|
-
|
742
|
-
if last_ngram.nil?
|
743
|
-
return []
|
744
|
-
else
|
745
|
-
last_gram = last_ngram.last
|
746
|
-
|
747
|
-
break if last_gram == Tokens.stop
|
748
|
-
|
749
|
-
grams << last_gram
|
750
|
-
end
|
751
|
-
end
|
752
|
-
|
753
|
-
return grams
|
754
|
-
end
|
755
|
-
|
756
|
-
#
|
757
|
-
# Returns a randomly generated sentence of text using the given
|
758
|
-
# _options_.
|
759
|
-
#
|
760
|
-
def random_sentence(options={})
|
761
|
-
grams = random_gram_sentence(options)
|
762
|
-
sentence = grams.delete_if { |gram|
|
763
|
-
gram == Tokens.start || gram == Tokens.stop
|
764
|
-
}.join(' ')
|
765
|
-
|
766
|
-
sentence << '.' if @ignore_punctuation
|
767
|
-
return sentence
|
768
|
-
end
|
769
|
-
|
770
|
-
#
|
771
|
-
# Returns a randomly generated paragraph of text using the given
|
772
|
-
# _options_.
|
773
|
-
#
|
774
|
-
# _options_ may contain the following keys:
|
775
|
-
# <tt>:min_sentences</tt>:: Minimum number of sentences in the
|
776
|
-
# paragraph. Defaults to 3.
|
777
|
-
# <tt>:max_sentences</tt>:: Maximum number of sentences in the
|
778
|
-
# paragraph. Defaults to 6.
|
779
|
-
#
|
780
|
-
def random_paragraph(options={})
|
781
|
-
min_sentences = (options[:min_sentences] || 3)
|
782
|
-
max_sentences = (options[:max_sentences] || 6)
|
783
|
-
sentences = []
|
784
|
-
|
785
|
-
(rand(max_sentences - min_sentences) + min_sentences).times do
|
786
|
-
sentences << random_sentence(options)
|
787
|
-
end
|
788
|
-
|
789
|
-
return sentences.join(' ')
|
790
|
-
end
|
791
|
-
|
792
|
-
#
|
793
|
-
# Returns randomly generated text using the given _options_.
|
794
|
-
#
|
795
|
-
# _options_ may contain the following keys:
|
796
|
-
# <tt>:min_sentences</tt>:: Minimum number of sentences in the
|
797
|
-
# paragraph. Defaults to 3.
|
798
|
-
# <tt>:max_sentences</tt>:: Maximum number of sentences in the
|
799
|
-
# paragraph. Defaults to 6.
|
800
|
-
# <tt>:min_paragraphs</tt>:: Minimum number of paragraphs in the text.
|
801
|
-
# Defaults to 3.
|
802
|
-
# <tt>:max_paragraphs</tt>:: Maximum number of paragraphs in the text.
|
803
|
-
# Defaults to 5.
|
804
|
-
#
|
805
|
-
def random_text(options={})
|
806
|
-
min_paragraphs = (options[:min_paragraphs] || 3)
|
807
|
-
max_paragraphs = (options[:max_paragraphs] || 6)
|
808
|
-
paragraphs = []
|
809
|
-
|
810
|
-
(rand(max_paragraphs - min_paragraphs) + min_paragraphs).times do
|
811
|
-
paragraphs << random_paragraph(options)
|
812
|
-
end
|
813
|
-
|
814
|
-
return paragraphs.join("\n\n")
|
815
|
-
end
|
816
|
-
|
817
536
|
#
|
818
537
|
# Refreshes the probability tables of the model.
|
819
538
|
#
|
@@ -854,6 +573,13 @@ module Raingrams
|
|
854
573
|
return self
|
855
574
|
end
|
856
575
|
|
576
|
+
#
|
577
|
+
# Returns a Hash representation of the model.
|
578
|
+
#
|
579
|
+
def to_hash
|
580
|
+
@prefixes
|
581
|
+
end
|
582
|
+
|
857
583
|
protected
|
858
584
|
|
859
585
|
#
|
@@ -141,6 +141,15 @@ module Raingrams
|
|
141
141
|
return self
|
142
142
|
end
|
143
143
|
|
144
|
+
#
|
145
|
+
# Returns a Hash representation of the probability table.
|
146
|
+
#
|
147
|
+
def to_hash
|
148
|
+
build
|
149
|
+
|
150
|
+
return @probabilities
|
151
|
+
end
|
152
|
+
|
144
153
|
def inspect
|
145
154
|
if @dirty
|
146
155
|
"#<ProbabilityTable @total=#{@total} @frequencies=#{@frequencies.inspect}>"
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'raingrams/tokens/start_sentence'
|
2
|
+
require 'raingrams/tokens/stop_sentence'
|
3
|
+
require 'raingrams/tokens/unknown'
|
4
|
+
|
5
|
+
module Raingrams
|
6
|
+
module Tokens
|
7
|
+
#
|
8
|
+
# Returns all defined tokens.
|
9
|
+
#
|
10
|
+
def Tokens.all
|
11
|
+
@@raingram_tokens ||= {}
|
12
|
+
end
|
13
|
+
|
14
|
+
#
|
15
|
+
# Returns the start sentence token.
|
16
|
+
#
|
17
|
+
def Tokens.start
|
18
|
+
Tokens.all[:start] ||= StartSentence.new
|
19
|
+
end
|
20
|
+
|
21
|
+
#
|
22
|
+
# Returns the stop sentence token.
|
23
|
+
#
|
24
|
+
def Tokens.stop
|
25
|
+
Tokens.all[:stop] ||= StopSentence.new
|
26
|
+
end
|
27
|
+
|
28
|
+
#
|
29
|
+
# Returns the unknown word token.
|
30
|
+
#
|
31
|
+
def Tokens.unknown
|
32
|
+
Tokens.all[:unknown] ||= Unknown.new
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
data/lib/raingrams/version.rb
CHANGED
data/tasks/spec.rb
CHANGED
metadata
CHANGED
@@ -1,26 +1,26 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: raingrams
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
- Postmodern
|
7
|
+
- Postmodern
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2009-04-23 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
|
-
name:
|
16
|
+
name: nokogiri
|
17
17
|
type: :runtime
|
18
18
|
version_requirement:
|
19
19
|
version_requirements: !ruby/object:Gem::Requirement
|
20
20
|
requirements:
|
21
21
|
- - ">="
|
22
22
|
- !ruby/object:Gem::Version
|
23
|
-
version:
|
23
|
+
version: 1.2.0
|
24
24
|
version:
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: hoe
|
@@ -30,7 +30,7 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.
|
33
|
+
version: 1.12.2
|
34
34
|
version:
|
35
35
|
description: Raingrams is a flexible and general-purpose ngrams library written in Ruby. Raingrams supports ngram sizes greater than 1, text/non-text grams, multiple parsing styles and open/closed vocabulary models.
|
36
36
|
email:
|
@@ -45,7 +45,6 @@ extra_rdoc_files:
|
|
45
45
|
- Manifest.txt
|
46
46
|
- README.txt
|
47
47
|
- TODO.txt
|
48
|
-
- spec/training/snowcrash.txt
|
49
48
|
files:
|
50
49
|
- History.txt
|
51
50
|
- LICENSE.txt
|
@@ -54,27 +53,33 @@ files:
|
|
54
53
|
- TODO.txt
|
55
54
|
- Rakefile
|
56
55
|
- lib/raingrams.rb
|
57
|
-
- lib/raingrams/version.rb
|
58
|
-
- lib/raingrams/raingrams.rb
|
59
|
-
- lib/raingrams/exceptions/prefix_frequency_missing.rb
|
60
56
|
- lib/raingrams/exceptions.rb
|
57
|
+
- lib/raingrams/exceptions/prefix_frequency_missing.rb
|
58
|
+
- lib/raingrams/extensions.rb
|
61
59
|
- lib/raingrams/extensions/object.rb
|
62
60
|
- lib/raingrams/extensions/string.rb
|
63
|
-
- lib/raingrams/
|
61
|
+
- lib/raingrams/tokens.rb
|
64
62
|
- lib/raingrams/tokens/token.rb
|
65
63
|
- lib/raingrams/tokens/start_sentence.rb
|
66
64
|
- lib/raingrams/tokens/stop_sentence.rb
|
67
65
|
- lib/raingrams/tokens/unknown.rb
|
68
|
-
- lib/raingrams/tokens.rb
|
66
|
+
- lib/raingrams/tokens/tokens.rb
|
69
67
|
- lib/raingrams/ngram.rb
|
70
68
|
- lib/raingrams/ngram_set.rb
|
71
69
|
- lib/raingrams/probability_table.rb
|
70
|
+
- lib/raingrams/helpers.rb
|
71
|
+
- lib/raingrams/helpers/frequency.rb
|
72
|
+
- lib/raingrams/helpers/probability.rb
|
73
|
+
- lib/raingrams/helpers/similarity.rb
|
74
|
+
- lib/raingrams/helpers/commonality.rb
|
75
|
+
- lib/raingrams/helpers/random.rb
|
72
76
|
- lib/raingrams/model.rb
|
73
77
|
- lib/raingrams/bigram_model.rb
|
74
78
|
- lib/raingrams/trigram_model.rb
|
75
79
|
- lib/raingrams/quadgram_model.rb
|
76
80
|
- lib/raingrams/pentagram_model.rb
|
77
81
|
- lib/raingrams/hexagram_model.rb
|
82
|
+
- lib/raingrams/open_vocabulary.rb
|
78
83
|
- lib/raingrams/open_vocabulary/open_model.rb
|
79
84
|
- lib/raingrams/open_vocabulary/model.rb
|
80
85
|
- lib/raingrams/open_vocabulary/bigram_model.rb
|
@@ -82,7 +87,8 @@ files:
|
|
82
87
|
- lib/raingrams/open_vocabulary/quadgram_model.rb
|
83
88
|
- lib/raingrams/open_vocabulary/pentagram_model.rb
|
84
89
|
- lib/raingrams/open_vocabulary/hexagram_model.rb
|
85
|
-
- lib/raingrams/
|
90
|
+
- lib/raingrams/version.rb
|
91
|
+
- lib/raingrams/raingrams.rb
|
86
92
|
- tasks/spec.rb
|
87
93
|
- spec/training/snowcrash.txt
|
88
94
|
- spec/helpers/training.rb
|
@@ -121,7 +127,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
121
127
|
requirements: []
|
122
128
|
|
123
129
|
rubyforge_project: raingrams
|
124
|
-
rubygems_version: 1.3.
|
130
|
+
rubygems_version: 1.3.1
|
125
131
|
signing_key:
|
126
132
|
specification_version: 2
|
127
133
|
summary: Raingrams is a flexible and general-purpose ngrams library written in Ruby
|