text-analysis-utils 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/cache-document +15 -0
- data/bin/classify-new-words +69 -0
- data/bin/find-examples-for +22 -0
- data/bin/frequency-list +21 -0
- data/bin/lookup +9 -0
- data/bin/percentage-known-of +40 -0
- data/bin/play-with-blanks +28 -0
- data/bin/play-with-examples +25 -0
- data/bin/prepare-text +9 -0
- data/bin/proximity-of-words +44 -0
- data/bin/readability-of +54 -0
- data/bin/vocabulary-coverage +16 -0
- data/lib/document-cache.rb +98 -0
- data/lib/game.rb +122 -0
- data/lib/lookup.rb +15 -0
- data/lib/text-analysis-utils.rb +4 -0
- data/lib/vocabulary-chest.rb +64 -0
- metadata +93 -0
data/bin/cache-document
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), '../lib/document-cache' )
|
4
|
+
|
5
|
+
# Returns the text to be cached.
#
# With no command-line arguments the whole of STDIN is read. Otherwise every
# file named in ARGV is read and the contents are concatenated in argument
# order, with no separator between files.
def get_text
  return STDIN.read if ARGV.empty?

  ARGV.inject("") do |combined, path|
    combined + File.open(path) { |io| io.read }
  end
end
|
14
|
+
|
15
|
+
DocumentCache.add(get_text)
|
@@ -0,0 +1,69 @@
|
|
1
|
+
#!/usr/bin/env ruby1.9.3
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'colorize'
|
5
|
+
require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
|
6
|
+
require File.join(File.dirname(__FILE__), '../lib/document-cache' )
|
7
|
+
|
8
|
+
|
9
|
+
# Returns the text whose words are to be classified.
#
# When file names remain in ARGV, each file is read and the contents are
# joined in argument order without a separator; otherwise STDIN is read to
# the end.
def get_text
  if ARGV.empty?
    STDIN.read
  else
    ARGV.map { |path| File.open(path, 'r') { |io| io.read } }.join
  end
end
|
20
|
+
|
21
|
+
# Shows +word+ in a short snippet of +text+ and asks the user whether they
# know it.
#
# word  - the word to ask about (highlighted in the snippet).
# index - zero-based position of the word in +words+, used for the progress
#         display.
# words - the full list of words being classified (only its size is used).
# text  - the text in which the word is looked up.
#
# Returns 'y', 'n' or 'skip'. 'skip' is also returned, without prompting,
# when the word cannot be located in the text, and when STDIN reaches end of
# file while waiting for an answer.
def ask(word, index, words, text)
  # Prefer a whole-word match; fall back to a plain substring search.
  location = (text =~ /\b#{Regexp.escape(word)}\b/) || text.index(word)
  if location.nil?
    puts "Skipping word: #{word}"
    return 'skip'
  end

  # Up to 15 characters of context on each side of the word.
  beginning_of_snippet = [location - 15, 0].max
  snippet = text[beginning_of_snippet, 30 + word.size].gsub(/[\r\n]/, " ")

  answer = ''
  until ['y', 'n', 'skip'].include?(answer)
    puts
    puts "------------------------------------------------------------"
    puts "...#{snippet.gsub(word, word.green)}..."
    puts
    puts "Do you know this word? [y or n or skip] (#{index + 1} of #{words.size})"
    STDOUT.write("> ")

    line = STDIN.gets
    # gets returns nil at end of input; treat that as a skip instead of
    # failing with NoMethodError on nil.
    return 'skip' if line.nil?

    answer = line.gsub(/[\r\n]/, '')
  end
  answer
end
|
43
|
+
|
44
|
+
# Reduces +textual_words+ to the words that still need to be classified.
#
# textual_words   - array of raw words from the text; it is filtered in
#                   place (known words are removed from the caller's array).
# skip_classified - when true, words already present in the vocabulary chest
#                   (known or unknown) are dropped as well. This is the "-n"
#                   command-line switch.
#
# Returns one representative word per stem, excluding stems that consist only
# of digits, dashes, whitespace, commas and dots.
def collect_words_from(textual_words, skip_classified = false)
  textual_words.reject! { |w| VocabularyChest::is_known?(w) }
  textual_words.reject! { |w| VocabularyChest::contains?(w) } if skip_classified

  # Later occurrences overwrite earlier ones, so the last spelling of each
  # stem is the one kept.
  words_by_stem = textual_words.inject({}) { |hash, w| hash[VocabularyChest::stem(w)] = w; hash }
  words_by_stem.reject! { |s, _w| (s =~ /^[-\d\s,\.]*$/) == 0 }

  words_by_stem.values.uniq
end

# Options are kept in a local variable: a class variable (@@options) accessed
# at top level raises RuntimeError on Ruby 3.0 and later.
options = ARGV.select { |arg| ["-n"].include?(arg) }
ARGV.reject! { |arg| options.include?(arg) }

text = get_text
textual_words = text.split(" ").collect { |w| w.chomp }
puts "Thanks. Please wait..."

words = collect_words_from(textual_words, options.include?("-n"))
words.each_with_index { |word, index|
  # Fall back to the word itself if no matching form could be extracted.
  match = (DocumentCache::extract_matching_words(word, text) || []).first || word
  answer = ask match, index, words, text
  VocabularyChest::add_to_known_words(word) if answer == 'y'
  VocabularyChest::add_to_unknown_words(word) if answer == 'n'
}

puts "Done."
|
@@ -0,0 +1,22 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'colorize'
|
5
|
+
require File.join(File.dirname(__FILE__), '../lib/document-cache' )
|
6
|
+
|
7
|
+
# Usage: find-examples-for [--N] <search terms...>
#
# Prints up to N (default 1) cached sentences containing the search terms,
# with the matching tokens highlighted in green. Exits with status 1 when
# nothing is found.
count = 1

# Only an argument of the exact form --<digits> is a count. The previous
# pattern /--\d*/ also swallowed arguments such as "--foo" and turned them
# into a count of 0.
count_param = ARGV.find { |a| a =~ /\A--\d+\z/ }
unless count_param.nil?
  count = count_param.sub("--", "").to_i
  ARGV.reject! { |a| a == count_param }
end

search = ARGV.join(" ")
matches = DocumentCache.find_examples_for search, count
exit(1) if matches.empty?

puts matches.map { |sentence, tokens|
  colored_sentence = sentence.dup
  # uniq: colouring the same token twice would wrap the escape codes that
  # were already inserted.
  tokens.uniq.each { |m| colored_sentence.gsub!(m, m.green) }
  colored_sentence
}.join("\n")
|
data/bin/frequency-list
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), '../lib/document-cache' )
|
4
|
+
require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
|
5
|
+
|
6
|
+
text = DocumentCache::documents.inject(""){|text, f| text+= File.open(f){|f|f.read}; text}
|
7
|
+
|
8
|
+
frequencies = text.split(" ").inject(Hash.new {|hash,key| hash[key] = []}){|hash, w| hash[VocabularyChest::stem(w)] << w; hash }
|
9
|
+
|
10
|
+
frequencies = frequencies.sort{|a,b| a[1].size <=> b[1].size}.reverse
|
11
|
+
|
12
|
+
# Prints one tab-separated line per entry of +frequencies+: the number of
# occurrences, the stem, and up to six of the occurring word forms joined by
# commas (followed by "..." when there are more than six).
#
# frequencies - enumerable of [stem, occurrences] pairs.
def output(frequencies)
  # Unbuffered, so lines appear immediately when piped to another tool.
  STDOUT.sync = true
  frequencies.each do |stem, occurrences|
    ellipsis = occurrences.size > 6 ? "..." : ""
    sample = occurrences[0, 6].join(",")
    puts "#{occurrences.size}\t#{stem}\t#{sample}#{ellipsis}"
  end
end
|
16
|
+
|
17
|
+
if ARGV[0] == "--unknown"
|
18
|
+
output frequencies.find_all{|k,v| !VocabularyChest::is_known?(v[0])}
|
19
|
+
else
|
20
|
+
output frequencies
|
21
|
+
end
|
data/bin/lookup
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
|
4
|
+
require File.join(File.dirname(__FILE__), '../lib/lookup' )
|
5
|
+
|
6
|
+
# Splits +text+ on whitespace and sorts its words into known and unknown
# according to VocabularyChest.is_known?.
#
# Returns a two-element array [known, unknown]; each element is the list of
# sanitized words with duplicates removed.
def analyse(text)
  known, unknown = text.split(" ").partition { |w| VocabularyChest.is_known?(w) }

  [known, unknown].map do |group|
    group.map { |w| VocabularyChest::sanitize(w) }.uniq
  end
end
|
12
|
+
|
13
|
+
# Prints the vocabulary report for one text.
#
# options - a two-element array [known, unknown] of word lists, as returned
#           by analyse.
#
# The report lists the unknown words, their definitions as returned by
# Lookup.go, the word counts and the percentage of words that are known.
def output(options)
  known_words, unknown_words = options
  total = known_words.size + unknown_words.size
  percentage = format('%.2f', known_words.size.to_f / total * 100)

  puts
  puts "--"
  puts "UNKNOWN WORDS: #{unknown_words.join(", ")}"
  puts
  puts "DEFINITIONS"
  puts Lookup::go(unknown_words)
  puts "--"
  puts
  puts "Total number of unknown words: #{unknown_words.size}"
  puts "Total number of known words: #{known_words.size}"
  puts "Total number of words: #{total}"
  puts "Percentage of words known: #{percentage}%"
end
|
30
|
+
|
31
|
+
if !ARGV.empty?
|
32
|
+
ARGV.each {|filename|
|
33
|
+
text = File.open(filename,'r'){|file| file.read}
|
34
|
+
puts "#{filename}:"
|
35
|
+
output(analyse(text))
|
36
|
+
}
|
37
|
+
else
|
38
|
+
text = STDIN.read
|
39
|
+
output(analyse(text))
|
40
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), '../lib/game' )
|
4
|
+
|
5
|
+
# Returns everything on STDIN when no command-line arguments are given, and
# nil otherwise.
# NOTE(review): nothing in the visible script calls this helper; it looks
# like a leftover -- confirm before relying on it.
def get_input
  return nil unless ARGV.empty?

  STDIN.read
end
|
11
|
+
|
12
|
+
(puts "Usage: #{$0} <file with words to practice> <file with examples>"; exit(1)) if ARGV.size < 2
|
13
|
+
|
14
|
+
input = File.open(ARGV.shift){|f| f.read}
|
15
|
+
words = input.split("\n").uniq
|
16
|
+
|
17
|
+
example_sentences = []
|
18
|
+
ARGV.each{|filename| example_sentences += File.open(filename).readlines}
|
19
|
+
example_sentences.map!{|s| s.chomp}
|
20
|
+
example_sentences.reject!{|s| words.find{|w| (s =~ /\b#{Regexp.escape(w)}\b/i) != nil} == nil}
|
21
|
+
words.reject!{|w| example_sentences.find{|s| (s =~ /\b#{Regexp.escape(w)}\b/i) != nil} == nil}
|
22
|
+
|
23
|
+
puts "Playing with #{example_sentences.size} sentences and #{words.size} words."
|
24
|
+
|
25
|
+
Game.new(words).play{|word|
|
26
|
+
sentence = example_sentences.shuffle.find{|s| (s =~ /\b#{Regexp.escape(word)}\b/i) != nil}
|
27
|
+
[sentence, $&]
|
28
|
+
}
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), '../lib/document-cache' )
|
4
|
+
require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
|
5
|
+
require File.join(File.dirname(__FILE__), '../lib/game' )
|
6
|
+
|
7
|
+
# Returns the list of words to practise as one string: the contents of the
# first file named on the command line, or all of STDIN when no argument is
# given. Any further arguments are ignored.
def get_input
  return STDIN.read if ARGV.empty?

  File.open(ARGV.first) { |io| io.read }
end
|
14
|
+
|
15
|
+
input = get_input
|
16
|
+
words = input.split("\n")
|
17
|
+
words.reject!{|w| STDOUT.write("."); STDOUT.flush; DocumentCache.find_examples_for(w).empty?}
|
18
|
+
puts
|
19
|
+
|
20
|
+
Game.new(words).play{ |word|
|
21
|
+
matches = DocumentCache.find_examples_for(word, 10).keys
|
22
|
+
sentence = matches.sort{|a, b| a.size <=> b.size}.first
|
23
|
+
correct_answer = DocumentCache::extract_matching_words(word, sentence).first
|
24
|
+
[sentence, correct_answer]
|
25
|
+
}
|
data/bin/prepare-text
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
text = STDIN.read
|
4
|
+
File.open("/tmp/prepared-text", 'w'){|f| f.write(text)}
|
5
|
+
exec("classify-new-words /tmp/prepared-text && \
|
6
|
+
cache-document /tmp/prepared-text && \
|
7
|
+
echo '\nREADABILITY STATISTICS' && \
|
8
|
+
readability-of /tmp/prepared-text && \
|
9
|
+
percentage-known-of /tmp/prepared-text")
|
@@ -0,0 +1,44 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'amatch'
|
5
|
+
|
6
|
+
# Returns the result of matching +w2+ against an Amatch::Levenshtein matcher
# built from +w1+.
def distance(w1, w2)
  matcher = Amatch::Levenshtein.new(w1)
  matcher.match(w2)
end
|
9
|
+
|
10
|
+
# For every distinct word of +text+, prints a tab-separated line with the
# word, the closest word found in +known_text+ and the distance between the
# two. Output is flushed after each line.
#
# text       - the new text to analyse.
# known_text - text made of words the reader already knows.
def analyse(text, known_text)
  candidates = words_of(text)
  vocabulary = words_of(known_text)

  candidates.map do |candidate|
    nearest, gap = find_closest_word(candidate, vocabulary)
    puts "#{candidate}\t#{nearest}\t#{gap}"
    STDOUT.flush
  end
end
|
20
|
+
|
21
|
+
# Finds the entry of +known_words+ with the smallest distance to +word+.
#
# word        - the word to compare against.
# known_words - array of candidate words.
#
# Returns [closest_word, distance]. When +known_words+ is empty there is
# nothing to compare with and [nil, nil] is returned (previously this case
# called distance with nil).
def find_closest_word(word, known_words)
  return [nil, nil] if known_words.empty?

  # Compute each distance exactly once instead of inside a sort comparator.
  scored = known_words.map { |candidate| [candidate, distance(candidate, word)] }
  scored.min_by { |_candidate, gap| gap }
end
|
26
|
+
|
27
|
+
# Returns the distinct whitespace-separated words of +text+, in order of
# first appearance.
def words_of(text)
  text.split(" ").uniq
end
|
30
|
+
|
31
|
+
if ARGV.size < 2
|
32
|
+
puts "usage: ./script <new text> <known text>"
|
33
|
+
exit 1
|
34
|
+
end
|
35
|
+
|
36
|
+
filename = ARGV.shift
|
37
|
+
text = File.open(filename,'r'){|file| file.read}
|
38
|
+
|
39
|
+
known_text ||= ""
|
40
|
+
ARGV.each {|filename|
|
41
|
+
known_text += File.open(filename,'r'){|file| file.read}
|
42
|
+
}
|
43
|
+
|
44
|
+
analyse text, known_text
|
data/bin/readability-of
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Computes readability statistics for +text+.
#
# Words are the whitespace-separated tokens; sentences are the non-blank
# pieces between ".", "?" and "!". The syllable count is the number of vowel
# groups reported by vowels, scaled by 0.9 to allow for silent vowels. :ms is
# the percentage of words with three or more vowel groups.
#
# Returns a hash with the keys :words, :sentences, :syllables, :ms,
# :wiener_sachtextformel and :grade_level (all floats).
def analyse(text)
  tokens = text.split(" ")
  groups_per_word = tokens.map { |token| vowels(token).size }

  word_count = tokens.size.to_f
  sentence_count = text.split(/\.|\?|!/).reject { |s| s.strip.empty? }.size.to_f
  syllable_count = groups_per_word.inject(0) { |sum, n| sum + n }.to_f * 0.9 # for silent vowels
  long_word_count = groups_per_word.select { |n| n >= 3 }.size
  ms = long_word_count.to_f / tokens.size.to_f * 100

  {
    :words => word_count,
    :sentences => sentence_count,
    :syllables => syllable_count,
    :ms => ms,
    :wiener_sachtextformel => wiener_sachtextformel(sentence_count, word_count, ms),
    :grade_level => grade(sentence_count, word_count, syllable_count)
  }
end
|
18
|
+
|
19
|
+
# Returns 0.2656 * (words per sentence) + 0.2744 * ms - 1.693.
#
# sentences - number of sentences.
# words     - number of words.
# ms        - percentage of words with three or more syllables.
def wiener_sachtextformel(sentences, words, ms)
  words_per_sentence = words / sentences
  0.2656 * words_per_sentence + 0.2744 * ms - 1.693
end
|
22
|
+
|
23
|
+
# Returns 0.39 * (words per sentence) + 11.8 * (syllables per word) - 15.59,
# which the report labels "Flesch-Kincaid Grade Level".
#
# sentences - number of sentences.
# words     - number of words.
# syllables - number of syllables.
def grade(sentences, words, syllables)
  words_per_sentence = words / sentences
  syllables_per_word = syllables / words
  0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59
end
|
26
|
+
|
27
|
+
# Splits +w+ at every consonant (case-insensitive) and returns the non-empty
# pieces that remain, i.e. the runs of characters between consonants. Callers
# use the number of pieces as an approximate syllable count.
#
# "x" is now treated as a consonant; it was missing from the list, so a word
# like "taxi" produced one piece ("axi") instead of two. "y" is deliberately
# still not in the list and therefore keeps counting as a vowel. Characters
# that are not letters (digits, punctuation) are not consonants either and
# remain part of the returned pieces.
def vowels(w)
  w.split(/[bcdfghjklmnpqrstvwxz]/i).reject { |piece| piece.empty? }
end
|
30
|
+
|
31
|
+
# Prints the readability report for one text.
#
# options - hash with the keys :sentences, :words, :syllables,
#           :wiener_sachtextformel and :grade_level (numeric), plus an
#           optional :source that is printed as a heading when present.
def output(options)
  puts
  puts "#{options[:source]}:" unless options[:source].nil?
  puts "Number of sentences: #{options[:sentences]}"
  puts "Number of words: #{options[:words]}"
  # Label corrected: it used to read "syllabes".
  puts "Number of syllables: #{options[:syllables]}"
  puts "Average number of syllables per word: #{ '%.2f' % (options[:syllables] / options[:words])}"
  puts "Average number of words per sentence: #{'%.2f' % (options[:words] / options[:sentences])}"
  puts "Wiener Sachtextformel: #{'%.2f' % options[:wiener_sachtextformel]}"
  puts "Flesch-Kincaid Grade Level: #{'%.2f' % options[:grade_level]}"
  puts
end
|
45
|
+
|
46
|
+
if !ARGV.empty?
|
47
|
+
ARGV.each {|filename|
|
48
|
+
text = File.open(filename,'r'){|file| file.read}
|
49
|
+
output(analyse(text).merge(:source => filename))
|
50
|
+
}
|
51
|
+
else
|
52
|
+
text = STDIN.read
|
53
|
+
output(analyse(text))
|
54
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
command =<<EOF
|
4
|
+
|
5
|
+
total_occurrences=`expr $(frequency-list | awk '{print $1}' | xargs | sed 's/ / + /g')`
|
6
|
+
unknown_occurrences=`expr $(frequency-list --unknown | awk '{print $1}' | xargs | sed 's/ / + /g')`
|
7
|
+
next_500_occurrences=`expr $(frequency-list --unknown | awk '{print $1}' | head -500 | xargs | sed 's/ / + /g')`
|
8
|
+
|
9
|
+
echo Total occurrences: $total_occurrences
|
10
|
+
echo Unknown occurrences: $unknown_occurrences
|
11
|
+
echo Your current vocabulary knowledge covers $(echo "scale=2;($total_occurrences - $unknown_occurrences) / $total_occurrences * 100" | bc -q)% of all occurrences
|
12
|
+
echo The next 500 words will bring your cover to $(echo "scale=2;($total_occurrences - $unknown_occurrences + $next_500_occurrences) / $total_occurrences * 100" | bc -q)%
|
13
|
+
|
14
|
+
EOF
|
15
|
+
|
16
|
+
system command
|
@@ -0,0 +1,98 @@
|
|
1
|
+
require 'fileutils.rb'
|
2
|
+
require 'rubygems'
|
3
|
+
require 'uuid'
|
4
|
+
|
5
|
+
require File.join(File.dirname(__FILE__), 'vocabulary-chest' )
|
6
|
+
|
7
|
+
CACHE_DIR = "#{ROOT_DIR}/docs"
|
8
|
+
|
9
|
+
FileUtils::mkdir_p(ROOT_DIR)
|
10
|
+
FileUtils::mkdir_p(CACHE_DIR)
|
11
|
+
|
12
|
+
# A cache of plain-text documents stored as files under CACHE_DIR, with
# helpers to search them for example sentences and to build word frequency
# lists.
module DocumentCache
  # Stores +search+ (the document text) in a new file with a generated UUID
  # as its name.
  def self.add(search)
    filename = "#{CACHE_DIR}/#{UUID.new.generate}"
    File.open(filename, 'w') { |f| f.write(search) }
  end

  # Finds the sentences containing a word whose stem equals the stem of
  # +search+.
  #
  # Returns a hash mapping each cleaned sentence to the sanitized matching
  # words found in it.
  def self.find_matches_by_stemming(search, sentences)
    token = VocabularyChest::stem(search)
    sentences.inject({}) { |hash, s|
      found = s.split(" ").select { |w| VocabularyChest::stem(w) == token }
      hash[clean(s)] = found.map { |f| VocabularyChest::sanitize(f) } unless found.empty?
      hash
    }
  end

  # Finds the sentences that contain +search+ as a plain substring.
  #
  # Returns a hash mapping each cleaned sentence to [search].
  def self.find_matches_by_grepping(search, sentences)
    sentences.inject({}) { |hash, s|
      hash[clean(s)] = [search] if s.include? search
      hash
    }
  end

  # Searches +filenames+ for up to +count+ sentences matching +search+,
  # trying stem matching on every file first and substring matching second.
  #
  # Returns a hash of cleaned sentence => matching words with at most +count+
  # entries; the search stops as soon as +count+ entries have been collected.
  def self.find_matches_in(filenames, search, count)
    matches = {}

    [:find_matches_by_stemming, :find_matches_by_grepping].each { |matcher|
      filenames.each { |filename|
        File.open(filename) { |file|
          sentences = file.read.split(/[\.?!\n]/)
          matches.merge!(self.send(matcher, search, sentences))

          # Drop the earliest entries until no more than +count+ remain.
          matches.shift while matches.size > count
          return matches if matches.size == count
        }
      }
    }

    matches
  end

  # Returns the paths of all cached documents.
  def self.documents
    Dir["#{CACHE_DIR}/*"]
  end

  # Returns up to +count+ example sentences for +search+ from the cache, as
  # a hash of sentence => matching words.
  def self.find_examples_for(search, count = 1)
    find_matches_in documents, search, count
  end

  # Strips surrounding whitespace from +sentence+ and appends a full stop.
  def self.clean(sentence)
    sentence.strip + "."
  end

  # Returns the words of +sentence+ that match +search+, by stem if possible
  # and by substring otherwise. Returns an empty array when nothing matches
  # (this used to be nil, which broke callers that call .first on the
  # result).
  def self.extract_matching_words(search, sentence)
    matches = find_matches_by_stemming(search, [sentence])
    matches = find_matches_by_grepping(search, [sentence]) if matches.empty?
    matches.values.first || []
  end

  # Returns [word, count] pairs for every word occurring at least twice in
  # the cached documents, most frequent first.
  def self.frequency_list
    counts = corpus_text.split(" ").inject(Hash.new(0)) { |h, w| h[w] += 1; h }
    counts.reject! { |_word, count| count < 2 }
    counts.sort_by { |_k, v| v }.reverse
  end

  # Returns [stem, count] pairs for every stem occurring at least twice in
  # the cached documents, most frequent first.
  def self.stemmed_frequency_list
    stems = corpus_text.split(" ").map { |w| VocabularyChest::stem(w) }
    counts = stems.inject(Hash.new(0)) { |h, stem| h[stem] += 1; h }
    counts.reject! { |_stem, count| count < 2 }
    counts.sort_by { |_k, v| v }.reverse
  end

  # Returns the contents of all cached documents as one string. File.read
  # closes each file (File.open(f).read left the handles open), and the
  # newline separator keeps the last word of one document from fusing with
  # the first word of the next.
  def self.corpus_text
    documents.map { |f| File.read(f) }.join("\n")
  end
  private_class_method :corpus_text
end
|
89
|
+
|
90
|
+
if __FILE__ == $0
|
91
|
+
puts "The document cache contains #{DocumentCache.documents.size} documents."
|
92
|
+
puts
|
93
|
+
puts "Here are the 10 most frequent stems:"
|
94
|
+
DocumentCache.stemmed_frequency_list[0,10].each{|stem, count| puts "#{count} #{stem}"}
|
95
|
+
puts
|
96
|
+
puts "Here are the 10 most frequent words:"
|
97
|
+
DocumentCache.frequency_list[0,10].each{|word, count| puts "#{count} #{word}"}
|
98
|
+
end
|
data/lib/game.rb
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'amatch'
|
3
|
+
require 'colorize'
|
4
|
+
|
5
|
+
# Interactive fill-in-the-blank vocabulary game for the terminal.
#
# The game keeps cycling through the words in random order until every word
# has come up at least once and the recent hit rate reaches 95%.
class Game
  # words - array of words to practise.
  def initialize(words)
    @words = words
    @results = []   # 1 for every accepted answer, 0 for every miss
    @turn = 0
  end

  # Returns +word+ together with up to four other words of the game, in
  # random order.
  def pick_choices_for(word)
    others = @words.reject { |w| w == word }
    (others.shuffle[0, 4] + [word]).shuffle
  end

  # Returns the percentage of hits among the most recent results, looking
  # back over as many results as there are words (or turns, if fewer turns
  # have been played). Returns 0 before the first turn and never goes below 0.
  def hit_rate
    number_of_turns_we_remember = [@words.size, @turn].min
    return 0 if number_of_turns_we_remember.zero?

    recent_results = @results.last(number_of_turns_we_remember)
    hits = recent_results.inject(0) { |sum, value| sum + value }
    misses = recent_results.size - hits

    rate = 100 - misses / number_of_turns_we_remember.to_f * 100
    rate >= 0 ? rate : 0
  end

  # Replaces German umlauts and sharp s with their ASCII transliterations.
  # Upper-case umlauts are handled as well, so that "Ärger" and "Aerger"
  # compare as equal in proximity.
  def romanize(w)
    w.gsub("ä", "ae").gsub("ö", "oe").gsub("ü", "ue").gsub("ß", "ss").
      gsub("Ä", "Ae").gsub("Ö", "Oe").gsub("Ü", "Ue")
  end

  # Returns the Levenshtein match result between the romanized, downcased
  # forms of the two words.
  def proximity(one, other)
    Amatch::Levenshtein.new(romanize(one).downcase).match(romanize(other).downcase)
  end

  # Prompts until the user types '?' or an answer within distance 1 of
  # +correct_answer+. Every rejected attempt is recorded as a miss and the
  # definition of the rejected word is shown. Returns the final input.
  def get_an_answer_for(correct_answer)
    STDOUT.write("> ")
    answer = STDIN.gets.chomp

    while answer != '?' && proximity(answer, correct_answer) > 1
      @results << 0
      display_definition(answer)
      puts
      puts "Nope. Try again.".red
      puts
      STDOUT.write("> ")
      answer = STDIN.gets.chomp
    end

    answer
  end

  # Looks +word+ up with the external `dict` program and returns the first
  # two output lines that contain a space, stripped of spaces and joined by
  # " -- ". Returns "" when `dict` is not installed.
  #
  # The program is started directly with +word+ as a single argument rather
  # than through a shell command line, so quotes, "$" or backticks typed by
  # the user cannot be interpreted by a shell.
  def fetch_definition(word)
    output = begin
      IO.popen(["dict", word], :err => File::NULL) { |io| io.read }
    rescue Errno::ENOENT
      ""
    end
    selected = output.to_s.lines.select { |line| line.include?(' ') }.first(2)
    definitions = selected.join.chomp.gsub(" ", "").split(/[\r\n]/)
    definitions.uniq.join(" -- ")
  end

  # Prints the definition of +word+, if one could be fetched.
  def display_definition(word)
    definition = fetch_definition(word)
    puts "\n#{word.blue} means: #{definition}" unless definition.empty?
  end

  # Prints the status header, +sentence+ with +word+ blanked out, and the
  # list of choices.
  def show_question_for(word, sentence)
    puts
    puts
    puts
    puts "Turn #{@turn}".blue
    puts "#{@words.size} words".blue
    color = if hit_rate < 75
              :red
            elsif hit_rate < 90
              :yellow
            else
              'green'
            end
    puts "Hit rate: #{'%i' % hit_rate}%".send(color)
    puts "------------------------------------------------------------".blue
    puts
    # Prefer blanking whole-word, case-insensitive matches; fall back to a
    # plain substring replacement.
    whole_word = /\b#{Regexp.escape(word)}\b/i
    blanked = sentence =~ whole_word ? sentence.gsub(whole_word, "______") : sentence.gsub(word, "______")
    puts blanked.strip
    puts
    puts "Choices: [" + " #{pick_choices_for(word).join(" - ")} ".blue + "]"
    puts
  end

  # Prints the closing message.
  def end_game
    puts
    puts
    puts "#{"Congratulations!".green} Here is a star for you: #{'*'.yellow}"
    puts
  end

  # Records and reports the outcome of a turn: '?' counts as a miss, anything
  # else that reaches this point counts as a hit (exact or near match).
  def respond_to_answer(answer, correct_answer)
    puts
    if answer == '?'
      @results << 0
      puts "The answer was: #{correct_answer.red}."
      display_definition(correct_answer)
      sleep 2
    elsif proximity(answer, correct_answer) > 0
      @results << 1
      puts "Sort of... the answer was: #{correct_answer.yellow}."
      display_definition(correct_answer)
      sleep 1
    else
      @results << 1
      puts "Correct! The answer was: #{correct_answer.green}."
      display_definition(correct_answer)
    end
  end

  # Runs the game. The block receives a word and must return
  # [sentence, correct_answer]; words for which either is nil are skipped.
  #
  # Rounds are repeated in a loop rather than by recursion, so long sessions
  # do not grow the call stack. The method returns immediately when there are
  # no words, and stops after a round in which no word could be played.
  def play(&block)
    return if @words.empty?

    loop do
      asked_something = false

      @words.shuffle.each do |word|
        @turn += 1

        sentence, correct_answer = yield word
        if sentence.nil? || correct_answer.nil?
          puts "Could not find anything to play with for word #{word}."
          next
        end

        asked_something = true
        show_question_for(correct_answer, sentence)
        answer = get_an_answer_for(correct_answer)
        respond_to_answer(answer, correct_answer)

        if @turn >= @words.size && hit_rate >= 95
          end_game
          return
        end
      end

      return unless asked_something
    end
  end
end
|
122
|
+
|
data/lib/lookup.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Looks words up with the external `dict` program.
module Lookup

  # Returns the first two lines of `dict` output for +word+ that contain a
  # space, stripped of spaces and joined by " -- ". Returns "" when `dict` is
  # not installed.
  #
  # `dict` is started directly with +word+ as a single argument rather than
  # through a shell command line, so quotes, "$" or backticks in the word
  # cannot be interpreted by a shell.
  def self.fetch_definition(word)
    output = begin
      IO.popen(["dict", word], :err => File::NULL) { |io| io.read }
    rescue Errno::ENOENT
      ""
    end
    selected = output.to_s.lines.select { |line| line.include?(' ') }.first(2)
    definitions = selected.join.chomp.gsub(" ", "").split(/[\r\n]/)
    definitions.uniq.join(" -- ")
  end

  # Removes commas and full stops from +word+.
  def self.sanitize(word)
    word.gsub(/[,\.]/, "")
  end

  # Returns one "word<TAB>definition" line per entry of +words+, joined by
  # newlines. Words are sanitized before the lookup.
  def self.go(words)
    words.map { |w| sanitize(w) }.map { |w| "#{w}\t#{fetch_definition(w)}" }.join("\n")
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'fileutils.rb'
|
3
|
+
require 'rubygems'
|
4
|
+
require 'lingua/stemmer'
|
5
|
+
|
6
|
+
ROOT_DIR = File.expand_path(ENV['chest_location'] || "~/.vocabulary-chest")
|
7
|
+
KNOWN_FILE = "#{ROOT_DIR}/known"
|
8
|
+
UNKNOWN_FILE = "#{ROOT_DIR}/unknown"
|
9
|
+
|
10
|
+
FileUtils::mkdir_p(ROOT_DIR)
|
11
|
+
FileUtils.touch(KNOWN_FILE)
|
12
|
+
FileUtils.touch(UNKNOWN_FILE)
|
13
|
+
|
14
|
+
# Persistent store of the stems of known and unknown words, kept in the
# KNOWN_FILE and UNKNOWN_FILE text files (one stem per line).
module VocabularyChest
  # Both files are opened for appending when the module is loaded and closed
  # when the process exits.
  @known_file = File.open(KNOWN_FILE,'a')
  @unknown_file = File.open(UNKNOWN_FILE,'a')
  # Lazily loaded caches of the file contents; see known_words/unknown_words.
  @known_words = nil
  @unknown_words = nil
  @stemmer= Lingua::Stemmer.new(:language => "de")

  at_exit {@known_file.close}
  at_exit {@unknown_file.close}

  class << self
    # Returns the stems stored in KNOWN_FILE. The file is read on first use
    # and the result is cached; words added afterwards are not reflected.
    def known_words
      @known_words ||= File.open(KNOWN_FILE, 'r') { |f| f.readlines }.collect { |line| line.chomp }
    end

    # Returns the stems stored in UNKNOWN_FILE, read once and cached.
    def unknown_words
      @unknown_words ||= File.open(UNKNOWN_FILE, 'r') { |f| f.readlines }.collect { |line| line.chomp }
    end

    # Appends the stem of +word+ to the known-words file and flushes it.
    def add_to_known_words(word)
      @known_file.puts(stem(word))
      @known_file.flush
    end

    # Appends the stem of +word+ to the unknown-words file and flushes it.
    def add_to_unknown_words(word)
      @unknown_file.puts(stem(word))
      @unknown_file.flush
    end

    # True when the stem of +word+ is recorded as either known or unknown.
    def contains?(word)
      stemmed_word = stem(word)
      known_words.include?(stemmed_word) || unknown_words.include?(stemmed_word)
    end

    # True when the stem of +word+ is recorded as known, or when the
    # sanitized word is empty or consists only of digits and dashes.
    def is_known?(word)
      return true if known_words.include?(stem(word))
      return true if sanitize(word).empty?

      !(sanitize(word) =~ /^[-\d]*$/).nil?
    end

    # Returns the downcased stem of the sanitized +word+.
    def stem(word)
      @stemmer.stem(sanitize(word)).downcase
    end

    # Removes punctuation and quotation marks from +word+.
    def sanitize(word)
      word.gsub(/[,\"\.:;()?!„“]/,"")
    end
  end
end
|
59
|
+
|
60
|
+
if __FILE__ == $0
|
61
|
+
known = VocabularyChest::known_words
|
62
|
+
unknown = VocabularyChest::unknown_words
|
63
|
+
puts "The chest contains #{known.size} known words."
|
64
|
+
end
|
metadata
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: text-analysis-utils
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 9
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: "0.1"
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Matt
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2012-06-21 00:00:00 Z
|
18
|
+
dependencies: []
|
19
|
+
|
20
|
+
description:
|
21
|
+
email:
|
22
|
+
executables:
|
23
|
+
- cache-document
|
24
|
+
- classify-new-words
|
25
|
+
- find-examples-for
|
26
|
+
- frequency-list
|
27
|
+
- lookup
|
28
|
+
- percentage-known-of
|
29
|
+
- play-with-blanks
|
30
|
+
- play-with-examples
|
31
|
+
- prepare-text
|
32
|
+
- proximity-of-words
|
33
|
+
- readability-of
|
34
|
+
- vocabulary-coverage
|
35
|
+
extensions: []
|
36
|
+
|
37
|
+
extra_rdoc_files: []
|
38
|
+
|
39
|
+
files:
|
40
|
+
- lib/text-analysis-utils.rb
|
41
|
+
- lib/document-cache.rb
|
42
|
+
- lib/vocabulary-chest.rb
|
43
|
+
- lib/game.rb
|
44
|
+
- lib/lookup.rb
|
45
|
+
- bin/cache-document
|
46
|
+
- bin/classify-new-words
|
47
|
+
- bin/find-examples-for
|
48
|
+
- bin/frequency-list
|
49
|
+
- bin/lookup
|
50
|
+
- bin/percentage-known-of
|
51
|
+
- bin/play-with-blanks
|
52
|
+
- bin/play-with-examples
|
53
|
+
- bin/prepare-text
|
54
|
+
- bin/proximity-of-words
|
55
|
+
- bin/readability-of
|
56
|
+
- bin/vocabulary-coverage
|
57
|
+
homepage: http://github.com/matstc/text-analysis-utils
|
58
|
+
licenses: []
|
59
|
+
|
60
|
+
post_install_message:
|
61
|
+
rdoc_options: []
|
62
|
+
|
63
|
+
require_paths:
|
64
|
+
- lib
|
65
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
66
|
+
none: false
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
hash: 57
|
71
|
+
segments:
|
72
|
+
- 1
|
73
|
+
- 8
|
74
|
+
- 7
|
75
|
+
version: 1.8.7
|
76
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
77
|
+
none: false
|
78
|
+
requirements:
|
79
|
+
- - ">="
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
hash: 3
|
82
|
+
segments:
|
83
|
+
- 0
|
84
|
+
version: "0"
|
85
|
+
requirements: []
|
86
|
+
|
87
|
+
rubyforge_project:
|
88
|
+
rubygems_version: 1.8.15
|
89
|
+
signing_key:
|
90
|
+
specification_version: 3
|
91
|
+
summary: Utilities to help language learners
|
92
|
+
test_files: []
|
93
|
+
|