text-analysis-utils 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/cache-document +15 -0
- data/bin/classify-new-words +69 -0
- data/bin/find-examples-for +22 -0
- data/bin/frequency-list +21 -0
- data/bin/lookup +9 -0
- data/bin/percentage-known-of +40 -0
- data/bin/play-with-blanks +28 -0
- data/bin/play-with-examples +25 -0
- data/bin/prepare-text +9 -0
- data/bin/proximity-of-words +44 -0
- data/bin/readability-of +54 -0
- data/bin/vocabulary-coverage +16 -0
- data/lib/document-cache.rb +98 -0
- data/lib/game.rb +122 -0
- data/lib/lookup.rb +15 -0
- data/lib/text-analysis-utils.rb +4 -0
- data/lib/vocabulary-chest.rb +64 -0
- metadata +93 -0
data/bin/cache-document
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), '../lib/document-cache' )
|
4
|
+
|
5
|
+
# Returns the text that should be cached.
#
# With no command-line arguments the whole of standard input is read;
# otherwise the contents of every file named in ARGV are concatenated in
# argument order.
def get_text
  return STDIN.read if ARGV.empty?

  ARGV.inject("") { |combined, path| combined + File.open(path) { |io| io.read } }
end
|
14
|
+
|
15
|
+
# Store the text read from the given files (or from standard input) as a new
# document in the cache.
DocumentCache.add(get_text)
|
@@ -0,0 +1,69 @@
|
|
1
|
+
#!/usr/bin/env ruby1.9.3
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'colorize'
|
5
|
+
require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
|
6
|
+
require File.join(File.dirname(__FILE__), '../lib/document-cache' )
|
7
|
+
|
8
|
+
|
9
|
+
# Returns the text to classify: standard input when ARGV is empty, otherwise
# the contents of all files named in ARGV joined together in order.
def get_text
  return STDIN.read if ARGV.empty?

  chunks = ARGV.map { |filename| File.open(filename, 'r') { |file| file.read } }
  chunks.inject("") { |text, chunk| text + chunk }
end
|
20
|
+
|
21
|
+
# Shows +word+ in a short snippet of +text+ and asks the user whether they
# know it.
#
# word  - the word as it appears in the text
# index - zero-based position of the word in +words+ (used for the progress display)
# words - the full list of words being classified
# text  - the text the snippet is taken from
#
# Returns 'y', 'n' or 'skip'. 'skip' is also returned, without prompting,
# when the word cannot be located in the text or when standard input reaches
# end-of-file.
def ask word, index, words, text
  # Prefer a whole-word match, fall back to a plain substring search.
  location = (text =~ /\b#{Regexp.escape(word)}\b/) || text.index(word)
  if location.nil?
    puts "Skipping word: #{word}"
    return 'skip'
  end

  # Show up to 15 characters of context on either side of the word.
  beginning_of_snippet = [location - 15, 0].max
  snippet = text[beginning_of_snippet, 30 + word.size].gsub(/[\r\n]/, " ")

  answer = ''
  until ['y', 'n', 'skip'].include?(answer)
    puts
    puts "------------------------------------------------------------"
    puts "...#{snippet.gsub(word, word.green)}..."
    puts
    puts "Do you know this word? [y or n or skip] (#{index + 1} of #{words.size})"
    STDOUT.write("> ")

    line = STDIN.gets
    # gets returns nil at end-of-file; treat that as a skip instead of crashing.
    return 'skip' if line.nil?

    answer = line.gsub(/[\r\n]/, '')
  end
  answer
end
|
43
|
+
|
44
|
+
# Switches recognised by this script. They are removed from ARGV so that the
# remaining arguments can be treated as file names.
#
# A constant is used instead of a top-level class variable because accessing
# class variables from the top level raises RuntimeError on Ruby 3.0+.
CLASSIFY_OPTIONS = ARGV.select { |arg| ["-n"].include? arg }
ARGV.reject! { |arg| CLASSIFY_OPTIONS.include? arg }

# Reduces +textual_words+ to the words that still need to be classified.
#
# Known words are dropped; with the "-n" switch words already present in the
# chest are dropped as well. Only one word is kept per stem, and stems made up
# solely of digits, dashes, whitespace, commas and dots are ignored.
#
# Note: +textual_words+ is modified in place.
#
# Returns an Array of unique words.
def collect_words_from textual_words
  textual_words.reject! { |w| VocabularyChest::is_known?(w) }
  textual_words.reject! { |w| VocabularyChest::contains?(w) } if CLASSIFY_OPTIONS.include? "-n"

  words_by_stem = textual_words.inject({}) { |hash, w| hash[VocabularyChest::stem w] = w; hash }
  words_by_stem.reject! { |s, w| (s =~ /^[-\d\s,\.]*$/) == 0 }

  words_by_stem.values.uniq
end

text = get_text
textual_words = text.split(" ").collect { |w| w.chomp }
puts "Thanks. Please wait..."

words = collect_words_from textual_words
words.each_with_index { |word, index|
  # extract_matching_words may find nothing; fall back to the word itself so
  # that the prompt can still be shown (or the word skipped) without crashing.
  match = Array(DocumentCache::extract_matching_words(word, text)).first || word
  answer = ask match, index, words, text
  VocabularyChest::add_to_known_words(word) if answer == 'y'
  VocabularyChest::add_to_unknown_words(word) if answer == 'n'
}

puts "Done."
|
@@ -0,0 +1,22 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'colorize'
|
5
|
+
require File.join(File.dirname(__FILE__), '../lib/document-cache' )
|
6
|
+
|
7
|
+
# Number of example sentences to print. It can be overridden with an argument
# of the form --<digits>, e.g. --5.
count = 1

# Only arguments that consist of "--" followed by at least one digit are
# treated as the count; anything else (including other "--" words) stays part
# of the search phrase.
count_param = ARGV.find { |a| a =~ /\A--\d+\z/ }
unless count_param.nil?
  count = count_param.sub("--", "").to_i
  ARGV.reject! { |a| a == count_param }
end

search = ARGV.join(" ")
matches = DocumentCache.find_examples_for search, count
exit(1) if matches.empty?

# Print each sentence with the matching tokens highlighted in green. Tokens
# are de-duplicated so that a repeated token is not colourised twice.
puts matches.map { |sentence, tokens|
  colored_sentence = sentence.dup
  tokens.uniq.each { |m| colored_sentence.gsub!(m, m.green) }
  colored_sentence
}.join("\n")
|
data/bin/frequency-list
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), '../lib/document-cache' )
|
4
|
+
require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
|
5
|
+
|
6
|
+
# Concatenate the contents of every cached document.
text = DocumentCache::documents.inject("") { |combined, path| combined + File.open(path) { |io| io.read } }

# Group every whitespace-separated token by its stem.
frequencies = text.split(" ").each_with_object(Hash.new { |hash, key| hash[key] = [] }) do |w, by_stem|
  by_stem[VocabularyChest::stem(w)] << w
end

# Most frequent stems first.
frequencies = frequencies.sort { |a, b| a[1].size <=> b[1].size }.reverse

# Prints one line per stem: occurrence count, stem, and up to six of the
# original word forms (followed by "..." when there are more).
def output frequencies
  STDOUT.sync = true
  frequencies.each do |stem, occurrences|
    ellipsis = occurrences.size > 6 ? "..." : ""
    puts "#{occurrences.size}\t#{stem}\t#{occurrences[0, 6].join(",")}#{ellipsis}"
  end
end

# With --unknown only stems whose first word form is not known are listed.
if ARGV[0] == "--unknown"
  output frequencies.select { |_stem, occurrences| !VocabularyChest::is_known?(occurrences[0]) }
else
  output frequencies
end
|
data/bin/lookup
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
|
4
|
+
require File.join(File.dirname(__FILE__), '../lib/lookup' )
|
5
|
+
|
6
|
+
# Splits +text+ on whitespace and sorts the words into known and unknown ones
# according to VocabularyChest.is_known?.
#
# Returns a two-element Array [known, unknown]; each list holds the sanitized
# words without duplicates.
def analyse text
  known, unknown = text.split(" ").partition { |w| VocabularyChest.is_known? w }

  [known, unknown].map do |group|
    group.map { |w| VocabularyChest::sanitize w }.uniq
  end
end
|
12
|
+
|
13
|
+
# Prints the report for one analysed text.
#
# options - the [known, unknown] pair of word lists returned by analyse.
#
# The report lists the unknown words, their definitions as returned by
# Lookup::go, and the word counts with the percentage of known words.
def output options
  known, unknown = options
  size = known.size + unknown.size
  percentage = '%.2f' % (known.size.to_f / size * 100)

  puts "", "--", "UNKNOWN WORDS: #{unknown.join(", ")}", "", "DEFINITIONS"
  puts Lookup::go(unknown)
  puts "--", ""
  puts "Total number of unknown words: #{unknown.size}"
  puts "Total number of known words: #{known.size}"
  puts "Total number of words: #{size}"
  puts "Percentage of words known: #{percentage}%"
end
|
30
|
+
|
31
|
+
# Analyse standard input when no files are given; otherwise analyse each file
# in turn, printing its name before its report.
if ARGV.empty?
  output(analyse(STDIN.read))
else
  ARGV.each do |filename|
    contents = File.open(filename, 'r') { |file| file.read }
    puts "#{filename}:"
    output(analyse(contents))
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), '../lib/game' )
|
4
|
+
|
5
|
+
# Returns the input text: the contents of the first file named in ARGV, or
# the whole of standard input when ARGV is empty.
#
# The file branch used to be empty, so the method returned nil whenever a
# file argument was present.
def get_input
  if !ARGV.empty?
    File.open(ARGV[0]) { |f| f.read }
  else
    STDIN.read
  end
end
|
11
|
+
|
12
|
+
if ARGV.size < 2
  puts "Usage: #{$0} <file with words to practice> <file with examples>"
  exit(1)
end

# The first argument lists the words to practise, one per line. Blank lines
# are ignored: an empty word would match every sentence.
input = File.open(ARGV.shift) { |f| f.read }
words = input.split("\n").uniq.reject { |w| w.strip.empty? }

# All remaining arguments are files with example sentences, one per line.
# The block form of File.open closes each file after reading.
example_sentences = []
ARGV.each { |filename| example_sentences += File.open(filename) { |f| f.readlines } }
example_sentences.map! { |s| s.chomp }

# Keep only the sentences that contain at least one of the words, and only
# the words that occur in at least one of the remaining sentences.
example_sentences.reject! { |s| words.find { |w| (s =~ /\b#{Regexp.escape(w)}\b/i) != nil } == nil }
words.reject! { |w| example_sentences.find { |s| (s =~ /\b#{Regexp.escape(w)}\b/i) != nil } == nil }

puts "Playing with #{example_sentences.size} sentences and #{words.size} words."

Game.new(words).play { |word|
  pattern = /\b#{Regexp.escape(word)}\b/i
  sentence = example_sentences.shuffle.find { |s| (s =~ pattern) != nil }
  # The answer is the word exactly as it is written in the chosen sentence.
  [sentence, sentence && sentence[pattern]]
}
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), '../lib/document-cache' )
|
4
|
+
require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
|
5
|
+
require File.join(File.dirname(__FILE__), '../lib/game' )
|
6
|
+
|
7
|
+
# Returns the list of words to play with as one string: standard input when
# no argument is given, otherwise the contents of the first file in ARGV.
def get_input
  return STDIN.read if ARGV.empty?

  File.open(ARGV[0]) { |f| f.read }
end
|
14
|
+
|
15
|
+
# One word per input line; words without any cached example are dropped.
# A dot is printed per word as a progress indicator.
words = get_input.split("\n").reject do |w|
  STDOUT.write(".")
  STDOUT.flush
  DocumentCache.find_examples_for(w).empty?
end
puts

# For every turn pick the shortest of up to ten cached example sentences and
# use the form of the word found in that sentence as the expected answer.
Game.new(words).play do |word|
  candidates = DocumentCache.find_examples_for(word, 10).keys
  sentence = candidates.sort { |a, b| a.size <=> b.size }.first
  [sentence, DocumentCache::extract_matching_words(word, sentence).first]
end
|
data/bin/prepare-text
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Save standard input to a scratch file, then replace this process with a
# shell pipeline that runs the other tools of this package on that file. Each
# step only runs if the previous one succeeded.
text = STDIN.read
File.open("/tmp/prepared-text", 'w') { |scratch| scratch.write(text) }

exec("classify-new-words /tmp/prepared-text && \
cache-document /tmp/prepared-text && \
echo '\nREADABILITY STATISTICS' && \
readability-of /tmp/prepared-text && \
percentage-known-of /tmp/prepared-text")
|
@@ -0,0 +1,44 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'amatch'
|
5
|
+
|
6
|
+
# Returns the result of matching +w2+ against an Amatch::Levenshtein matcher
# built from +w1+.
def distance w1, w2
  matcher = Amatch::Levenshtein.new(w1)
  matcher.match(w2)
end
|
9
|
+
|
10
|
+
# For every distinct word of +text+ prints a tab-separated line with the
# word, the closest word found in +known_text+ and the distance between them.
# Standard output is flushed after every line.
def analyse text, known_text
  known_words = words_of(known_text)

  words_of(text).map do |w|
    closest_word, proximity = find_closest_word(w, known_words)
    puts "#{w}\t#{closest_word}\t#{proximity}"
    STDOUT.flush
  end
end
|
20
|
+
|
21
|
+
# Finds the entry of +known_words+ with the smallest distance to +word+.
#
# The distance to each candidate is computed exactly once.
#
# Returns a two-element Array [closest_word, distance], or [nil, nil] when
# +known_words+ is empty.
def find_closest_word word, known_words
  return [nil, nil] if known_words.empty?

  scored = known_words.map { |candidate| [candidate, distance(candidate, word)] }
  scored.min_by { |_candidate, score| score }
end
|
26
|
+
|
27
|
+
# Returns the distinct whitespace-separated words of +text+, in order of
# first appearance.
def words_of text
  tokens = text.split(" ")
  tokens.uniq
end
|
30
|
+
|
31
|
+
(puts "usage: ./script <new text> <known text>"; exit 1) if ARGV.size < 2

# The first argument is the new text; every further argument contributes to
# the known text.
filename = ARGV.shift
text = File.open(filename, 'r') { |file| file.read }

known_text = ARGV.inject("") do |combined, path|
  combined + File.open(path, 'r') { |file| file.read }
end

analyse text, known_text
|
data/bin/readability-of
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Computes readability statistics for +text+.
#
# Words are whitespace-separated tokens, sentences are the non-blank pieces
# between '.', '?' and '!', and syllables are estimated from the vowel groups
# returned by vowels (scaled by 0.9 to account for silent vowels).
#
# Returns a Hash with the keys :words, :sentences, :syllables, :ms (the
# percentage of words with three or more vowel groups),
# :wiener_sachtextformel and :grade_level.
def analyse text
  tokens = text.split(" ")
  vowel_groups = tokens.map { |w| vowels(w) }

  word_count = tokens.size.to_f
  sentence_count = text.split(/\.|\?|!/).reject { |s| s.strip.empty? }.size.to_f
  syllable_count = vowel_groups.inject(0) { |sum, groups| sum + groups.size }.to_f * 0.9 # for silent vowels
  long_word_count = vowel_groups.select { |groups| groups.size >= 3 }.size
  ms = long_word_count.to_f / tokens.size.to_f * 100

  {:words => word_count,
   :sentences => sentence_count,
   :syllables => syllable_count,
   :ms => ms,
   :wiener_sachtextformel => wiener_sachtextformel(sentence_count, word_count, ms),
   :grade_level => grade(sentence_count, word_count, syllable_count)}
end
|
18
|
+
|
19
|
+
# Evaluates 0.2656 * (words / sentences) + 0.2744 * ms - 1.693.
#
# sentences - number of sentences
# words     - number of words
# ms        - value weighted by 0.2744 (the caller passes the percentage of
#             words with three or more vowel groups)
def wiener_sachtextformel sentences, words, ms
  words_per_sentence = words / sentences
  0.2656 * words_per_sentence + 0.2744 * ms - 1.693
end
|
22
|
+
|
23
|
+
# Evaluates 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59,
# which the report labels "Flesch-Kincaid Grade Level".
def grade sentences, words, syllables
  words_per_sentence = words / sentences
  syllables_per_word = syllables / words
  0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59
end
|
26
|
+
|
27
|
+
# Splits +w+ at its consonants and returns the remaining non-empty pieces,
# i.e. the groups of adjacent non-consonant characters. The number of groups
# is used as a rough syllable count.
#
# 'x' is treated as a consonant like the other consonant letters; 'y' is
# deliberately left out of the list and therefore counts as a vowel.
def vowels w
  w.split(/[bcdfghjklmnpqrstvwxz]/i).reject { |s| s.empty? }
end
|
30
|
+
|
31
|
+
# Prints the readability report.
#
# options - Hash as returned by analyse (:sentences, :words, :syllables,
#           :wiener_sachtextformel, :grade_level), optionally with :source,
#           the name of the analysed file, which is printed as a heading.
def output options
  puts
  puts "#{options[:source]}:" unless options[:source].nil?
  puts "Number of sentences: #{options[:sentences]}"
  puts "Number of words: #{options[:words]}"
  puts "Number of syllables: #{options[:syllables]}"
  puts "Average number of syllables per word: #{ '%.2f' % (options[:syllables] / options[:words])}"
  puts "Average number of words per sentence: #{'%.2f' % (options[:words] / options[:sentences])}"
  puts "Wiener Sachtextformel: #{'%.2f' % options[:wiener_sachtextformel]}"
  puts "Flesch-Kincaid Grade Level: #{'%.2f' % options[:grade_level]}"
  puts
end
|
45
|
+
|
46
|
+
# Report on standard input when no files are given; otherwise report on each
# file separately, labelled with its name.
if ARGV.empty?
  output(analyse(STDIN.read))
else
  ARGV.each do |filename|
    contents = File.open(filename, 'r') { |file| file.read }
    output(analyse(contents).merge(:source => filename))
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Shell script that sums the occurrence counts printed by frequency-list (all
# stems, unknown stems, and the first 500 unknown stems) and prints the
# resulting coverage percentages using bc.
command = <<EOF

total_occurrences=`expr $(frequency-list | awk '{print $1}' | xargs | sed 's/ / + /g')`
unknown_occurrences=`expr $(frequency-list --unknown | awk '{print $1}' | xargs | sed 's/ / + /g')`
next_500_occurrences=`expr $(frequency-list --unknown | awk '{print $1}' | head -500 | xargs | sed 's/ / + /g')`

echo Total occurrences: $total_occurrences
echo Unknown occurrences: $unknown_occurrences
echo Your current vocabulary knowledge covers $(echo "scale=2;($total_occurrences - $unknown_occurrences) / $total_occurrences * 100" | bc -q)% of all occurrences
echo The next 500 words will bring your cover to $(echo "scale=2;($total_occurrences - $unknown_occurrences + $next_500_occurrences) / $total_occurrences * 100" | bc -q)%

EOF

system(command)
|
@@ -0,0 +1,98 @@
|
|
1
|
+
require 'fileutils.rb'
|
2
|
+
require 'rubygems'
|
3
|
+
require 'uuid'
|
4
|
+
|
5
|
+
require File.join(File.dirname(__FILE__), 'vocabulary-chest' )
|
6
|
+
|
7
|
+
# Directory that holds the cached documents, inside the chest's root directory.
CACHE_DIR = "#{ROOT_DIR}/docs"

# Make sure both directories exist before the cache is used.
[ROOT_DIR, CACHE_DIR].each { |directory| FileUtils::mkdir_p(directory) }
|
11
|
+
|
12
|
+
# A store of plain-text documents kept as files in CACHE_DIR, with helpers to
# search them for example sentences and to build word frequency lists.
module DocumentCache
  # Writes +search+ (the document text) to a new, uniquely named file in
  # CACHE_DIR.
  def self.add search
    filename = "#{CACHE_DIR}/#{UUID.new.generate}"
    File.open(filename, 'w') { |f| f.write(search) }
  end

  # Finds the sentences containing a word with the same stem as +search+.
  #
  # Returns a Hash mapping each matching (cleaned) sentence to the sanitized
  # words of that sentence that share the stem.
  def self.find_matches_by_stemming search, sentences
    token = VocabularyChest::stem(search)
    sentences.inject({}) { |hash, s|
      found = s.split(" ").select { |w| VocabularyChest::stem(w) == token }
      hash[clean(s)] = found.map { |f| VocabularyChest::sanitize(f) } unless found.empty?
      hash
    }
  end

  # Finds the sentences that contain +search+ as a plain substring.
  #
  # Returns a Hash mapping each matching (cleaned) sentence to [search].
  def self.find_matches_by_grepping search, sentences
    sentences.inject({}) { |hash, s|
      hash[clean(s)] = [search] if s.include? search
      hash
    }
  end

  # Searches the files +filenames+ for up to +count+ sentences matching
  # +search+, first by stem and then, if more are needed, by substring.
  #
  # Returns a Hash of sentence => matching tokens with at most +count+ entries.
  def self.find_matches_in filenames, search, count
    matches = {}

    [:find_matches_by_stemming, :find_matches_by_grepping].each { |matcher|
      filenames.each { |filename|
        sentences = File.read(filename).split(/[\.?!\n]/)
        # Keep the tokens already recorded for a sentence: the substring pass
        # must not replace the more complete list found by stemming.
        matches.merge!(send(matcher, search, sentences)) { |_sentence, existing, _new| existing }

        # Drop the oldest entries when there are too many.
        matches.shift while matches.size > count
        return matches if matches.size == count
      }
    }

    matches
  end

  # Returns the paths of all cached documents.
  def self.documents
    Dir["#{CACHE_DIR}/*"]
  end

  # Returns up to +count+ example sentences for +search+ from the cache, as a
  # Hash of sentence => matching tokens.
  def self.find_examples_for search, count=1
    find_matches_in documents, search, count
  end

  # Strips surrounding whitespace from +sentence+ and appends a full stop.
  def self.clean(sentence)
    sentence.strip + "."
  end

  # Returns the words of +sentence+ that match +search+, by stem if possible
  # and otherwise by substring. Returns an empty Array when nothing matches,
  # so that callers can safely call +first+ on the result.
  def self.extract_matching_words search, sentence
    matches = find_matches_by_stemming(search, [sentence])
    return matches.values.first unless matches.empty?
    find_matches_by_grepping(search, [sentence]).values.first || []
  end

  # Returns [word, count] pairs for every word occurring at least twice in
  # the cache, most frequent first.
  def self.frequency_list
    counts = all_text.split(" ").inject(Hash.new(0)) { |h, w| h[w] += 1; h }
    counts.reject! { |word, count| count < 2 }
    counts.sort_by { |k, v| v }.reverse
  end

  # Returns [stem, count] pairs for every stem occurring at least twice in
  # the cache, most frequent first.
  def self.stemmed_frequency_list
    stems = all_text.split(" ").map { |w| VocabularyChest::stem(w) }
    counts = stems.inject(Hash.new(0)) { |h, stem| h[stem] += 1; h }
    counts.reject! { |stem, count| count < 2 }
    counts.sort_by { |k, v| v }.reverse
  end

  # Returns the concatenated contents of all cached documents. File.read
  # closes each file, unlike the previous File.open(f).read.
  def self.all_text
    documents.map { |f| File.read(f) }.join
  end
  private_class_method :all_text
end
|
89
|
+
|
90
|
+
# When run directly, print a short summary of the cache.
if __FILE__ == $0
  puts "The document cache contains #{DocumentCache.documents.size} documents."
  puts
  puts "Here are the 10 most frequent stems:"
  DocumentCache.stemmed_frequency_list.first(10).each do |stem, count|
    puts "#{count} #{stem}"
  end
  puts
  puts "Here are the 10 most frequent words:"
  DocumentCache.frequency_list.first(10).each do |word, count|
    puts "#{count} #{word}"
  end
end
|
data/lib/game.rb
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'amatch'
|
3
|
+
require 'colorize'
|
4
|
+
|
5
|
+
require 'shellwords'

# An interactive fill-in-the-blank game played on the terminal.
#
# The game goes through the words in random order, round after round, until
# every word has been played at least once and the recent hit rate is 95% or
# more.
class Game
  # words - Array of words to practise.
  def initialize words
    @words = words
    @results = []   # 1 for every hit, 0 for every miss, oldest first
    @turn = 0
  end

  # Returns +word+ together with up to four other words, in random order.
  def pick_choices_for word
    others = @words.reject { |w| w == word }
    (others.shuffle[0, 4] + [word]).shuffle
  end

  # Returns the percentage of hits among the most recent results. The window
  # is as many results as there are words, or as turns played if fewer.
  # Returns 0 when the rate cannot be computed (no turn played yet) or would
  # be negative.
  def hit_rate
    number_of_turns_we_remember = [@words.size, @turn].min
    recent_results = @results.last(number_of_turns_we_remember)
    hits = recent_results.inject(0) { |sum, value| sum + value }
    misses = recent_results.size - hits

    rate = 100 - misses / number_of_turns_we_remember.to_f * 100
    rate >= 0 ? rate : 0
  end

  # Replaces lower-case German umlauts and the sharp s with their ASCII
  # transcriptions.
  def romanize w
    w.gsub("ä", "ae").gsub("ö", "oe").gsub("ü", "ue").gsub("ß", "ss")
  end

  # Returns the Amatch::Levenshtein match result between the romanized,
  # lower-cased forms of +one+ and +other+.
  def proximity one, other
    Amatch::Levenshtein.new(romanize(one).downcase).match(romanize(other).downcase)
  end

  # Prompts until the player enters '?' or an answer whose proximity to
  # +correct_answer+ is at most 1. Every rejected attempt is recorded as a
  # miss and, when available, the definition of the attempt is shown.
  #
  # Returns the accepted input.
  def get_an_answer_for correct_answer
    STDOUT.write("> ")
    answer = STDIN.gets.chomp

    while answer != '?' && proximity(answer, correct_answer) > 1
      @results << 0
      display_definition(answer)
      puts
      puts "Nope. Try again.".red
      puts
      STDOUT.write("> ")
      answer = STDIN.gets.chomp
    end

    answer
  end

  # Looks +word+ up with the external `dict` command and returns its output
  # condensed to a single line ("" when nothing is printed).
  #
  # The word is shell-escaped because it may be arbitrary text typed by the
  # player or taken from an input file.
  def fetch_definition word
    definitions = `dict #{Shellwords.escape(word)} 2>/dev/null | grep ' ' | head -2`.chomp.gsub(" ","").split(/[\r\n]/)
    definitions.uniq.join(" -- ")
  end

  # Prints the definition of +word+, if one was found.
  def display_definition word
    definition = fetch_definition(word)
    puts "\n#{word.blue} means: #{definition}" if !definition.empty?
  end

  # Prints the status header, the sentence with +word+ blanked out and the
  # list of choices.
  def show_question_for word, sentence
    puts
    puts
    puts
    puts "Turn #{@turn}".blue
    puts "#{@words.size} words".blue

    current_rate = hit_rate
    color = if current_rate < 75
      :red
    elsif current_rate < 90
      :yellow
    else
      :green
    end
    puts "Hit rate: #{'%i' % current_rate}%".send(color)
    puts "------------------------------------------------------------".blue
    puts

    # Blank out whole-word, case-insensitive occurrences if there are any,
    # otherwise every literal occurrence.
    pattern = /\b#{Regexp.escape(word)}\b/i
    blanked = (sentence =~ pattern) != nil ? sentence.gsub(pattern, "______") : sentence.gsub(word, "______")
    puts blanked.strip
    puts
    puts "Choices: [" + " #{pick_choices_for(word).join(" - ")} ".blue + "]"
    puts
  end

  # Prints the closing message.
  def end_game
    puts
    puts
    puts "#{"Congratulations!".green} Here is a star for you: #{'*'.yellow}"
    puts
  end

  # Records the result for +answer+ and tells the player how they did:
  # '?' counts as a miss, everything else as a hit.
  def respond_to_answer answer, correct_answer
    puts
    if answer == '?'
      @results << 0
      puts "The answer was: #{correct_answer.red}."
      display_definition(correct_answer)
      sleep 2
    elsif proximity(answer, correct_answer) > 0
      @results << 1
      puts "Sort of... the answer was: #{correct_answer.yellow}."
      display_definition(correct_answer)
      sleep 1
    else
      @results << 1
      puts "Correct! The answer was: #{correct_answer.green}."
      display_definition(correct_answer)
    end
  end

  # Runs the game.
  #
  # The block receives a word and must return [sentence, correct_answer];
  # when either is nil the word is skipped for that round.
  #
  # Rounds are repeated in a loop rather than by recursion so the call stack
  # does not grow, and nothing is played when there are no words.
  def play &block
    return if @words.empty?

    loop do
      @words.shuffle.each { |word|
        @turn += 1

        sentence, correct_answer = yield word
        (puts "Could not find anything to play with for word #{word}."; next) if sentence.nil? || correct_answer.nil?

        show_question_for(correct_answer, sentence)
        answer = get_an_answer_for(correct_answer)
        respond_to_answer(answer, correct_answer)

        (end_game; return) if @turn >= @words.size && (hit_rate >= 95)
      }
    end
  end
end
|
122
|
+
|
data/lib/lookup.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'shellwords'

# Looks up word definitions with the external `dict` command.
module Lookup

  # Returns the output of `dict` for +word+ condensed to a single line, or ""
  # when nothing is printed.
  #
  # The word is shell-escaped because it comes from arbitrary input text.
  def self.fetch_definition word
    definitions = `dict #{Shellwords.escape(word)} 2>/dev/null | grep ' ' | head -2`.chomp.gsub(" ","").split(/[\r\n]/)
    definitions.uniq.join(" -- ")
  end

  # Removes commas and full stops from +word+.
  def self.sanitize word
    word.gsub(/[,\.]/,"")
  end

  # Returns one "word<TAB>definition" line per entry of +words+, joined with
  # newlines.
  def self.go words
    words.map { |w| sanitize w }.map { |w| "#{w}\t#{fetch_definition w}" }.join("\n")
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'fileutils.rb'
|
3
|
+
require 'rubygems'
|
4
|
+
require 'lingua/stemmer'
|
5
|
+
|
6
|
+
# Location of the vocabulary chest; the chest_location environment variable
# overrides the default directory in the user's home.
ROOT_DIR = File.expand_path(ENV['chest_location'] || "~/.vocabulary-chest")
KNOWN_FILE = "#{ROOT_DIR}/known"
UNKNOWN_FILE = "#{ROOT_DIR}/unknown"

# Make sure the directory and both word lists exist.
FileUtils::mkdir_p(ROOT_DIR)
[KNOWN_FILE, UNKNOWN_FILE].each { |path| FileUtils.touch(path) }
|
13
|
+
|
14
|
+
# The learner's vocabulary: two append-only files, KNOWN_FILE and
# UNKNOWN_FILE, holding one word stem per line.
module VocabularyChest
  @known_file = File.open(KNOWN_FILE,'a')
  @unknown_file = File.open(UNKNOWN_FILE,'a')
  @known_words = nil     # lazily loaded contents of KNOWN_FILE
  @unknown_words = nil   # lazily loaded contents of UNKNOWN_FILE
  @stemmer= Lingua::Stemmer.new(:language => "de")

  at_exit {@known_file.close}
  at_exit {@unknown_file.close}

  # Returns the stems listed in KNOWN_FILE. The file is read on first use and
  # the list is kept in memory afterwards.
  def self.known_words
    @known_words ||= File.open(KNOWN_FILE,'r'){|f|f.readlines}.collect{|line| line.chomp}
  end

  # Returns the stems listed in UNKNOWN_FILE. The file is read on first use
  # and the list is kept in memory afterwards.
  def self.unknown_words
    @unknown_words ||= File.open(UNKNOWN_FILE,'r'){|f|f.readlines}.collect{|line| line.chomp}
  end

  # Appends the stem of +word+ to KNOWN_FILE. If the known words have already
  # been loaded, the in-memory list is updated too so that is_known? and
  # contains? see the new word immediately.
  def self.add_to_known_words word
    stemmed_word = stem word
    @known_file.puts(stemmed_word)
    @known_words << stemmed_word if @known_words
    @known_file.flush
  end

  # Appends the stem of +word+ to UNKNOWN_FILE, keeping an already loaded
  # in-memory list in step.
  def self.add_to_unknown_words word
    stemmed_word = stem word
    @unknown_file.puts(stemmed_word)
    @unknown_words << stemmed_word if @unknown_words
    @unknown_file.flush
  end

  # Returns true when the stem of +word+ is listed as known or as unknown.
  def self.contains? word
    stemmed_word = stem word
    known_words.include?(stemmed_word) || unknown_words.include?(stemmed_word)
  end

  # Returns true when the stem of +word+ is listed as known, or when the
  # sanitized word is empty or consists only of digits and dashes.
  def self.is_known? word
    known_words.include?(stem(word)) || sanitize(word).empty? || (sanitize(word) =~ /^[-\d]*$/) != nil
  end

  # Returns the lower-cased stem of the sanitized +word+.
  def self.stem word
    @stemmer.stem(sanitize word).downcase
  end

  # Removes punctuation and quotation marks from +word+.
  def self.sanitize word
    word.gsub(/[,\"\.:;()?!„“]/,"")
  end
end
|
59
|
+
|
60
|
+
# When run directly, load both word lists and report the number of known words.
if __FILE__ == $0
  known, unknown = VocabularyChest::known_words, VocabularyChest::unknown_words
  puts "The chest contains #{known.size} known words."
end
|
metadata
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: text-analysis-utils
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 9
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: "0.1"
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Matt
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2012-06-21 00:00:00 Z
|
18
|
+
dependencies: []
|
19
|
+
|
20
|
+
description:
|
21
|
+
email:
|
22
|
+
executables:
|
23
|
+
- cache-document
|
24
|
+
- classify-new-words
|
25
|
+
- find-examples-for
|
26
|
+
- frequency-list
|
27
|
+
- lookup
|
28
|
+
- percentage-known-of
|
29
|
+
- play-with-blanks
|
30
|
+
- play-with-examples
|
31
|
+
- prepare-text
|
32
|
+
- proximity-of-words
|
33
|
+
- readability-of
|
34
|
+
- vocabulary-coverage
|
35
|
+
extensions: []
|
36
|
+
|
37
|
+
extra_rdoc_files: []
|
38
|
+
|
39
|
+
files:
|
40
|
+
- lib/text-analysis-utils.rb
|
41
|
+
- lib/document-cache.rb
|
42
|
+
- lib/vocabulary-chest.rb
|
43
|
+
- lib/game.rb
|
44
|
+
- lib/lookup.rb
|
45
|
+
- bin/cache-document
|
46
|
+
- bin/classify-new-words
|
47
|
+
- bin/find-examples-for
|
48
|
+
- bin/frequency-list
|
49
|
+
- bin/lookup
|
50
|
+
- bin/percentage-known-of
|
51
|
+
- bin/play-with-blanks
|
52
|
+
- bin/play-with-examples
|
53
|
+
- bin/prepare-text
|
54
|
+
- bin/proximity-of-words
|
55
|
+
- bin/readability-of
|
56
|
+
- bin/vocabulary-coverage
|
57
|
+
homepage: http://github.com/matstc/text-analysis-utils
|
58
|
+
licenses: []
|
59
|
+
|
60
|
+
post_install_message:
|
61
|
+
rdoc_options: []
|
62
|
+
|
63
|
+
require_paths:
|
64
|
+
- lib
|
65
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
66
|
+
none: false
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
hash: 57
|
71
|
+
segments:
|
72
|
+
- 1
|
73
|
+
- 8
|
74
|
+
- 7
|
75
|
+
version: 1.8.7
|
76
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
77
|
+
none: false
|
78
|
+
requirements:
|
79
|
+
- - ">="
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
hash: 3
|
82
|
+
segments:
|
83
|
+
- 0
|
84
|
+
version: "0"
|
85
|
+
requirements: []
|
86
|
+
|
87
|
+
rubyforge_project:
|
88
|
+
rubygems_version: 1.8.15
|
89
|
+
signing_key:
|
90
|
+
specification_version: 3
|
91
|
+
summary: Utilities to help language learners
|
92
|
+
test_files: []
|
93
|
+
|