text-analysis-utils 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.join(File.dirname(__FILE__), '../lib/document-cache' )
4
+
5
+ def get_text
6
+ if ARGV.empty?
7
+ STDIN.read
8
+ else
9
+ text = ""
10
+ ARGV.each{|filename| File.open(filename){|file| text += file.read}}
11
+ text
12
+ end
13
+ end
14
+
15
+ DocumentCache.add(get_text)
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env ruby1.9.3
2
+
3
+ require 'rubygems'
4
+ require 'colorize'
5
+ require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
6
+ require File.join(File.dirname(__FILE__), '../lib/document-cache' )
7
+
8
+
9
+ def get_text
10
+ if !ARGV.empty?
11
+ text = ""
12
+ ARGV.each {|filename|
13
+ text += File.open(filename, 'r'){|file| file.read}
14
+ }
15
+ text
16
+ else
17
+ STDIN.read
18
+ end
19
+ end
20
+
21
# Asks the user whether they know +word+, showing it highlighted inside a
# short snippet of the surrounding text.
#
# word  - the word as it occurs in +text+; nil is tolerated and skipped
# index - zero-based position of the word within +words+ (progress display)
# words - every word being classified; only its size is used
# text  - the text the context snippet is cut from
# input - IO object the answer is read from (defaults to STDIN)
#
# Returns 'y', 'n' or 'skip'. 'skip' is also returned when the word cannot be
# located in the text or when the input reaches end-of-file.
def ask word, index, words, text, input = STDIN
  # Prefer a whole-word match; fall back to a plain substring search.
  location = word.nil? ? nil : ((text =~ /\b#{Regexp.escape(word)}\b/) || text.index(word))
  if location.nil?
    puts "Skipping word: #{word}"
    return 'skip'
  end

  # The snippet starts up to 15 characters before the word and spans the word
  # plus 30 more characters.
  beginning_of_snippet = [location - 15, 0].max
  snippet = text[beginning_of_snippet, 30 + word.size].gsub(/[\r\n]/, " ")

  loop do
    puts
    puts "------------------------------------------------------------"
    puts "...#{snippet.gsub(word, word.green)}..."
    puts
    puts "Do you know this word? [y or n or skip] (#{index + 1} of #{words.size})"
    STDOUT.write("> ")

    line = input.gets
    # gets returns nil at end-of-file; stop asking instead of crashing.
    return 'skip' if line.nil?

    answer = line.gsub(/[\r\n]/, '')
    return answer if ['y', 'n', 'skip'].include?(answer)
  end
end
43
+
44
# Reduces the words of a text to the ones worth asking the user about.
#
# Words the chest already considers known are dropped; with the "-n" switch,
# words the chest contains at all are dropped too. Only one word is kept per
# stem, and stems made solely of digits, dashes, whitespace, commas and dots
# are discarded.
#
# textual_words - Array of words taken from the text (left unmodified)
#
# Returns an Array of unique words.
def collect_words_from textual_words
  candidates = textual_words.reject{|w| VocabularyChest::is_known?(w)}
  candidates = candidates.reject{|w| VocabularyChest::contains?(w)} if OPTIONS.include? "-n"

  words_by_stem = candidates.inject({}){|hash, w| hash[VocabularyChest::stem w] = w; hash}
  words_by_stem.reject!{|s,w| (s =~ /^[-\d\s,\.]*$/) == 0}

  words_by_stem.values.uniq
end

# Switches recognised by this script. A constant is used because class
# variables (@@options) cannot be accessed at the top level on Ruby 3.0+.
OPTIONS = ARGV.select{|arg| ["-n"].include? arg}
# Whatever remains in ARGV is treated as the list of files to read.
ARGV.reject!{|arg| OPTIONS.include? arg}

text = get_text
textual_words = text.split(" ")
puts "Thanks. Please wait..."

words = collect_words_from textual_words
words.each_with_index {|word, index|
  # Show the word in the form in which it actually occurs in the text.
  match = DocumentCache::extract_matching_words(word, text).first
  answer = ask match, index, words, text
  VocabularyChest::add_to_known_words(word) if answer == 'y'
  VocabularyChest::add_to_unknown_words(word) if answer == 'n'
}

puts "Done."
@@ -0,0 +1,22 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+ require 'colorize'
5
+ require File.join(File.dirname(__FILE__), '../lib/document-cache' )
6
+
7
# Number of example sentences to print; overridden by an argument of the
# form --<digits>, e.g. --5.
count = 1
# Anchor the pattern and require at least one digit so that other arguments
# starting with "--" are not mistaken for a count (which would become 0).
count_param = ARGV.find{|a| a =~ /\A--\d+\z/}
unless count_param.nil?
  count = count_param.sub("--","").to_i
  ARGV.reject!{|a| a == count_param}
end

# All remaining arguments together form the search term.
search = ARGV.join(" ")
matches = DocumentCache.find_examples_for search, count
exit(1) if matches.empty?

puts matches.map{|sentence, tokens|
  colored_sentence = sentence.dup
  # uniq: colouring the same token twice would wrap text that is already coloured.
  tokens.uniq.each{|m| colored_sentence.gsub!(m, m.green) }
  colored_sentence
}.join("\n")
@@ -0,0 +1,21 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.join(File.dirname(__FILE__), '../lib/document-cache' )
4
+ require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
5
+
6
+ text = DocumentCache::documents.inject(""){|text, f| text+= File.open(f){|f|f.read}; text}
7
+
8
+ frequencies = text.split(" ").inject(Hash.new {|hash,key| hash[key] = []}){|hash, w| hash[VocabularyChest::stem(w)] << w; hash }
9
+
10
+ frequencies = frequencies.sort{|a,b| a[1].size <=> b[1].size}.reverse
11
+
12
+ def output frequencies
13
+ STDOUT.sync = true
14
+ frequencies.each{|k,v| puts "#{v.size}\t#{k}\t#{v[0,6].join(",")}#{v.size > 6 ? "..." : ""}"}
15
+ end
16
+
17
+ if ARGV[0] == "--unknown"
18
+ output frequencies.find_all{|k,v| !VocabularyChest::is_known?(v[0])}
19
+ else
20
+ output frequencies
21
+ end
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.join(File.dirname(__FILE__), '../lib/lookup' )
4
+
5
+ if !ARGV.empty?
6
+ puts Lookup::go ARGV
7
+ else
8
+ puts Lookup::go STDIN.read.split("\n")
9
+ end
@@ -0,0 +1,40 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
4
+ require File.join(File.dirname(__FILE__), '../lib/lookup' )
5
+
6
+ def analyse text
7
+ words = text.split(" ")
8
+ known = words.select{|w| VocabularyChest.is_known? w}
9
+ unknown = (words - known)
10
+ return [known.map{|w| VocabularyChest::sanitize w}.uniq, unknown.map{|w| VocabularyChest::sanitize w}.uniq]
11
+ end
12
+
13
+ def output options
14
+ known, unknown = options
15
+ size = known.size + unknown.size
16
+
17
+ puts
18
+ puts "--"
19
+ puts "UNKNOWN WORDS: #{unknown.join(", ")}"
20
+ puts
21
+ puts "DEFINITIONS"
22
+ puts Lookup::go(unknown)
23
+ puts "--"
24
+ puts
25
+ puts "Total number of unknown words: #{unknown.size}"
26
+ puts "Total number of known words: #{known.size}"
27
+ puts "Total number of words: #{size}"
28
+ puts "Percentage of words known: #{'%.2f' % (known.size.to_f / size * 100)}%"
29
+ end
30
+
31
+ if !ARGV.empty?
32
+ ARGV.each {|filename|
33
+ text = File.open(filename,'r'){|file| file.read}
34
+ puts "#{filename}:"
35
+ output(analyse(text))
36
+ }
37
+ else
38
+ text = STDIN.read
39
+ output(analyse(text))
40
+ end
@@ -0,0 +1,28 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.join(File.dirname(__FILE__), '../lib/game' )
4
+
5
# Returns the text to work on: the contents of the first file named on the
# command line, or everything on standard input when no argument is given.
#
# NOTE(review): nothing in the visible part of this script calls this method;
# the script reads its files from ARGV directly.
def get_input
  return STDIN.read if ARGV.empty?

  File.open(ARGV[0]){|f| f.read}
end
11
+
12
+ (puts "Usage: #{$0} <file with words to practice> <file with examples>"; exit(1)) if ARGV.size < 2
13
+
14
+ input = File.open(ARGV.shift){|f| f.read}
15
+ words = input.split("\n").uniq
16
+
17
+ example_sentences = []
18
+ ARGV.each{|filename| example_sentences += File.open(filename).readlines}
19
+ example_sentences.map!{|s| s.chomp}
20
+ example_sentences.reject!{|s| words.find{|w| (s =~ /\b#{Regexp.escape(w)}\b/i) != nil} == nil}
21
+ words.reject!{|w| example_sentences.find{|s| (s =~ /\b#{Regexp.escape(w)}\b/i) != nil} == nil}
22
+
23
+ puts "Playing with #{example_sentences.size} sentences and #{words.size} words."
24
+
25
+ Game.new(words).play{|word|
26
+ sentence = example_sentences.shuffle.find{|s| (s =~ /\b#{Regexp.escape(word)}\b/i) != nil}
27
+ [sentence, $&]
28
+ }
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.join(File.dirname(__FILE__), '../lib/document-cache' )
4
+ require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
5
+ require File.join(File.dirname(__FILE__), '../lib/game' )
6
+
7
+ def get_input
8
+ if !ARGV.empty?
9
+ File.open(ARGV[0]){|f| f.read}
10
+ else
11
+ STDIN.read
12
+ end
13
+ end
14
+
15
+ input = get_input
16
+ words = input.split("\n")
17
+ words.reject!{|w| STDOUT.write("."); STDOUT.flush; DocumentCache.find_examples_for(w).empty?}
18
+ puts
19
+
20
+ Game.new(words).play{ |word|
21
+ matches = DocumentCache.find_examples_for(word, 10).keys
22
+ sentence = matches.sort{|a, b| a.size <=> b.size}.first
23
+ correct_answer = DocumentCache::extract_matching_words(word, sentence).first
24
+ [sentence, correct_answer]
25
+ }
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ text = STDIN.read
4
+ File.open("/tmp/prepared-text", 'w'){|f| f.write(text)}
5
+ exec("classify-new-words /tmp/prepared-text && \
6
+ cache-document /tmp/prepared-text && \
7
+ echo '\nREADABILITY STATISTICS' && \
8
+ readability-of /tmp/prepared-text && \
9
+ percentage-known-of /tmp/prepared-text")
@@ -0,0 +1,44 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+ require 'amatch'
5
+
6
+ def distance w1, w2
7
+ Amatch::Levenshtein.new(w1).match(w2)
8
+ end
9
+
10
+ def analyse text, known_text
11
+ words = words_of(text)
12
+ known_words = words_of(known_text)
13
+
14
+ words.map {|w|
15
+ closest_word, proximity = find_closest_word(w, known_words)
16
+ puts "#{w}\t#{closest_word}\t#{proximity}"
17
+ STDOUT.flush
18
+ }
19
+ end
20
+
21
# Finds the entry of +known_words+ with the smallest distance to +word+.
#
# word        - the word to compare against
# known_words - Array of candidate words
#
# Returns [closest_word, distance], or [nil, nil] when known_words is empty.
def find_closest_word word, known_words
  # Compute each distance once and pick the minimum without a full sort.
  scored = known_words.map{|candidate| [candidate, distance(candidate, word)]}
  scored.min_by{|_candidate, proximity| proximity} || [nil, nil]
end
26
+
27
# Returns the distinct whitespace-separated words of +text+, in order of
# first appearance.
def words_of text
  text.split(" ").uniq
end
30
+
31
+ if ARGV.size < 2
32
+ puts "usage: ./script <new text> <known text>"
33
+ exit 1
34
+ end
35
+
36
+ filename = ARGV.shift
37
+ text = File.open(filename,'r'){|file| file.read}
38
+
39
+ known_text ||= ""
40
+ ARGV.each {|filename|
41
+ known_text += File.open(filename,'r'){|file| file.read}
42
+ }
43
+
44
+ analyse text, known_text
@@ -0,0 +1,54 @@
1
+ #!/usr/bin/env ruby
2
+
3
# Computes readability statistics for +text+.
#
# Words are whitespace-separated tokens; sentences are the non-blank pieces
# between '.', '?' and '!'. Syllables are estimated from the vowel groups
# returned by vowels(word), scaled by 0.9.
#
# Returns a Hash with the keys :words, :sentences, :syllables, :ms (percentage
# of words with three or more vowel groups), :wiener_sachtextformel and
# :grade_level. For an empty text the ratios are NaN.
def analyse text
  tokens = text.split(" ")
  words = tokens.size.to_f
  sentences = text.split(/\.|\?|!/).reject{|s| s.strip.empty?}.size.to_f

  # Count the vowel groups of every word once and reuse the counts below.
  vowel_group_counts = tokens.map{|w| vowels(w).size}
  syllables = vowel_group_counts.inject(0){|total, count| total + count}.to_f * 0.9 # for silent vowels
  words_with_three_or_more_syllables = vowel_group_counts.select{|count| count >= 3}.size
  ms = words_with_three_or_more_syllables.to_f / words * 100

  {:words => words,
   :sentences => sentences,
   :syllables => syllables,
   :ms => ms,
   :wiener_sachtextformel => wiener_sachtextformel(sentences, words, ms),
   :grade_level => grade(sentences, words, syllables)}
end
18
+
19
+ def wiener_sachtextformel sentences, words, ms
20
+ 0.2656 * (words / sentences) + 0.2744 * ms -1.693
21
+ end
22
+
23
+ def grade sentences, words, syllables
24
+ (0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59)
25
+ end
26
+
27
# Splits +w+ at its consonants and returns the non-empty pieces in between,
# i.e. the groups of adjacent non-consonant characters. The number of groups
# serves as a syllable estimate.
#
# 'x' is treated as a consonant like the others; 'y' is deliberately left out
# of the consonant list so that it counts as a vowel.
def vowels w
  w.split(/[bcdfghjklmnpqrstvwxz]/i).reject{|group| group.empty?}
end
30
+
31
# Prints the readability statistics contained in +options+.
#
# options - Hash with :sentences, :words, :syllables, :wiener_sachtextformel
#           and :grade_level, plus an optional :source label that is printed
#           as a heading when present.
def output options
  puts
  puts "#{options[:source]}:" unless options[:source].nil?
  puts "Number of sentences: #{options[:sentences]}"
  puts "Number of words: #{options[:words]}"
  puts "Number of syllables: #{options[:syllables]}"
  puts "Average number of syllables per word: #{ '%.2f' % (options[:syllables] / options[:words])}"
  puts "Average number of words per sentence: #{'%.2f' % (options[:words] / options[:sentences])}"
  puts "Wiener Sachtextformel: #{'%.2f' % options[:wiener_sachtextformel]}"
  puts "Flesch-Kincaid Grade Level: #{'%.2f' % options[:grade_level]}"
  puts
end
45
+
46
+ if !ARGV.empty?
47
+ ARGV.each {|filename|
48
+ text = File.open(filename,'r'){|file| file.read}
49
+ output(analyse(text).merge(:source => filename))
50
+ }
51
+ else
52
+ text = STDIN.read
53
+ output(analyse(text))
54
+ end
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ command =<<EOF
4
+
5
+ total_occurrences=`expr $(frequency-list | awk '{print $1}' | xargs | sed 's/ / + /g')`
6
+ unknown_occurrences=`expr $(frequency-list --unknown | awk '{print $1}' | xargs | sed 's/ / + /g')`
7
+ next_500_occurrences=`expr $(frequency-list --unknown | awk '{print $1}' | head -500 | xargs | sed 's/ / + /g')`
8
+
9
+ echo Total occurrences: $total_occurrences
10
+ echo Unknown occurrences: $unknown_occurrences
11
+ echo Your current vocabulary knowledge covers $(echo "scale=2;($total_occurrences - $unknown_occurrences) / $total_occurrences * 100" | bc -q)% of all occurrences
12
+ echo The next 500 words will bring your cover to $(echo "scale=2;($total_occurrences - $unknown_occurrences + $next_500_occurrences) / $total_occurrences * 100" | bc -q)%
13
+
14
+ EOF
15
+
16
+ system command
@@ -0,0 +1,98 @@
1
+ require 'fileutils.rb'
2
+ require 'rubygems'
3
+ require 'uuid'
4
+
5
+ require File.join(File.dirname(__FILE__), 'vocabulary-chest' )
6
+
7
+ CACHE_DIR = "#{ROOT_DIR}/docs"
8
+
9
+ FileUtils::mkdir_p(ROOT_DIR)
10
+ FileUtils::mkdir_p(CACHE_DIR)
11
+
12
# Keeps documents on disk, one file per document under CACHE_DIR, and
# searches them for example sentences.
module DocumentCache
  # Stores +search+ (the text of a document) in a new cache file named after
  # a freshly generated UUID.
  def self.add search
    filename = "#{CACHE_DIR}/#{UUID.new.generate}"
    File.open(filename,'w'){|f| f.write(search)}
  end

  # Finds the sentences containing a word whose stem equals the stem of
  # +search+.
  #
  # Returns a Hash mapping each cleaned sentence to the sanitized words that
  # matched in it.
  def self.find_matches_by_stemming search, sentences
    token = VocabularyChest::stem(search)
    sentences.inject({}){|hash, s|
      found = s.split(" ").select{|w| VocabularyChest::stem(w) == token}
      hash[clean(s)] = found.map{|f| VocabularyChest::sanitize(f)} unless found.empty?
      hash
    }
  end

  # Finds the sentences containing +search+ verbatim.
  #
  # Returns a Hash mapping each cleaned sentence to [search].
  def self.find_matches_by_grepping search, sentences
    sentences.inject({}){|hash, s|
      hash[clean(s)] = [search] if s.include? search
      hash
    }
  end

  # Collects up to +count+ matching sentences from the given files, trying
  # stem matching on every file first and verbatim matching afterwards.
  #
  # Returns a Hash of sentence => matching words with at most +count+ entries.
  def self.find_matches_in filenames, search, count
    matches = {}

    [:find_matches_by_stemming, :find_matches_by_grepping].each{|matcher|
      filenames.each {|filename|
        sentences = File.read(filename).split(/[\.?!\n]/)
        matches.merge!(send(matcher, search, sentences))

        # Drop the oldest entries when a file yielded more than requested.
        matches.shift while matches.size > count
        return matches if matches.size == count
      }
    }

    matches
  end

  # Returns the paths of all cached documents.
  def self.documents
    Dir["#{CACHE_DIR}/*"]
  end

  # Returns up to +count+ example sentences for +search+ taken from the
  # cached documents, as a Hash of sentence => matching words.
  def self.find_examples_for search, count=1
    find_matches_in documents, search, count
  end

  # Strips surrounding whitespace from +sentence+ and appends a full stop.
  def self.clean(sentence)
    sentence.strip + "."
  end

  # Returns the words of +sentence+ that match +search+, by stem if possible
  # and verbatim otherwise. Returns an empty Array when nothing matches, so
  # callers can safely call +first+ on the result.
  def self.extract_matching_words search, sentence
    stemmed = find_matches_by_stemming(search, [sentence])
    return stemmed.values.first unless stemmed.empty?

    find_matches_by_grepping(search, [sentence]).values.first || []
  end

  # Returns [word, count] pairs for every word occurring at least twice in
  # the cached documents, most frequent first.
  def self.frequency_list
    counts = cached_words.inject(Hash.new(0)) {|h,w| h[w] += 1; h }
    counts.reject!{|word, count| count < 2}
    counts.sort_by {|k,v| v}.reverse
  end

  # Returns [stem, count] pairs for every stem occurring at least twice in
  # the cached documents, most frequent first.
  def self.stemmed_frequency_list
    stems = cached_words.map{|w| VocabularyChest::stem(w)}
    counts = stems.inject(Hash.new(0)) {|h,stem| h[stem] += 1; h }
    counts.reject!{|stem, count| count < 2}
    counts.sort_by {|k,v| v}.reverse
  end

  # Returns every whitespace-separated word of every cached document.
  # Documents are joined with a newline so that the last word of one document
  # and the first word of the next are not fused together; File.read closes
  # each file after reading it.
  def self.cached_words
    documents.map{|f| File.read(f)}.join("\n").split(" ")
  end
  private_class_method :cached_words
end
89
+
90
+ if __FILE__ == $0
91
+ puts "The document cache contains #{DocumentCache.documents.size} documents."
92
+ puts
93
+ puts "Here are the 10 most frequent stems:"
94
+ DocumentCache.stemmed_frequency_list[0,10].each{|stem, count| puts "#{count} #{stem}"}
95
+ puts
96
+ puts "Here are the 10 most frequent words:"
97
+ DocumentCache.frequency_list[0,10].each{|word, count| puts "#{count} #{word}"}
98
+ end
@@ -0,0 +1,122 @@
1
+ require 'rubygems'
2
+ require 'amatch'
3
+ require 'colorize'
4
+
5
require 'shellwords' # standard library; quotes words handed to the `dict` command

# A fill-in-the-blank vocabulary game played on the terminal.
#
# The caller supplies the words to practise and, through the block given to
# #play, a sentence and the expected answer for each word.
class Game
  # words - Array of words to practise
  # input - IO object answers are read from (defaults to STDIN)
  def initialize words, input = STDIN
    @words = words
    @input = input
    @results = [] # one entry per judged answer: 1 for a hit, 0 for a miss
    @turn = 0
  end

  # Returns +word+ together with up to four other words, in random order.
  def pick_choices_for word
    others = @words.reject{|w| w == word}
    (others.shuffle[0,4] + [word]).shuffle
  end

  # Returns the percentage of hits among the most recent results, looking
  # back at most as many results as there are words or turns played
  # (whichever is smaller). Returns 0 before the first turn.
  def hit_rate
    number_of_turns_we_remember = [@words.size, @turn].min
    return 0 if number_of_turns_we_remember == 0

    recent_results = @results.last(number_of_turns_we_remember)
    hits = recent_results.inject(0){|sum, value| sum + value}
    misses = recent_results.size - hits

    rate = 100 - misses / number_of_turns_we_remember.to_f * 100
    rate >= 0 ? rate : 0
  end

  # Replaces German umlauts and sharp s with their ASCII transliterations.
  def romanize w
    w.gsub("ä", "ae").gsub("ö", "oe").gsub("ü", "ue").gsub("ß", "ss")
  end

  # Returns the Levenshtein distance between the romanized, downcased forms
  # of the two words.
  def proximity one, other
    Amatch::Levenshtein.new(romanize(one).downcase).match(romanize(other).downcase)
  end

  # Prompts until the answer is '?' or within an edit distance of 1 of
  # +correct_answer+, recording a miss for every rejected attempt.
  #
  # Returns the accepted answer, or nil when the input reached end-of-file.
  def get_an_answer_for correct_answer
    answer = read_answer

    while !answer.nil? && answer != '?' && proximity(answer, correct_answer) > 1
      @results << 0
      display_definition(answer)
      puts
      puts "Nope. Try again.".red
      puts
      answer = read_answer
    end

    answer
  end

  # Looks +word+ up with the external `dict` command and returns the first
  # lines of its output joined by " -- " (an empty String when there is no
  # output).
  #
  # The word is shell-escaped because it comes from text files and typed
  # answers and is interpolated into a shell command line.
  #
  # NOTE(review): the whitespace inside the grep pattern and the gsub argument
  # is reproduced as shown in the diff; verify it against the upstream file,
  # since the rendering may have collapsed runs of spaces.
  def fetch_definition word
    definitions = `dict #{Shellwords.escape(word)} 2>/dev/null | grep ' ' | head -2`.chomp.gsub(" ","").split(/[\r\n]/)
    definitions.uniq.join(" -- ")
  end

  # Prints the definition of +word+ if one could be fetched.
  def display_definition word
    definition = fetch_definition(word)
    puts "\n#{word.blue} means: #{definition}" if !definition.empty?
  end

  # Prints the game status, the sentence with +word+ blanked out and the
  # available choices.
  def show_question_for word, sentence
    rate = hit_rate
    color = rate < 75 ? :red : (rate < 90 ? :yellow : :green)

    # Blank whole-word occurrences (ignoring case) when there are any,
    # otherwise fall back to blanking plain substring occurrences.
    whole_word = /\b#{Regexp.escape(word)}\b/i
    blanked = (sentence =~ whole_word) ? sentence.gsub(whole_word, "______") : sentence.gsub(word, "______")

    puts
    puts
    puts
    puts "Turn #{@turn}".blue
    puts "#{@words.size} words".blue
    puts "Hit rate: #{'%i' % rate}%".send(color)
    puts "------------------------------------------------------------".blue
    puts
    puts blanked.strip
    puts
    puts "Choices: [" + " #{pick_choices_for(word).join(" - ")} ".blue + "]"
    puts
  end

  # Prints the closing message.
  def end_game
    puts
    puts
    puts "#{"Congratulations!".green} Here is a star for you: #{'*'.yellow}"
    puts
  end

  # Judges +answer+ against +correct_answer+, records the result and prints
  # feedback together with the definition of the correct answer.
  def respond_to_answer answer, correct_answer
    puts
    if answer == '?'
      @results << 0
      puts "The answer was: #{correct_answer.red}."
      display_definition(correct_answer)
      sleep 2
    elsif proximity(answer, correct_answer) > 0
      # Close enough to be accepted, but not spelled exactly right.
      @results << 1
      puts "Sort of... the answer was: #{correct_answer.yellow}."
      display_definition(correct_answer)
      sleep 1
    else
      @results << 1
      puts "Correct! The answer was: #{correct_answer.green}."
      display_definition(correct_answer)
    end
  end

  # Plays rounds over the shuffled words until every word has had a turn and
  # the hit rate is at least 95%.
  #
  # The block receives a word and must return [sentence, correct_answer];
  # words for which either is nil are reported and passed over.
  #
  # The game also stops when the input reaches end-of-file, or when a whole
  # round produced nothing to play with (including an empty word list), so
  # that it cannot run forever. Returns nil.
  def play &block
    loop do
      played_any = false

      @words.shuffle.each{|word|
        @turn += 1

        sentence, correct_answer = yield word
        (puts "Could not find anything to play with for word #{word}."; next) if sentence.nil? or correct_answer.nil?
        played_any = true

        show_question_for(correct_answer, sentence)
        answer = get_an_answer_for(correct_answer)
        return if answer.nil? # end of input

        respond_to_answer(answer, correct_answer)

        (end_game; return) if @turn >= @words.size and (hit_rate >= 95)
      }

      return unless played_any
    end
  end

  private

  # Prints the prompt and reads one line from the input.
  # Returns the line without its line ending, or nil at end-of-file.
  def read_answer
    STDOUT.write("> ")
    line = @input.gets
    line && line.chomp
  end
end
122
+
@@ -0,0 +1,15 @@
1
require 'shellwords' # standard library; quotes words handed to the `dict` command

# Looks up word definitions with the external `dict` command.
module Lookup

  # Returns the first lines of the `dict` output for +word+ joined by " -- "
  # (an empty String when there is no output).
  #
  # The word is shell-escaped because it comes from command-line arguments or
  # input text and is interpolated into a shell command line.
  #
  # NOTE(review): the whitespace inside the grep pattern and the gsub argument
  # is reproduced as shown in the diff; verify it against the upstream file,
  # since the rendering may have collapsed runs of spaces.
  def self.fetch_definition word
    definitions = `dict #{Shellwords.escape(word)} 2>/dev/null | grep ' ' | head -2`.chomp.gsub(" ","").split(/[\r\n]/)
    definitions.uniq.join(" -- ")
  end

  # Removes commas and full stops from +word+.
  def self.sanitize word
    word.gsub(/[,\.]/,"")
  end

  # Returns one "word<TAB>definition" line per word, joined by newlines.
  def self.go words
    words.map{|w| sanitize w}.map{|w| "#{w}\t#{fetch_definition w}"}.join("\n")
  end
end
@@ -0,0 +1,4 @@
1
+ require File.join(File.dirname(__FILE__), 'vocabulary-chest' )
2
+ require File.join(File.dirname(__FILE__), 'document-cache' )
3
+ require File.join(File.dirname(__FILE__), 'game' )
4
+
@@ -0,0 +1,64 @@
1
+ # encoding: utf-8
2
+ require 'fileutils.rb'
3
+ require 'rubygems'
4
+ require 'lingua/stemmer'
5
+
6
+ ROOT_DIR = File.expand_path(ENV['chest_location'] || "~/.vocabulary-chest")
7
+ KNOWN_FILE = "#{ROOT_DIR}/known"
8
+ UNKNOWN_FILE = "#{ROOT_DIR}/unknown"
9
+
10
+ FileUtils::mkdir_p(ROOT_DIR)
11
+ FileUtils.touch(KNOWN_FILE)
12
+ FileUtils.touch(UNKNOWN_FILE)
13
+
14
+ module VocabularyChest
15
+ @known_file = File.open(KNOWN_FILE,'a')
16
+ @unknown_file = File.open(UNKNOWN_FILE,'a')
17
+ @known_words = nil
18
+ @unknown_words = nil
19
+ @stemmer= Lingua::Stemmer.new(:language => "de")
20
+
21
+ at_exit {@known_file.close}
22
+ at_exit {@unknown_file.close}
23
+
24
+ def self.known_words
25
+ @known_words ||= File.open(KNOWN_FILE,'r'){|f|f.readlines}.collect{|line| line.chomp}
26
+ end
27
+
28
+ def self.unknown_words
29
+ @unknown_words ||= File.open(UNKNOWN_FILE,'r'){|f|f.readlines}.collect{|line| line.chomp}
30
+ end
31
+
32
+ def self.add_to_known_words word
33
+ @known_file.puts(stem word)
34
+ @known_file.flush
35
+ end
36
+
37
+ def self.add_to_unknown_words word
38
+ @unknown_file.puts(stem word)
39
+ @unknown_file.flush
40
+ end
41
+
42
+ def self.contains? word
43
+ stemmed_word = stem word
44
+ known_words.include?(stemmed_word) or unknown_words.include?(stemmed_word)
45
+ end
46
+
47
+ def self.is_known? word
48
+ known_words.include?(stem(word)) or sanitize(word).empty? or (sanitize(word) =~ /^[-\d]*$/) != nil
49
+ end
50
+
51
+ def self.stem word
52
+ @stemmer.stem(sanitize word).downcase
53
+ end
54
+
55
+ def self.sanitize word
56
+ word.gsub(/[,\"\.:;()?!„“]/,"")
57
+ end
58
+ end
59
+
60
+ if __FILE__ == $0
61
+ known = VocabularyChest::known_words
62
+ unknown = VocabularyChest::unknown_words
63
+ puts "The chest contains #{known.size} known words."
64
+ end
metadata ADDED
@@ -0,0 +1,93 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: text-analysis-utils
3
+ version: !ruby/object:Gem::Version
4
+ hash: 9
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 1
9
+ version: "0.1"
10
+ platform: ruby
11
+ authors:
12
+ - Matt
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2012-06-21 00:00:00 Z
18
+ dependencies: []
19
+
20
+ description:
21
+ email:
22
+ executables:
23
+ - cache-document
24
+ - classify-new-words
25
+ - find-examples-for
26
+ - frequency-list
27
+ - lookup
28
+ - percentage-known-of
29
+ - play-with-blanks
30
+ - play-with-examples
31
+ - prepare-text
32
+ - proximity-of-words
33
+ - readability-of
34
+ - vocabulary-coverage
35
+ extensions: []
36
+
37
+ extra_rdoc_files: []
38
+
39
+ files:
40
+ - lib/text-analysis-utils.rb
41
+ - lib/document-cache.rb
42
+ - lib/vocabulary-chest.rb
43
+ - lib/game.rb
44
+ - lib/lookup.rb
45
+ - bin/cache-document
46
+ - bin/classify-new-words
47
+ - bin/find-examples-for
48
+ - bin/frequency-list
49
+ - bin/lookup
50
+ - bin/percentage-known-of
51
+ - bin/play-with-blanks
52
+ - bin/play-with-examples
53
+ - bin/prepare-text
54
+ - bin/proximity-of-words
55
+ - bin/readability-of
56
+ - bin/vocabulary-coverage
57
+ homepage: http://github.com/matstc/text-analysis-utils
58
+ licenses: []
59
+
60
+ post_install_message:
61
+ rdoc_options: []
62
+
63
+ require_paths:
64
+ - lib
65
+ required_ruby_version: !ruby/object:Gem::Requirement
66
+ none: false
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ hash: 57
71
+ segments:
72
+ - 1
73
+ - 8
74
+ - 7
75
+ version: 1.8.7
76
+ required_rubygems_version: !ruby/object:Gem::Requirement
77
+ none: false
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ hash: 3
82
+ segments:
83
+ - 0
84
+ version: "0"
85
+ requirements: []
86
+
87
+ rubyforge_project:
88
+ rubygems_version: 1.8.15
89
+ signing_key:
90
+ specification_version: 3
91
+ summary: Utilities to help language learners
92
+ test_files: []
93
+