text-analysis-utils 0.1

bin/cache-document ADDED
@@ -0,0 +1,15 @@
+ #!/usr/bin/env ruby
+
+ require File.join(File.dirname(__FILE__), '../lib/document-cache' )
+
+ def get_text
+   if ARGV.empty?
+     STDIN.read
+   else
+     text = ""
+     ARGV.each{|filename| File.open(filename){|file| text += file.read}}
+     text
+   end
+ end
+
+ DocumentCache.add(get_text)
bin/classify-new-words ADDED
@@ -0,0 +1,68 @@
+ #!/usr/bin/env ruby
+
+ require 'rubygems'
+ require 'colorize'
+ require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
+ require File.join(File.dirname(__FILE__), '../lib/document-cache' )
+
+
+ def get_text
+   if !ARGV.empty?
+     text = ""
+     ARGV.each {|filename|
+       text += File.open(filename, 'r'){|file| file.read}
+     }
+     text
+   else
+     STDIN.read
+   end
+ end
+
+ def ask word, index, words, text
+   location = (text =~ /\b#{Regexp.escape(word)}\b/)
+   location = text.index(word) if location.nil?
+   (puts "Skipping word: #{word}"; return 'skip') if location.nil?
+
+   beginning_of_snippet = location - 15 < 0 ? 0 : location - 15
+   snippet = text[beginning_of_snippet, 30 + word.size].gsub(/[\r\n]/," ")
+
+   answer = ''
+   while !['y','n','skip'].include?(answer)
+     puts
+     puts "------------------------------------------------------------"
+     puts "...#{snippet.gsub(word, word.green)}..."
+     puts
+     puts "Do you know this word? [y or n or skip] (#{index + 1} of #{words.size})"
+     STDOUT.write("> ")
+
+     answer = STDIN.gets.gsub(/[\r\n]/, '')
+   end
+   answer
+ end
+
+ def collect_words_from textual_words
+   textual_words.reject!{|w| VocabularyChest::is_known?(w)}
+   textual_words.reject!{|w| VocabularyChest::contains?(w)} if @@options.include? "-n"
+
+   words_by_stem = textual_words.inject({}){|hash, w| hash[VocabularyChest::stem(w)] = w; hash}
+   words_by_stem.reject!{|s,w| (s =~ /^[-\d\s,\.]*$/) == 0}
+
+   words_by_stem.values.uniq
+ end
+
+ @@options = ARGV.select{|arg| ["-n"].include? arg}
+ ARGV.reject!{|arg| @@options.include? arg}
+
+ text = get_text
+ textual_words = text.split(" ").collect{|w| w.chomp}
+ puts "Thanks. Please wait..."
+
+ words = collect_words_from textual_words
+ words.each_with_index {|word, index|
+   match = DocumentCache::extract_matching_words(word, text).first
+   answer = ask match, index, words, text
+   VocabularyChest::add_to_known_words(word) if answer == 'y'
+   VocabularyChest::add_to_unknown_words(word) if answer == 'n'
+ }
+
+ puts "Done."
bin/find-examples-for ADDED
@@ -0,0 +1,22 @@
+ #!/usr/bin/env ruby
+
+ require 'rubygems'
+ require 'colorize'
+ require File.join(File.dirname(__FILE__), '../lib/document-cache' )
+
+ count = 1
+ count_param = ARGV.find{|a| (a =~ /\A--\d+\z/) == 0}
+ if !count_param.nil?
+   count = count_param.sub("--","").to_i
+   ARGV.reject!{|a| a == count_param}
+ end
+
+ search = ARGV.join(" ")
+ matches = DocumentCache.find_examples_for search, count
+ exit(1) if matches.empty?
+
+ puts matches.map{|sentence, tokens|
+   colored_sentence = sentence.dup
+   tokens.each{|m| colored_sentence.gsub!(m, m.green) }
+   colored_sentence
+ }.join("\n")
bin/frequency-list ADDED
@@ -0,0 +1,21 @@
+ #!/usr/bin/env ruby
+
+ require File.join(File.dirname(__FILE__), '../lib/document-cache' )
+ require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
+
+ text = DocumentCache::documents.inject(""){|acc, f| acc + File.open(f){|file| file.read}}
+
+ frequencies = text.split(" ").inject(Hash.new {|hash,key| hash[key] = []}){|hash, w| hash[VocabularyChest::stem(w)] << w; hash }
+
+ frequencies = frequencies.sort{|a,b| a[1].size <=> b[1].size}.reverse
+
+ def output frequencies
+   STDOUT.sync = true
+   frequencies.each{|k,v| puts "#{v.size}\t#{k}\t#{v[0,6].join(",")}#{v.size > 6 ? "..." : ""}"}
+ end
+
+ if ARGV[0] == "--unknown"
+   output frequencies.find_all{|k,v| !VocabularyChest::is_known?(v[0])}
+ else
+   output frequencies
+ end
bin/lookup ADDED
@@ -0,0 +1,9 @@
+ #!/usr/bin/env ruby
+
+ require File.join(File.dirname(__FILE__), '../lib/lookup' )
+
+ if !ARGV.empty?
+   puts Lookup::go ARGV
+ else
+   puts Lookup::go STDIN.read.split("\n")
+ end
bin/percentage-known-of ADDED
@@ -0,0 +1,40 @@
+ #!/usr/bin/env ruby
+
+ require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
+ require File.join(File.dirname(__FILE__), '../lib/lookup' )
+
+ def analyse text
+   words = text.split(" ")
+   known = words.select{|w| VocabularyChest.is_known? w}
+   unknown = (words - known)
+   return [known.map{|w| VocabularyChest::sanitize w}.uniq, unknown.map{|w| VocabularyChest::sanitize w}.uniq]
+ end
+
+ def output options
+   known, unknown = options
+   size = known.size + unknown.size
+
+   puts
+   puts "--"
+   puts "UNKNOWN WORDS: #{unknown.join(", ")}"
+   puts
+   puts "DEFINITIONS"
+   puts Lookup::go(unknown)
+   puts "--"
+   puts
+   puts "Total number of unknown words: #{unknown.size}"
+   puts "Total number of known words: #{known.size}"
+   puts "Total number of words: #{size}"
+   puts "Percentage of words known: #{'%.2f' % (known.size.to_f / size * 100)}%"
+ end
+
+ if !ARGV.empty?
+   ARGV.each {|filename|
+     text = File.open(filename,'r'){|file| file.read}
+     puts "#{filename}:"
+     output(analyse(text))
+   }
+ else
+   text = STDIN.read
+   output(analyse(text))
+ end
bin/play-with-blanks ADDED
@@ -0,0 +1,21 @@
+ #!/usr/bin/env ruby
+
+ require File.join(File.dirname(__FILE__), '../lib/game' )
+
+ (puts "Usage: #{$0} <file with words to practice> <file with examples>"; exit(1)) if ARGV.size < 2
+
+ input = File.open(ARGV.shift){|f| f.read}
+ words = input.split("\n").uniq
+
+ example_sentences = []
+ ARGV.each{|filename| example_sentences += File.open(filename).readlines}
+ example_sentences.map!{|s| s.chomp}
+ example_sentences.reject!{|s| words.find{|w| (s =~ /\b#{Regexp.escape(w)}\b/i) != nil} == nil}
+ words.reject!{|w| example_sentences.find{|s| (s =~ /\b#{Regexp.escape(w)}\b/i) != nil} == nil}
+
+ puts "Playing with #{example_sentences.size} sentences and #{words.size} words."
+
+ Game.new(words).play{|word|
+   sentence = example_sentences.shuffle.find{|s| s =~ /\b#{Regexp.escape(word)}\b/i}
+   [sentence, sentence && sentence[/\b#{Regexp.escape(word)}\b/i]]
+ }
bin/play-with-examples ADDED
@@ -0,0 +1,25 @@
+ #!/usr/bin/env ruby
+
+ require File.join(File.dirname(__FILE__), '../lib/document-cache' )
+ require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
+ require File.join(File.dirname(__FILE__), '../lib/game' )
+
+ def get_input
+   if !ARGV.empty?
+     File.open(ARGV[0]){|f| f.read}
+   else
+     STDIN.read
+   end
+ end
+
+ input = get_input
+ words = input.split("\n")
+ words.reject!{|w| STDOUT.write("."); STDOUT.flush; DocumentCache.find_examples_for(w).empty?}
+ puts
+
+ Game.new(words).play{ |word|
+   matches = DocumentCache.find_examples_for(word, 10).keys
+   sentence = matches.sort{|a, b| a.size <=> b.size}.first
+   correct_answer = DocumentCache::extract_matching_words(word, sentence).first
+   [sentence, correct_answer]
+ }
bin/prepare-text ADDED
@@ -0,0 +1,9 @@
+ #!/usr/bin/env ruby
+
+ text = STDIN.read
+ File.open("/tmp/prepared-text", 'w'){|f| f.write(text)}
+ exec("classify-new-words /tmp/prepared-text && \
+      cache-document /tmp/prepared-text && \
+      echo '\nREADABILITY STATISTICS' && \
+      readability-of /tmp/prepared-text && \
+      percentage-known-of /tmp/prepared-text")
bin/proximity-of-words ADDED
@@ -0,0 +1,44 @@
+ #!/usr/bin/env ruby
+
+ require 'rubygems'
+ require 'amatch'
+
+ def distance w1, w2
+   Amatch::Levenshtein.new(w1).match(w2)
+ end
+
+ def analyse text, known_text
+   words = words_of(text)
+   known_words = words_of(known_text)
+
+   words.each {|w|
+     closest_word, proximity = find_closest_word(w, known_words)
+     puts "#{w}\t#{closest_word}\t#{proximity}"
+     STDOUT.flush
+   }
+ end
+
+ def find_closest_word word, known_words
+   closest_word = known_words.sort{|w1, w2| distance(w1, word) <=> distance(w2, word)}.first
+
+   [closest_word, distance(closest_word, word)]
+ end
+
+ def words_of text
+   text.split(" ").uniq
+ end
+
+ if ARGV.size < 2
+   puts "usage: #{$0} <new text> <known text>"
+   exit 1
+ end
+
+ filename = ARGV.shift
+ text = File.open(filename,'r'){|file| file.read}
+
+ known_text = ""
+ ARGV.each {|filename|
+   known_text += File.open(filename,'r'){|file| file.read}
+ }
+
+ analyse text, known_text
bin/readability-of ADDED
@@ -0,0 +1,54 @@
+ #!/usr/bin/env ruby
+
+ def analyse text
+   words = text.split(" ").size.to_f
+   sentences = text.split(/\.|\?|!/).reject{|s| s.strip.empty?}.size.to_f
+   syllables = text.split(" ").inject([]){|sum, w| sum + vowels(w)}
+   syllables = syllables.size.to_f * 0.9 # for silent vowels
+   words_with_three_or_more_syllables = text.split(" ").select{|w| vowels(w).size >= 3}
+   ms = words_with_three_or_more_syllables.size.to_f / text.split(" ").size.to_f * 100
+
+   stats = {:words => words,
+            :sentences => sentences,
+            :syllables => syllables,
+            :ms => ms,
+            :wiener_sachtextformel => wiener_sachtextformel(sentences, words, ms),
+            :grade_level => grade(sentences, words, syllables)}
+ end
+
+ def wiener_sachtextformel sentences, words, ms
+   0.2656 * (words / sentences) + 0.2744 * ms - 1.693
+ end
+
+ def grade sentences, words, syllables
+   (0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59)
+ end
+
+ def vowels w
+   w.split(/b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|z/i).reject{|s|s.empty?}
+ end
+
+ def output options
+   puts
+   if !options[:source].nil?
+     puts "#{options[:source]}:"
+   end
+   puts "Number of sentences: #{options[:sentences]}"
+   puts "Number of words: #{options[:words]}"
+   puts "Number of syllables: #{options[:syllables]}"
+   puts "Average number of syllables per word: #{ '%.2f' % (options[:syllables] / options[:words])}"
+   puts "Average number of words per sentence: #{'%.2f' % (options[:words] / options[:sentences])}"
+   puts "Wiener Sachtextformel: #{'%.2f' % options[:wiener_sachtextformel]}"
+   puts "Flesch-Kincaid Grade Level: #{'%.2f' % options[:grade_level]}"
+   puts
+ end
+
+ if !ARGV.empty?
+   ARGV.each {|filename|
+     text = File.open(filename,'r'){|file| file.read}
+     output(analyse(text).merge(:source => filename))
+   }
+ else
+   text = STDIN.read
+   output(analyse(text))
+ end
bin/vocabulary-coverage ADDED
@@ -0,0 +1,16 @@
+ #!/usr/bin/env ruby
+
+ command =<<EOF
+
+ total_occurrences=`expr $(frequency-list | awk '{print $1}' | xargs | sed 's/ / + /g')`
+ unknown_occurrences=`expr $(frequency-list --unknown | awk '{print $1}' | xargs | sed 's/ / + /g')`
+ next_500_occurrences=`expr $(frequency-list --unknown | awk '{print $1}' | head -500 | xargs | sed 's/ / + /g')`
+
+ echo Total occurrences: $total_occurrences
+ echo Unknown occurrences: $unknown_occurrences
+ echo Your current vocabulary knowledge covers $(echo "scale=2;($total_occurrences - $unknown_occurrences) / $total_occurrences * 100" | bc -q)% of all occurrences
+ echo The next 500 words will bring your coverage to $(echo "scale=2;($total_occurrences - $unknown_occurrences + $next_500_occurrences) / $total_occurrences * 100" | bc -q)%
+
+ EOF
+
+ system command
lib/document-cache.rb ADDED
@@ -0,0 +1,98 @@
+ require 'fileutils.rb'
+ require 'rubygems'
+ require 'uuid'
+
+ require File.join(File.dirname(__FILE__), 'vocabulary-chest' )
+
+ CACHE_DIR = "#{ROOT_DIR}/docs"
+
+ FileUtils::mkdir_p(ROOT_DIR)
+ FileUtils::mkdir_p(CACHE_DIR)
+
+ module DocumentCache
+   def self.add contents
+     filename = "#{CACHE_DIR}/#{UUID.new.generate}"
+     File.open(filename,'w'){|f| f.write(contents)}
+   end
+
+   def self.find_matches_by_stemming search, sentences
+     token = VocabularyChest::stem(search)
+     sentences.inject({}){|hash, s|
+       words = s.split(" ")
+       found = words.select{|w| VocabularyChest::stem(w) == token}
+       hash[clean(s)] = found.map{|f| VocabularyChest::sanitize(f)} if !found.empty?
+       hash
+     }
+   end
+
+   def self.find_matches_by_grepping search, sentences
+     sentences.inject({}){|hash, s|
+       hash[clean(s)] = [search] if s.include? search
+       hash
+     }
+   end
+
+   def self.find_matches_in filenames, search, count
+     matches = {}
+
+     [:find_matches_by_stemming, :find_matches_by_grepping].each{|matcher|
+       filenames.each {|filename|
+         File.open(filename){|file|
+           contents = file.read
+           sentences = contents.split(/[\.?!\n]/)
+           matches.merge!(self.send(matcher, search, sentences))
+
+           matches.shift until matches.size <= count if matches.size > count
+           return matches if matches.size == count
+         }
+       }
+     }
+
+     matches
+   end
+
+   def self.documents
+     Dir["#{CACHE_DIR}/*"]
+   end
+
+   def self.find_examples_for search, count=1
+     find_matches_in documents, search, count
+   end
+
+   def self.clean(sentence)
+     sentence.strip + "."
+   end
+
+   def self.extract_matching_words search, sentence
+     matches = find_matches_by_stemming(search, [sentence])
+     return matches.values.first if !matches.empty?
+     return find_matches_by_grepping(search, [sentence]).values.first
+   end
+
+   def self.frequency_list
+     text = ""
+     documents.each{|f| text += File.read(f) }
+     counts = text.split(" ").inject(Hash.new(0)) {|h,w| h[w] += 1; h }
+     counts.reject!{|word, count| count < 2}
+     counts.sort_by {|k,v| v}.reverse
+   end
+
+   def self.stemmed_frequency_list
+     text = ""
+     documents.each{|f| text += File.read(f) }
+     stems = text.split(" ").map{|w| VocabularyChest::stem(w)}
+     counts = stems.inject(Hash.new(0)) {|h,stem| h[stem] += 1; h }
+     counts.reject!{|stem, count| count < 2}
+     counts.sort_by {|k,v| v}.reverse
+   end
+ end
+
+ if __FILE__ == $0
+   puts "The document cache contains #{DocumentCache.documents.size} documents."
+   puts
+   puts "Here are the 10 most frequent stems:"
+   DocumentCache.stemmed_frequency_list[0,10].each{|stem, count| puts "#{count} #{stem}"}
+   puts
+   puts "Here are the 10 most frequent words:"
+   DocumentCache.frequency_list[0,10].each{|word, count| puts "#{count} #{word}"}
+ end
lib/game.rb ADDED
@@ -0,0 +1,122 @@
+ require 'rubygems'
+ require 'amatch'
+ require 'colorize'
+
+ class Game
+   def initialize words
+     @words = words
+     @results = []
+     @turn = 0
+   end
+
+   def pick_choices_for word
+     others = @words.reject{|w| w == word}
+     choices = (others.shuffle[0,4] + [word])
+     choices.shuffle
+   end
+
+   def hit_rate
+     number_of_turns_we_remember = [@words.size, @turn].min
+     recent_results = @results.last(number_of_turns_we_remember)
+     hits = recent_results.inject(0){|sum, value| sum + value}
+     misses = recent_results.size - hits
+
+     rate = 100 - misses / number_of_turns_we_remember.to_f * 100
+     rate >= 0 ? rate : 0
+   end
+
+   def romanize w
+     w.gsub("ä", "ae").gsub("ö", "oe").gsub("ü", "ue").gsub("ß", "ss")
+   end
+
+   def proximity one, other
+     Amatch::Levenshtein.new(romanize(one).downcase).match(romanize(other).downcase)
+   end
+
+   def get_an_answer_for correct_answer
+     STDOUT.write("> ")
+     answer = STDIN.gets.chomp
+
+     while answer != '?' and proximity(answer, correct_answer) > 1
+       @results << 0
+       display_definition(answer)
+       puts
+       puts "Nope. Try again.".red
+       puts
+       STDOUT.write("> ")
+       answer = STDIN.gets.chomp
+     end
+
+     answer
+   end
+
+   def fetch_definition word
+     definitions = `dict "#{word}" 2>/dev/null | grep ' ' | head -2`.chomp.gsub(" ","").split(/[\r\n]/)
+     definitions.uniq.join(" -- ")
+   end
+
+   def display_definition word
+     definition = fetch_definition(word)
+     puts "\n#{word.blue} means: #{definition}" if !definition.empty?
+   end
+
+   def show_question_for word, sentence
+     puts
+     puts
+     puts
+     puts "Turn #{@turn}".blue
+     puts "#{@words.size} words".blue
+     color = hit_rate < 75 ? :red : (hit_rate < 90 ? :yellow : :green)
+     puts "Hit rate: #{'%i' % hit_rate}%".send(color)
+     puts "------------------------------------------------------------".blue
+     puts
+     puts ((sentence =~ /\b#{Regexp.escape(word)}\b/i) != nil ? sentence.gsub(/\b#{Regexp.escape(word)}\b/i, "______") : sentence.gsub(word, "______")).strip
+     puts
+     puts "Choices: [" + " #{pick_choices_for(word).join(" - ")} ".blue + "]"
+     puts
+   end
+
+   def end_game
+     puts
+     puts
+     puts "#{"Congratulations!".green} Here is a star for you: #{'*'.yellow}"
+     puts
+   end
+
+   def respond_to_answer answer, correct_answer
+     puts
+     if answer == '?'
+       @results << 0
+       puts "The answer was: #{correct_answer.red}."
+       display_definition(correct_answer)
+       sleep 2
+     elsif proximity(answer, correct_answer) > 0
+       @results << 1
+       puts "Sort of... the answer was: #{correct_answer.yellow}."
+       display_definition(correct_answer)
+       sleep 1
+     else
+       @results << 1
+       puts "Correct! The answer was: #{correct_answer.green}."
+       display_definition(correct_answer)
+     end
+   end
+
+   def play &block
+     @words.shuffle.each{|word|
+       @turn += 1
+
+       sentence, correct_answer = yield word
+       (puts "Could not find anything to play with for word #{word}."; next) if sentence.nil? or correct_answer.nil?
+
+       show_question_for(correct_answer, sentence)
+       answer = get_an_answer_for(correct_answer)
+       respond_to_answer(answer, correct_answer)
+
+       (end_game; return) if @turn >= @words.size and (hit_rate >= 95)
+     }
+
+     play(&block)
+   end
+ end
+
lib/lookup.rb ADDED
@@ -0,0 +1,15 @@
+ module Lookup
+
+   def self.fetch_definition word
+     definitions = `dict "#{word}" 2>/dev/null | grep ' ' | head -2`.chomp.gsub(" ","").split(/[\r\n]/)
+     definitions.uniq.join(" -- ")
+   end
+
+   def self.sanitize word
+     word.gsub(/[,\.]/,"")
+   end
+
+   def self.go words
+     words.map{|w| sanitize w}.map{|w| "#{w}\t#{fetch_definition w}"}.join("\n")
+   end
+ end
lib/text-analysis-utils.rb ADDED
@@ -0,0 +1,4 @@
+ require File.join(File.dirname(__FILE__), 'vocabulary-chest' )
+ require File.join(File.dirname(__FILE__), 'document-cache' )
+ require File.join(File.dirname(__FILE__), 'game' )
+
lib/vocabulary-chest.rb ADDED
@@ -0,0 +1,65 @@
+ # encoding: utf-8
+ require 'fileutils.rb'
+ require 'rubygems'
+ require 'lingua/stemmer'
+
+ ROOT_DIR = File.expand_path(ENV['chest_location'] || "~/.vocabulary-chest")
+ KNOWN_FILE = "#{ROOT_DIR}/known"
+ UNKNOWN_FILE = "#{ROOT_DIR}/unknown"
+
+ FileUtils::mkdir_p(ROOT_DIR)
+ FileUtils.touch(KNOWN_FILE)
+ FileUtils.touch(UNKNOWN_FILE)
+
+ module VocabularyChest
+   @known_file = File.open(KNOWN_FILE,'a')
+   @unknown_file = File.open(UNKNOWN_FILE,'a')
+   @known_words = nil
+   @unknown_words = nil
+   @stemmer = Lingua::Stemmer.new(:language => "de")
+
+   at_exit {@known_file.close}
+   at_exit {@unknown_file.close}
+
+   def self.known_words
+     @known_words ||= File.open(KNOWN_FILE,'r'){|f|f.readlines}.collect{|line| line.chomp}
+   end
+
+   def self.unknown_words
+     @unknown_words ||= File.open(UNKNOWN_FILE,'r'){|f|f.readlines}.collect{|line| line.chomp}
+   end
+
+   def self.add_to_known_words word
+     @known_file.puts(stem(word))
+     @known_file.flush
+   end
+
+   def self.add_to_unknown_words word
+     @unknown_file.puts(stem(word))
+     @unknown_file.flush
+   end
+
+   def self.contains? word
+     stemmed_word = stem word
+     known_words.include?(stemmed_word) or unknown_words.include?(stemmed_word)
+   end
+
+   def self.is_known? word
+     known_words.include?(stem(word)) or sanitize(word).empty? or (sanitize(word) =~ /^[-\d]*$/) != nil
+   end
+
+   def self.stem word
+     @stemmer.stem(sanitize(word)).downcase
+   end
+
+   def self.sanitize word
+     word.gsub(/[,\"\.:;()?!„“]/,"")
+   end
+ end
+
+ if __FILE__ == $0
+   known = VocabularyChest::known_words
+   unknown = VocabularyChest::unknown_words
+   puts "The chest contains #{known.size} known words."
+   puts "The chest contains #{unknown.size} unknown words."
+ end
metadata ADDED
@@ -0,0 +1,93 @@
+ --- !ruby/object:Gem::Specification
+ name: text-analysis-utils
+ version: !ruby/object:Gem::Version
+   hash: 9
+   prerelease:
+   segments:
+   - 0
+   - 1
+   version: "0.1"
+ platform: ruby
+ authors:
+ - Matt
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2012-06-21 00:00:00 Z
+ dependencies: []
+
+ description:
+ email:
+ executables:
+ - cache-document
+ - classify-new-words
+ - find-examples-for
+ - frequency-list
+ - lookup
+ - percentage-known-of
+ - play-with-blanks
+ - play-with-examples
+ - prepare-text
+ - proximity-of-words
+ - readability-of
+ - vocabulary-coverage
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - lib/text-analysis-utils.rb
+ - lib/document-cache.rb
+ - lib/vocabulary-chest.rb
+ - lib/game.rb
+ - lib/lookup.rb
+ - bin/cache-document
+ - bin/classify-new-words
+ - bin/find-examples-for
+ - bin/frequency-list
+ - bin/lookup
+ - bin/percentage-known-of
+ - bin/play-with-blanks
+ - bin/play-with-examples
+ - bin/prepare-text
+ - bin/proximity-of-words
+ - bin/readability-of
+ - bin/vocabulary-coverage
+ homepage: http://github.com/matstc/text-analysis-utils
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       hash: 57
+       segments:
+       - 1
+       - 8
+       - 7
+       version: 1.8.7
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       hash: 3
+       segments:
+       - 0
+       version: "0"
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.8.15
+ signing_key:
+ specification_version: 3
+ summary: Utilities to help language learners
+ test_files: []
+
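For orientation, here is a minimal sketch of how the pieces above fit together when the library is required directly rather than through the executables. The file name and the word are hypothetical placeholders, and the runtime libraries the code requires (uuid, lingua/stemmer, colorize, amatch) must be installed separately, since the gemspec declares no dependencies:

  #!/usr/bin/env ruby
  # Hypothetical session; 'article.txt' and 'Haus' are placeholder inputs.
  require 'text-analysis-utils'

  DocumentCache.add(File.read('article.txt'))            # cache a document under ~/.vocabulary-chest/docs
  puts DocumentCache.find_examples_for('Haus', 3).keys   # up to three example sentences containing the word
  puts VocabularyChest.is_known?('Haus')                  # true once the word's stem has been classified as known

The command-line tools listed under executables wrap these same calls: cache-document feeds DocumentCache.add, classify-new-words writes to the vocabulary chest, and find-examples-for prints the sentences returned by DocumentCache.find_examples_for.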