text-analysis-utils 0.5.0 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/cache-document +4 -1
- data/bin/classify-new-words +13 -8
- data/bin/find-examples-for +6 -3
- data/bin/frequency-list +9 -5
- data/bin/percentage-known-of +10 -4
- data/bin/play-with-examples +9 -3
- data/bin/readability-of +11 -20
- data/bin/set-text-language +12 -0
- data/bin/vocabulary-size +5 -1
- data/lib/algorithms.rb +18 -0
- data/lib/cli.rb +22 -0
- data/lib/document-cache.rb +23 -15
- data/lib/game.rb +1 -1
- data/lib/tau_config.rb +20 -0
- data/lib/version.rb +3 -0
- data/lib/vocabulary-chest.rb +15 -13
- metadata +9 -6
- data/bin/proximity-of-words +0 -43
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8aa34da03ff05968e5a80084e9202357dccef15d
|
4
|
+
data.tar.gz: a84652578851e4a52b8e7ba3cd77ad948f9e6681
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f37a401ef683f0ece40434a943cafecc6502a7ddef853adaea0f4fe3af90beb643d04c1f3c926301c97757b95d20b546a0c9c7e371748563ca1d0b4033de393a
|
7
|
+
data.tar.gz: 881ccba2a66db59b3c8d21595398dcd9dadb2d4e249268aaf202c040578d5d8b0be36d15ce7768cfa7e3b0d24bf81a5e153728822c5dd9c7a286ef668d394456
|
data/bin/cache-document
CHANGED
@@ -1,7 +1,10 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
+
require_relative '../lib/cli'
|
3
4
|
require_relative '../lib/document-cache'
|
4
5
|
|
6
|
+
TAU::CLI.intercept help: "Pass in file names in argument to cache their contents. Or call this script without arguments to paste text. Use Control-D when you are done pasting."
|
7
|
+
|
5
8
|
def get_text
|
6
9
|
if ARGV.empty?
|
7
10
|
STDIN.read
|
@@ -12,4 +15,4 @@ def get_text
|
|
12
15
|
end
|
13
16
|
end
|
14
17
|
|
15
|
-
DocumentCache.add(get_text)
|
18
|
+
DocumentCache.new.add(get_text)
|
data/bin/classify-new-words
CHANGED
@@ -4,7 +4,9 @@ require 'rubygems'
|
|
4
4
|
require 'colorize'
|
5
5
|
require_relative '../lib/vocabulary-chest'
|
6
6
|
require_relative '../lib/document-cache'
|
7
|
+
require_relative '../lib/cli'
|
7
8
|
|
9
|
+
TAU::CLI.intercept help: "Pass in file names in argument to classify their content. Or call this script without arguments to paste words to classify. Use Control-D when you are done pasting.\n\nYou can use the switch \"-n\" to skip any word that is already in your vocabulary chest."
|
8
10
|
|
9
11
|
def get_text
|
10
12
|
if !ARGV.empty?
|
@@ -18,6 +20,9 @@ def get_text
|
|
18
20
|
end
|
19
21
|
end
|
20
22
|
|
23
|
+
@cache = DocumentCache.new
|
24
|
+
@chest = VocabularyChest.new
|
25
|
+
|
21
26
|
def ask word, index, words, text
|
22
27
|
location = (text =~ /\b#{Regexp.escape(word)}\b/)
|
23
28
|
location = text.index(word) if location.nil?
|
@@ -41,17 +46,17 @@ def ask word, index, words, text
|
|
41
46
|
end
|
42
47
|
|
43
48
|
def collect_words_from textual_words
|
44
|
-
textual_words.reject!{|w
|
45
|
-
textual_words.reject!{|w
|
49
|
+
textual_words.reject!{|w|@chest.is_known?(w)}
|
50
|
+
textual_words.reject!{|w|@chest.contains?(w)} if @options.include? "-n"
|
46
51
|
|
47
|
-
words_by_stem = textual_words.inject({}){|hash, w| hash[
|
52
|
+
words_by_stem = textual_words.inject({}){|hash, w| hash[@chest.stem w] = w; hash}
|
48
53
|
words_by_stem.reject!{|s,w| (s =~ /^[-\d\s,\.]*$/) == 0}
|
49
54
|
|
50
55
|
words_by_stem.values.uniq
|
51
56
|
end
|
52
57
|
|
53
|
-
|
54
|
-
ARGV.reject!{|arg|
|
58
|
+
@options = ARGV.select{|arg| ["-n"].include? arg}
|
59
|
+
ARGV.reject!{|arg| @options.include? arg}
|
55
60
|
|
56
61
|
text = get_text
|
57
62
|
textual_words = text.split(" ").collect{|w| w.chomp}
|
@@ -59,10 +64,10 @@ puts "Thanks. Please wait..."
|
|
59
64
|
|
60
65
|
words = collect_words_from textual_words
|
61
66
|
words.each_with_index {|word, index|
|
62
|
-
match =
|
67
|
+
match = @cache.extract_matching_words(word, text).first
|
63
68
|
answer = ask match, index, words, text
|
64
|
-
|
65
|
-
|
69
|
+
@chest.add_to_known_words(word) if answer == 'y'
|
70
|
+
@chest.add_to_unknown_words(word) if answer == 'n'
|
66
71
|
}
|
67
72
|
|
68
73
|
puts "Done."
|
data/bin/find-examples-for
CHANGED
@@ -3,16 +3,19 @@
|
|
3
3
|
require 'rubygems'
|
4
4
|
require 'colorize'
|
5
5
|
require_relative '../lib/document-cache'
|
6
|
+
require_relative '../lib/cli'
|
7
|
+
|
8
|
+
TAU::CLI.intercept help: "Pass in a word in argument to retrieve example sentences using that word from your document cache.\n\nYou can use the switch \"-N\" to retrieve N examples."
|
6
9
|
|
7
10
|
count = 1
|
8
|
-
count_param = ARGV.find{|a| (a =~
|
11
|
+
count_param = ARGV.find{|a| (a =~ /^-\d*$/) == 0}
|
9
12
|
if !count_param.nil?
|
10
|
-
count = count_param.sub("
|
13
|
+
count = count_param.sub("-","").to_i
|
11
14
|
ARGV.reject!{|a| a == count_param}
|
12
15
|
end
|
13
16
|
|
14
17
|
search = ARGV.join(" ")
|
15
|
-
matches = DocumentCache.find_examples_for search, count
|
18
|
+
matches = DocumentCache.new.find_examples_for search, count
|
16
19
|
exit(1) if matches.empty?
|
17
20
|
|
18
21
|
puts matches.map{|sentence, tokens|
|
data/bin/frequency-list
CHANGED
@@ -1,14 +1,18 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
# Call with a file to list words by the frequency of their stems
|
3
|
-
# Call with no arguments to list the frequencies of the words in the vocabulary chest.
|
4
2
|
|
5
3
|
require_relative '../lib/document-cache'
|
6
4
|
require_relative '../lib/vocabulary-chest'
|
5
|
+
require_relative '../lib/cli'
|
6
|
+
|
7
|
+
TAU::CLI.intercept help: "Pass in a file to list the frequency of each word inside. Or call without argument to list the frequency for each word in your vocabulary chest based on the documents in your cache.\n\nYou can use the switch \"--unknown\" to only include unknown words (useful to figure out what to learn next)."
|
8
|
+
|
9
|
+
@cache = DocumentCache.new
|
10
|
+
@chest = VocabularyChest.new
|
7
11
|
|
8
12
|
def frequencies text=nil
|
9
|
-
text =
|
13
|
+
text = @cache.documents.inject(""){|text, f| text+= File.open(f){|f|f.read}; text} if text.nil?
|
10
14
|
|
11
|
-
frequencies = text.split(" ").inject(Hash.new {|hash,key| hash[key] = []}){|hash, w| hash[
|
15
|
+
frequencies = text.split(" ").inject(Hash.new {|hash,key| hash[key] = []}){|hash, w| hash[@chest.stem(w)] << w; hash }
|
12
16
|
|
13
17
|
frequencies = frequencies.sort{|a,b| a[1].size <=> b[1].size}.reverse
|
14
18
|
end
|
@@ -19,7 +23,7 @@ def output frequencies
|
|
19
23
|
end
|
20
24
|
|
21
25
|
if ARGV[0] == "--unknown"
|
22
|
-
output frequencies.find_all{|k,v|
|
26
|
+
output frequencies.find_all{|k,v| !@chest.is_known?(v[0])}
|
23
27
|
elsif ARGV.empty?
|
24
28
|
output frequencies
|
25
29
|
else
|
data/bin/percentage-known-of
CHANGED
@@ -1,12 +1,18 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
require_relative '../lib/vocabulary-chest'
|
4
|
+
require 'colorize'
|
5
|
+
require_relative '../lib/cli'
|
6
|
+
|
7
|
+
TAU::CLI.intercept help: "Pass in a file to calculate how much vocabulary you know from its content. Or do not pass in any argument and paste a bit of text. Use Control-D when you are done pasting."
|
8
|
+
|
9
|
+
@chest = VocabularyChest.new
|
4
10
|
|
5
11
|
def analyse text
|
6
12
|
words = text.split(" ")
|
7
|
-
known = words.select{|w|
|
13
|
+
known = words.select{|w| @chest.is_known? w}
|
8
14
|
unknown = (words - known)
|
9
|
-
return [known.map{|w|
|
15
|
+
return [known.map{|w| @chest.sanitize w}.uniq, unknown.map{|w| @chest.sanitize w}.uniq]
|
10
16
|
end
|
11
17
|
|
12
18
|
def output options
|
@@ -14,8 +20,8 @@ def output options
|
|
14
20
|
size = known.size + unknown.size
|
15
21
|
|
16
22
|
puts
|
17
|
-
|
18
|
-
puts "
|
23
|
+
puts "—".blue
|
24
|
+
puts "#{'Unknown words'.red}: #{unknown.join(", ")}"
|
19
25
|
puts
|
20
26
|
puts "Total number of unknown words: #{unknown.size}"
|
21
27
|
puts "Total number of known words: #{known.size}"
|
data/bin/play-with-examples
CHANGED
@@ -3,6 +3,12 @@
|
|
3
3
|
require_relative '../lib/document-cache'
|
4
4
|
require_relative '../lib/vocabulary-chest'
|
5
5
|
require_relative '../lib/game'
|
6
|
+
require_relative '../lib/cli'
|
7
|
+
|
8
|
+
TAU::CLI.intercept help: "This script is used to play fill-in-the-blanks using example sentences from your document cache.\n\nPass in a file with a word on each line. Or just call without arguments to paste text, one word per line. Use Control-D when you are done pasting."
|
9
|
+
|
10
|
+
|
11
|
+
@cache = DocumentCache.new
|
6
12
|
|
7
13
|
def get_input
|
8
14
|
if !ARGV.empty?
|
@@ -14,14 +20,14 @@ end
|
|
14
20
|
|
15
21
|
input = get_input
|
16
22
|
words = input.split("\n")
|
17
|
-
words.reject!{|w| STDOUT.write("."); STDOUT.flush;
|
23
|
+
words.reject!{|w| STDOUT.write("."); STDOUT.flush; @cache.find_examples_for(w).empty?}
|
18
24
|
puts
|
19
25
|
|
20
26
|
puts "Playing with #{words.size} words."
|
21
27
|
|
22
28
|
Game.new(words).play{ |word|
|
23
|
-
matches =
|
29
|
+
matches = @cache.find_examples_for(word, 10).keys
|
24
30
|
sentence = matches.sort{|a, b| a.size <=> b.size}.first
|
25
|
-
correct_answer =
|
31
|
+
correct_answer = @cache.extract_matching_words(word, sentence).first
|
26
32
|
[sentence, correct_answer]
|
27
33
|
}
|
data/bin/readability-of
CHANGED
@@ -1,31 +1,22 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require_relative '../lib/algorithms'
|
3
|
+
require_relative '../lib/cli'
|
4
|
+
|
5
|
+
TAU::CLI.intercept help: "Pass in a file to calculate how readable is the content. Or just call without argument and paste a bit of text. Use Control-D when you are done pasting."
|
2
6
|
|
3
7
|
def analyse text
|
4
8
|
words = text.split(" ").size.to_f
|
5
9
|
sentences = text.split(/\.|\?|!/).reject{|s| s.strip.empty?}.size.to_f
|
6
|
-
|
7
|
-
|
8
|
-
words_with_more_than_three_syllables = text.split(" ").select{|w| vowels(w).size >= 3}
|
10
|
+
syllable_count = text.split(" ").inject(0){|sum, w| sum + Algorithms::syllable_count(w)}
|
11
|
+
words_with_more_than_three_syllables = text.split(" ").select{|w| Algorithms::syllable_count(w) >= 3}
|
9
12
|
ms = words_with_more_than_three_syllables.size.to_f / text.split(" ").size.to_f * 100
|
10
13
|
|
11
14
|
stats = {:words => words,
|
12
15
|
:sentences => sentences,
|
13
|
-
:syllables =>
|
16
|
+
:syllables => syllable_count,
|
14
17
|
:ms => ms,
|
15
|
-
:wiener_sachtextformel => wiener_sachtextformel(sentences, words, ms),
|
16
|
-
:grade_level => grade(sentences, words,
|
17
|
-
end
|
18
|
-
|
19
|
-
def wiener_sachtextformel sentences, words, ms
|
20
|
-
0.2656 * (words / sentences) + 0.2744 * ms -1.693
|
21
|
-
end
|
22
|
-
|
23
|
-
def grade sentences, words, syllables
|
24
|
-
(0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59)
|
25
|
-
end
|
26
|
-
|
27
|
-
def vowels w
|
28
|
-
w.split(/b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|z/i).reject{|s|s.empty?}
|
18
|
+
:wiener_sachtextformel => Algorithms::wiener_sachtextformel(sentences, words, ms),
|
19
|
+
:grade_level => Algorithms::grade(sentences, words, syllable_count)}
|
29
20
|
end
|
30
21
|
|
31
22
|
def output options
|
@@ -35,11 +26,11 @@ def output options
|
|
35
26
|
end
|
36
27
|
puts "Number of sentences: #{options[:sentences]}"
|
37
28
|
puts "Number of words: #{options[:words]}"
|
38
|
-
puts "Number of
|
29
|
+
puts "Number of syllables: #{options[:syllables]}"
|
39
30
|
puts "Average number of syllables per word: #{ '%.2f' % (options[:syllables] / options[:words])}"
|
40
31
|
puts "Average number of words per sentence: #{'%.2f' % (options[:words] / options[:sentences])}"
|
41
32
|
puts "Wiener Sachtextformel: #{'%.2f' % options[:wiener_sachtextformel]}"
|
42
|
-
puts "Flesch
|
33
|
+
puts "Flesch–Kincaid Grade Level: #{'%.2f' % options[:grade_level]}"
|
43
34
|
puts
|
44
35
|
end
|
45
36
|
|
data/bin/set-text-language
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require_relative '../lib/tau_config'
|
4
|
+
require_relative '../lib/cli'
|
5
|
+
|
6
|
+
help = "Pass in one argument: the language you are currently learning (fr|de|es|en|...)"
|
7
|
+
|
8
|
+
TAU::CLI.intercept help: help
|
9
|
+
|
10
|
+
(puts help; exit 1) if ARGV.empty?
|
11
|
+
|
12
|
+
TAUConfig.language = ARGV[0]
|
data/bin/vocabulary-size
CHANGED
data/lib/algorithms.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
module Algorithms
|
2
|
+
def self.syllable_count word # see https://stackoverflow.com/questions/1271918/ruby-count-syllables
|
3
|
+
word.downcase!
|
4
|
+
return 1 if word.length <= 3
|
5
|
+
word.sub!(/(?:[^laeiouy]es|ed|[^laeiouy]e)$/, '')
|
6
|
+
word.sub!(/^y/, '')
|
7
|
+
word.scan(/[aeiouy]{1,2}/).size
|
8
|
+
end
|
9
|
+
|
10
|
+
def self.wiener_sachtextformel sentences, words, ms
|
11
|
+
(0.2656 * (words / sentences) + 0.2744 * ms -1.693).round(2)
|
12
|
+
end
|
13
|
+
|
14
|
+
def self.grade sentences, words, syllables
|
15
|
+
(0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59).round(2)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
data/lib/cli.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require_relative './version'
|
2
|
+
|
3
|
+
module TAU
|
4
|
+
class CLI
|
5
|
+
def self.intercept options={}
|
6
|
+
if ARGV.include? '--help'
|
7
|
+
if options.has_key? :help
|
8
|
+
puts options[:help]
|
9
|
+
exit 0
|
10
|
+
end
|
11
|
+
|
12
|
+
puts "This script is part of a set of utilities for text analysis. For more information, see https://github.com/matstc/text-analysis-utils"
|
13
|
+
exit 0
|
14
|
+
end
|
15
|
+
|
16
|
+
if ARGV.include? '--version' or ARGV.include? '-v'
|
17
|
+
puts "text-analysis-utils v#{TAU::VERSION}"
|
18
|
+
exit 0
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
data/lib/document-cache.rb
CHANGED
@@ -4,30 +4,34 @@ require 'uuid'
|
|
4
4
|
require_relative 'tau_config'
|
5
5
|
require_relative 'vocabulary-chest'
|
6
6
|
|
7
|
-
|
8
|
-
|
7
|
+
class DocumentCache
|
8
|
+
def initialize
|
9
|
+
@chest = VocabularyChest.new
|
10
|
+
end
|
11
|
+
|
12
|
+
def add document
|
9
13
|
filename = "#{TAUConfig::cache_dir}/#{UUID.new.generate}"
|
10
14
|
File.open(filename,'w'){|f| f.write(document)}
|
11
15
|
end
|
12
16
|
|
13
|
-
def
|
14
|
-
token =
|
17
|
+
def find_matches_by_stemming search, sentences
|
18
|
+
token = @chest.stem(search)
|
15
19
|
sentences.inject({}){|hash, s|
|
16
20
|
words = s.split(" ")
|
17
|
-
found = words.select{|w|
|
18
|
-
hash[clean(s)] = found.map{|f|
|
21
|
+
found = words.select{|w| @chest.stem(w) == token}
|
22
|
+
hash[clean(s)] = found.map{|f| @chest.sanitize(f)} if !found.empty?
|
19
23
|
hash
|
20
24
|
}
|
21
25
|
end
|
22
26
|
|
23
|
-
def
|
27
|
+
def find_matches_by_grepping search, sentences
|
24
28
|
sentences.inject({}){|hash, s|
|
25
29
|
hash[clean(s)] = [search] if s.include? search
|
26
30
|
hash
|
27
31
|
}
|
28
32
|
end
|
29
33
|
|
30
|
-
def
|
34
|
+
def find_matches_in filenames, search, count
|
31
35
|
matches = {}
|
32
36
|
|
33
37
|
[:find_matches_by_stemming, :find_matches_by_grepping].each{|matcher|
|
@@ -46,25 +50,29 @@ module DocumentCache
|
|
46
50
|
matches
|
47
51
|
end
|
48
52
|
|
49
|
-
def
|
53
|
+
def documents
|
50
54
|
Dir["#{TAUConfig::cache_dir}/*"]
|
51
55
|
end
|
52
56
|
|
53
|
-
|
57
|
+
def clear
|
58
|
+
documents.each {|doc| FileUtils.rm_rf doc}
|
59
|
+
end
|
60
|
+
|
61
|
+
def find_examples_for search, count=1
|
54
62
|
find_matches_in documents, search, count
|
55
63
|
end
|
56
64
|
|
57
|
-
def
|
65
|
+
def clean(sentence)
|
58
66
|
sentence.strip + "."
|
59
67
|
end
|
60
68
|
|
61
|
-
def
|
69
|
+
def extract_matching_words search, sentence
|
62
70
|
matches = find_matches_by_stemming(search, [sentence])
|
63
71
|
return matches.values.first if !matches.empty?
|
64
72
|
return find_matches_by_grepping(search, [sentence]).values.first
|
65
73
|
end
|
66
74
|
|
67
|
-
def
|
75
|
+
def frequency_list
|
68
76
|
text = ""
|
69
77
|
documents.each{|f| text += File.open(f).read }
|
70
78
|
counts = text.split(" ").inject(Hash.new(0)) {|h,w| h[w] += 1; h }
|
@@ -72,10 +80,10 @@ module DocumentCache
|
|
72
80
|
counts.sort_by {|k,v| v}.reverse
|
73
81
|
end
|
74
82
|
|
75
|
-
def
|
83
|
+
def stemmed_frequency_list
|
76
84
|
text = ""
|
77
85
|
documents.each{|f| text += File.open(f).read }
|
78
|
-
stems = text.split(" ").map{|w|
|
86
|
+
stems = text.split(" ").map{|w| @chest.stem(w)}
|
79
87
|
counts = stems.inject(Hash.new(0)) {|h,stem| h[stem] += 1; h }
|
80
88
|
counts.reject!{|stem, count| count < 2}
|
81
89
|
counts.sort_by {|k,v| v}.reverse
|
data/lib/game.rb
CHANGED
data/lib/tau_config.rb
CHANGED
@@ -1,22 +1,42 @@
|
|
1
1
|
require 'fileutils.rb'
|
2
2
|
|
3
3
|
module TAUConfig
|
4
|
+
def self.language
|
5
|
+
language = File.open(language_file,'r') { |f| f.read }
|
6
|
+
return language unless language.empty?
|
7
|
+
|
8
|
+
ENV['vocabulary_chest_language'] || "en"
|
9
|
+
end
|
10
|
+
|
11
|
+
def self.language= language
|
12
|
+
File.open(language_file,'w') do |f|
|
13
|
+
f.write language
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
4
17
|
def self.root_dir
|
5
18
|
File.expand_path(ENV['vocabulary_chest_location'] || "~/.vocabulary-chest")
|
6
19
|
end
|
20
|
+
|
7
21
|
def self.known_file
|
8
22
|
"#{root_dir}/known"
|
9
23
|
end
|
24
|
+
|
10
25
|
def self.unknown_file
|
11
26
|
"#{root_dir}/unknown"
|
12
27
|
end
|
28
|
+
|
13
29
|
def self.cache_dir
|
14
30
|
"#{root_dir}/docs"
|
15
31
|
end
|
16
32
|
|
33
|
+
def self.language_file
|
34
|
+
"#{root_dir}/language"
|
35
|
+
end
|
17
36
|
end
|
18
37
|
|
19
38
|
FileUtils::mkdir_p TAUConfig.root_dir
|
39
|
+
FileUtils::touch TAUConfig.language_file
|
20
40
|
FileUtils::touch TAUConfig.known_file
|
21
41
|
FileUtils::touch TAUConfig.unknown_file
|
22
42
|
FileUtils::mkdir_p TAUConfig.root_dir
|
data/lib/version.rb
ADDED
data/lib/vocabulary-chest.rb
CHANGED
@@ -5,43 +5,45 @@ require 'lingua/stemmer'
|
|
5
5
|
|
6
6
|
require_relative 'tau_config'
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
8
|
+
class VocabularyChest
|
9
|
+
def initialize
|
10
|
+
@known_file = File.open(TAUConfig.known_file,'a')
|
11
|
+
@unknown_file = File.open(TAUConfig.unknown_file,'a')
|
12
|
+
@stemmer= Lingua::Stemmer.new(:language => TAUConfig.language)
|
13
|
+
end
|
14
|
+
|
15
|
+
def known_words
|
14
16
|
File.open(@known_file,'r'){|f|f.readlines}.collect{|line| line.chomp}
|
15
17
|
end
|
16
18
|
|
17
|
-
def
|
19
|
+
def unknown_words
|
18
20
|
File.open(@unknown_file,'r'){|f|f.readlines}.collect{|line| line.chomp}
|
19
21
|
end
|
20
22
|
|
21
|
-
def
|
23
|
+
def add_to_known_words word
|
22
24
|
@known_file.puts(stem word)
|
23
25
|
@known_file.flush
|
24
26
|
end
|
25
27
|
|
26
|
-
def
|
28
|
+
def add_to_unknown_words word
|
27
29
|
@unknown_file.puts(stem word)
|
28
30
|
@unknown_file.flush
|
29
31
|
end
|
30
32
|
|
31
|
-
def
|
33
|
+
def contains? word
|
32
34
|
stemmed_word = stem word
|
33
35
|
known_words.include?(stemmed_word) or unknown_words.include?(stemmed_word)
|
34
36
|
end
|
35
37
|
|
36
|
-
def
|
38
|
+
def is_known? word
|
37
39
|
known_words.include?(stem(word)) or sanitize(word).empty? or (sanitize(word) =~ /^[-\d]*$/) != nil
|
38
40
|
end
|
39
41
|
|
40
|
-
def
|
42
|
+
def stem word
|
41
43
|
@stemmer.stem(sanitize word).downcase
|
42
44
|
end
|
43
45
|
|
44
|
-
def
|
46
|
+
def sanitize word
|
45
47
|
word.gsub(/[,\"\.:;()?!„“]/,"")
|
46
48
|
end
|
47
49
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text-analysis-utils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- '@matstc'
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-02-
|
11
|
+
date: 2014-02-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: colorize
|
@@ -66,10 +66,10 @@ dependencies:
|
|
66
66
|
- - '>='
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
|
-
description: Utilities to help language learners
|
69
|
+
description: Utilities to help language learners acquire vocabulary
|
70
70
|
email:
|
71
71
|
executables:
|
72
|
-
-
|
72
|
+
- set-text-language
|
73
73
|
- percentage-known-of
|
74
74
|
- readability-of
|
75
75
|
- vocabulary-size
|
@@ -83,10 +83,13 @@ extra_rdoc_files: []
|
|
83
83
|
files:
|
84
84
|
- lib/game.rb
|
85
85
|
- lib/document-cache.rb
|
86
|
+
- lib/cli.rb
|
86
87
|
- lib/tau_config.rb
|
88
|
+
- lib/algorithms.rb
|
87
89
|
- lib/text-analysis-utils.rb
|
90
|
+
- lib/version.rb
|
88
91
|
- lib/vocabulary-chest.rb
|
89
|
-
- bin/
|
92
|
+
- bin/set-text-language
|
90
93
|
- bin/percentage-known-of
|
91
94
|
- bin/readability-of
|
92
95
|
- bin/vocabulary-size
|
@@ -118,6 +121,6 @@ rubyforge_project:
|
|
118
121
|
rubygems_version: 2.1.11
|
119
122
|
signing_key:
|
120
123
|
specification_version: 4
|
121
|
-
summary: Utilities to help language learners
|
124
|
+
summary: Utilities to help language learners acquire vocabulary
|
122
125
|
test_files: []
|
123
126
|
has_rdoc:
|
data/bin/proximity-of-words
DELETED
@@ -1,43 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'amatch'
|
4
|
-
|
5
|
-
def distance w1, w2
|
6
|
-
Amatch::Levenshtein.new(w1).match(w2)
|
7
|
-
end
|
8
|
-
|
9
|
-
def analyse text, known_text
|
10
|
-
words = words_of(text)
|
11
|
-
known_words = words_of(known_text)
|
12
|
-
|
13
|
-
words.map {|w|
|
14
|
-
closest_word, proximity = find_closest_word(w, known_words)
|
15
|
-
puts "#{w}\t#{closest_word}\t#{proximity}"
|
16
|
-
STDOUT.flush
|
17
|
-
}
|
18
|
-
end
|
19
|
-
|
20
|
-
def find_closest_word word, known_words
|
21
|
-
closest_word = known_words.sort{|w1, w2| distance(w1, word) <=> distance(w2, word)}.first
|
22
|
-
|
23
|
-
[closest_word, distance(closest_word, word)]
|
24
|
-
end
|
25
|
-
|
26
|
-
def words_of text
|
27
|
-
words = text.split(" ").uniq
|
28
|
-
end
|
29
|
-
|
30
|
-
if ARGV.size < 2
|
31
|
-
puts "usage: #{$0} file_with_new_words file_with_known_words"
|
32
|
-
exit 1
|
33
|
-
end
|
34
|
-
|
35
|
-
filename = ARGV.shift
|
36
|
-
text = File.open(filename,'r'){|file| file.read}
|
37
|
-
|
38
|
-
known_text ||= ""
|
39
|
-
ARGV.each {|filename|
|
40
|
-
known_text += File.open(filename,'r'){|file| file.read}
|
41
|
-
}
|
42
|
-
|
43
|
-
analyse text, known_text
|