text-analysis-utils 0.5.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d7f8baa8dddefcdf0e0cb1897a714eef20cecdc9
4
- data.tar.gz: 76e10901cb1e270ca4f1f95d5cee3425b06a1656
3
+ metadata.gz: 8aa34da03ff05968e5a80084e9202357dccef15d
4
+ data.tar.gz: a84652578851e4a52b8e7ba3cd77ad948f9e6681
5
5
  SHA512:
6
- metadata.gz: 616397557e53efe884f01f59900221fb93a97dba1ff0ab0769882d1b7bc5b3d0f6482896d9045ce2986a53321e3b832726c2703ebd107ce39d6d93fa1cfa85b2
7
- data.tar.gz: 739aa1e163b08bd5869a1b29887daac6d7334fdcdcd122062b3a7d57c8fb10679fcf9ec208e9be1075834f331e40637bee81121fc7efc2c93f5f2c911e602f1d
6
+ metadata.gz: f37a401ef683f0ece40434a943cafecc6502a7ddef853adaea0f4fe3af90beb643d04c1f3c926301c97757b95d20b546a0c9c7e371748563ca1d0b4033de393a
7
+ data.tar.gz: 881ccba2a66db59b3c8d21595398dcd9dadb2d4e249268aaf202c040578d5d8b0be36d15ce7768cfa7e3b0d24bf81a5e153728822c5dd9c7a286ef668d394456
data/bin/cache-document CHANGED
@@ -1,7 +1,10 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
+ require_relative '../lib/cli'
3
4
  require_relative '../lib/document-cache'
4
5
 
6
+ TAU::CLI.intercept help: "Pass in file names in argument to cache their contents. Or call this script without arguments to paste text. Use Control-D when you are done pasting."
7
+
5
8
  def get_text
6
9
  if ARGV.empty?
7
10
  STDIN.read
@@ -12,4 +15,4 @@ def get_text
12
15
  end
13
16
  end
14
17
 
15
- DocumentCache.add(get_text)
18
+ DocumentCache.new.add(get_text)
@@ -4,7 +4,9 @@ require 'rubygems'
4
4
  require 'colorize'
5
5
  require_relative '../lib/vocabulary-chest'
6
6
  require_relative '../lib/document-cache'
7
+ require_relative '../lib/cli'
7
8
 
9
+ TAU::CLI.intercept help: "Pass in file names in argument to classify their content. Or call this script without arguments to paste words to classify. Use Control-D when you are done pasting.\n\nYou can use the switch \"-n\" to skip any word that is already in your vocabulary chest."
8
10
 
9
11
  def get_text
10
12
  if !ARGV.empty?
@@ -18,6 +20,9 @@ def get_text
18
20
  end
19
21
  end
20
22
 
23
+ @cache = DocumentCache.new
24
+ @chest = VocabularyChest.new
25
+
21
26
  def ask word, index, words, text
22
27
  location = (text =~ /\b#{Regexp.escape(word)}\b/)
23
28
  location = text.index(word) if location.nil?
@@ -41,17 +46,17 @@ def ask word, index, words, text
41
46
  end
42
47
 
43
48
  def collect_words_from textual_words
44
- textual_words.reject!{|w|VocabularyChest::is_known?(w)}
45
- textual_words.reject!{|w|VocabularyChest::contains?(w)} if @@options.include? "-n"
49
+ textual_words.reject!{|w|@chest.is_known?(w)}
50
+ textual_words.reject!{|w|@chest.contains?(w)} if @options.include? "-n"
46
51
 
47
- words_by_stem = textual_words.inject({}){|hash, w| hash[VocabularyChest::stem w] = w; hash}
52
+ words_by_stem = textual_words.inject({}){|hash, w| hash[@chest.stem w] = w; hash}
48
53
  words_by_stem.reject!{|s,w| (s =~ /^[-\d\s,\.]*$/) == 0}
49
54
 
50
55
  words_by_stem.values.uniq
51
56
  end
52
57
 
53
- @@options = ARGV.select{|arg| ["-n"].include? arg}
54
- ARGV.reject!{|arg| @@options.include? arg}
58
+ @options = ARGV.select{|arg| ["-n"].include? arg}
59
+ ARGV.reject!{|arg| @options.include? arg}
55
60
 
56
61
  text = get_text
57
62
  textual_words = text.split(" ").collect{|w| w.chomp}
@@ -59,10 +64,10 @@ puts "Thanks. Please wait..."
59
64
 
60
65
  words = collect_words_from textual_words
61
66
  words.each_with_index {|word, index|
62
- match = DocumentCache::extract_matching_words(word, text).first
67
+ match = @cache.extract_matching_words(word, text).first
63
68
  answer = ask match, index, words, text
64
- VocabularyChest::add_to_known_words(word) if answer == 'y'
65
- VocabularyChest::add_to_unknown_words(word) if answer == 'n'
69
+ @chest.add_to_known_words(word) if answer == 'y'
70
+ @chest.add_to_unknown_words(word) if answer == 'n'
66
71
  }
67
72
 
68
73
  puts "Done."
@@ -3,16 +3,19 @@
3
3
  require 'rubygems'
4
4
  require 'colorize'
5
5
  require_relative '../lib/document-cache'
6
+ require_relative '../lib/cli'
7
+
8
+ TAU::CLI.intercept help: "Pass in a word in argument to retrieve example sentences using that word from your document cache.\n\nYou can use the switch \"-N\" to retrieve N examples."
6
9
 
7
10
  count = 1
8
- count_param = ARGV.find{|a| (a =~ /--\d*/) == 0}
11
+ count_param = ARGV.find{|a| (a =~ /^-\d*$/) == 0}
9
12
  if !count_param.nil?
10
- count = count_param.sub("--","").to_i
13
+ count = count_param.sub("-","").to_i
11
14
  ARGV.reject!{|a| a == count_param}
12
15
  end
13
16
 
14
17
  search = ARGV.join(" ")
15
- matches = DocumentCache.find_examples_for search, count
18
+ matches = DocumentCache.new.find_examples_for search, count
16
19
  exit(1) if matches.empty?
17
20
 
18
21
  puts matches.map{|sentence, tokens|
data/bin/frequency-list CHANGED
@@ -1,14 +1,18 @@
1
1
  #!/usr/bin/env ruby
2
- # Call with a file to list words by the frequency of their stems
3
- # Call with no arguments to list the frequencies of the words in the vocabulary chest.
4
2
 
5
3
  require_relative '../lib/document-cache'
6
4
  require_relative '../lib/vocabulary-chest'
5
+ require_relative '../lib/cli'
6
+
7
+ TAU::CLI.intercept help: "Pass in a file to list the frequency of each word inside. Or call without argument to list the frequency for each word in your vocabulary chest based on the documents in your cache.\n\nYou can use the switch \"--unknown\" to only include unknown words (useful to figure out what to learn next)."
8
+
9
+ @cache = DocumentCache.new
10
+ @chest = VocabularyChest.new
7
11
 
8
12
  def frequencies text=nil
9
- text = DocumentCache::documents.inject(""){|text, f| text+= File.open(f){|f|f.read}; text} if text.nil?
13
+ text = @cache.documents.inject(""){|text, f| text+= File.open(f){|f|f.read}; text} if text.nil?
10
14
 
11
- frequencies = text.split(" ").inject(Hash.new {|hash,key| hash[key] = []}){|hash, w| hash[VocabularyChest::stem(w)] << w; hash }
15
+ frequencies = text.split(" ").inject(Hash.new {|hash,key| hash[key] = []}){|hash, w| hash[@chest.stem(w)] << w; hash }
12
16
 
13
17
  frequencies = frequencies.sort{|a,b| a[1].size <=> b[1].size}.reverse
14
18
  end
@@ -19,7 +23,7 @@ def output frequencies
19
23
  end
20
24
 
21
25
  if ARGV[0] == "--unknown"
22
- output frequencies.find_all{|k,v| !VocabularyChest::is_known?(v[0])}
26
+ output frequencies.find_all{|k,v| !@chest.is_known?(v[0])}
23
27
  elsif ARGV.empty?
24
28
  output frequencies
25
29
  else
@@ -1,12 +1,18 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require_relative '../lib/vocabulary-chest'
4
+ require 'colorize'
5
+ require_relative '../lib/cli'
6
+
7
+ TAU::CLI.intercept help: "Pass in a file to calculate how much vocabulary you know from its content. Or do not pass in any argument and paste a bit of text. Use Control-D when you are done pasting."
8
+
9
+ @chest = VocabularyChest.new
4
10
 
5
11
  def analyse text
6
12
  words = text.split(" ")
7
- known = words.select{|w| VocabularyChest.is_known? w}
13
+ known = words.select{|w| @chest.is_known? w}
8
14
  unknown = (words - known)
9
- return [known.map{|w| VocabularyChest::sanitize w}.uniq, unknown.map{|w| VocabularyChest::sanitize w}.uniq]
15
+ return [known.map{|w| @chest.sanitize w}.uniq, unknown.map{|w| @chest.sanitize w}.uniq]
10
16
  end
11
17
 
12
18
  def output options
@@ -14,8 +20,8 @@ def output options
14
20
  size = known.size + unknown.size
15
21
 
16
22
  puts
17
- puts "--"
18
- puts "UNKNOWN WORDS: #{unknown.join(", ")}"
23
+ puts "".blue
24
+ puts "#{'Unknown words'.red}: #{unknown.join(", ")}"
19
25
  puts
20
26
  puts "Total number of unknown words: #{unknown.size}"
21
27
  puts "Total number of known words: #{known.size}"
@@ -3,6 +3,12 @@
3
3
  require_relative '../lib/document-cache'
4
4
  require_relative '../lib/vocabulary-chest'
5
5
  require_relative '../lib/game'
6
+ require_relative '../lib/cli'
7
+
8
+ TAU::CLI.intercept help: "This script is used to play fill-in-the-blanks using example sentences from your document cache.\n\nPass in a file with a word on each line. Or just call without arguments to paste text, one word per line. Use Control-D when you are done pasting."
9
+
10
+
11
+ @cache = DocumentCache.new
6
12
 
7
13
  def get_input
8
14
  if !ARGV.empty?
@@ -14,14 +20,14 @@ end
14
20
 
15
21
  input = get_input
16
22
  words = input.split("\n")
17
- words.reject!{|w| STDOUT.write("."); STDOUT.flush; DocumentCache.find_examples_for(w).empty?}
23
+ words.reject!{|w| STDOUT.write("."); STDOUT.flush; @cache.find_examples_for(w).empty?}
18
24
  puts
19
25
 
20
26
  puts "Playing with #{words.size} words."
21
27
 
22
28
  Game.new(words).play{ |word|
23
- matches = DocumentCache.find_examples_for(word, 10).keys
29
+ matches = @cache.find_examples_for(word, 10).keys
24
30
  sentence = matches.sort{|a, b| a.size <=> b.size}.first
25
- correct_answer = DocumentCache::extract_matching_words(word, sentence).first
31
+ correct_answer = @cache.extract_matching_words(word, sentence).first
26
32
  [sentence, correct_answer]
27
33
  }
data/bin/readability-of CHANGED
@@ -1,31 +1,22 @@
1
1
  #!/usr/bin/env ruby
2
+ require_relative '../lib/algorithms'
3
+ require_relative '../lib/cli'
4
+
5
+ TAU::CLI.intercept help: "Pass in a file to calculate how readable is the content. Or just call without argument and paste a bit of text. Use Control-D when you are done pasting."
2
6
 
3
7
  def analyse text
4
8
  words = text.split(" ").size.to_f
5
9
  sentences = text.split(/\.|\?|!/).reject{|s| s.strip.empty?}.size.to_f
6
- syllables = text.split(" ").inject([]){|sum, w| sum + vowels(w)}
7
- syllables = syllables.size.to_f * 0.9 # for silent vowels
8
- words_with_more_than_three_syllables = text.split(" ").select{|w| vowels(w).size >= 3}
10
+ syllable_count = text.split(" ").inject(0){|sum, w| sum + Algorithms::syllable_count(w)}
11
+ words_with_more_than_three_syllables = text.split(" ").select{|w| Algorithms::syllable_count(w) >= 3}
9
12
  ms = words_with_more_than_three_syllables.size.to_f / text.split(" ").size.to_f * 100
10
13
 
11
14
  stats = {:words => words,
12
15
  :sentences => sentences,
13
- :syllables => syllables,
16
+ :syllables => syllable_count,
14
17
  :ms => ms,
15
- :wiener_sachtextformel => wiener_sachtextformel(sentences, words, ms),
16
- :grade_level => grade(sentences, words, syllables)}
17
- end
18
-
19
- def wiener_sachtextformel sentences, words, ms
20
- 0.2656 * (words / sentences) + 0.2744 * ms -1.693
21
- end
22
-
23
- def grade sentences, words, syllables
24
- (0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59)
25
- end
26
-
27
- def vowels w
28
- w.split(/b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|z/i).reject{|s|s.empty?}
18
+ :wiener_sachtextformel => Algorithms::wiener_sachtextformel(sentences, words, ms),
19
+ :grade_level => Algorithms::grade(sentences, words, syllable_count)}
29
20
  end
30
21
 
31
22
  def output options
@@ -35,11 +26,11 @@ def output options
35
26
  end
36
27
  puts "Number of sentences: #{options[:sentences]}"
37
28
  puts "Number of words: #{options[:words]}"
38
- puts "Number of syllabes: #{options[:syllables]}"
29
+ puts "Number of syllables: #{options[:syllables]}"
39
30
  puts "Average number of syllables per word: #{ '%.2f' % (options[:syllables] / options[:words])}"
40
31
  puts "Average number of words per sentence: #{'%.2f' % (options[:words] / options[:sentences])}"
41
32
  puts "Wiener Sachtextformel: #{'%.2f' % options[:wiener_sachtextformel]}"
42
- puts "Flesch-Kincaid Grade Level: #{'%.2f' % options[:grade_level]}"
33
+ puts "FleschKincaid Grade Level: #{'%.2f' % options[:grade_level]}"
43
34
  puts
44
35
  end
45
36
 
@@ -0,0 +1,12 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/tau_config'
4
+ require_relative '../lib/cli'
5
+
6
+ help = "Pass in one argument: the language you are currently learning (fr|de|es|en|...)"
7
+
8
+ TAU::CLI.intercept help: help
9
+
10
+ (puts help; exit 1) if ARGV.empty?
11
+
12
+ TAUConfig.language = ARGV[0]
data/bin/vocabulary-size CHANGED
@@ -1,6 +1,10 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require_relative '../lib/vocabulary-chest'
4
+ require_relative '../lib/cli'
5
+
6
+ TAU::CLI.intercept
7
+
8
+ size = VocabularyChest.new.known_words.size
4
9
 
5
- size = VocabularyChest.known_words.size
6
10
  puts "You know #{size} words."
data/lib/algorithms.rb ADDED
@@ -0,0 +1,18 @@
1
+ module Algorithms
2
+ def self.syllable_count word # see https://stackoverflow.com/questions/1271918/ruby-count-syllables
3
+ word.downcase!
4
+ return 1 if word.length <= 3
5
+ word.sub!(/(?:[^laeiouy]es|ed|[^laeiouy]e)$/, '')
6
+ word.sub!(/^y/, '')
7
+ word.scan(/[aeiouy]{1,2}/).size
8
+ end
9
+
10
+ def self.wiener_sachtextformel sentences, words, ms
11
+ (0.2656 * (words / sentences) + 0.2744 * ms -1.693).round(2)
12
+ end
13
+
14
+ def self.grade sentences, words, syllables
15
+ (0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59).round(2)
16
+ end
17
+ end
18
+
data/lib/cli.rb ADDED
@@ -0,0 +1,22 @@
1
+ require_relative './version'
2
+
3
+ module TAU
4
+ class CLI
5
+ def self.intercept options={}
6
+ if ARGV.include? '--help'
7
+ if options.has_key? :help
8
+ puts options[:help]
9
+ exit 0
10
+ end
11
+
12
+ puts "This script is part of a set of utilities for text analysis. For more information, see https://github.com/matstc/text-analysis-utils"
13
+ exit 0
14
+ end
15
+
16
+ if ARGV.include? '--version' or ARGV.include? '-v'
17
+ puts "text-analysis-utils v#{TAU::VERSION}"
18
+ exit 0
19
+ end
20
+ end
21
+ end
22
+ end
@@ -4,30 +4,34 @@ require 'uuid'
4
4
  require_relative 'tau_config'
5
5
  require_relative 'vocabulary-chest'
6
6
 
7
- module DocumentCache
8
- def self.add document
7
+ class DocumentCache
8
+ def initialize
9
+ @chest = VocabularyChest.new
10
+ end
11
+
12
+ def add document
9
13
  filename = "#{TAUConfig::cache_dir}/#{UUID.new.generate}"
10
14
  File.open(filename,'w'){|f| f.write(document)}
11
15
  end
12
16
 
13
- def self.find_matches_by_stemming search, sentences
14
- token = VocabularyChest::stem(search)
17
+ def find_matches_by_stemming search, sentences
18
+ token = @chest.stem(search)
15
19
  sentences.inject({}){|hash, s|
16
20
  words = s.split(" ")
17
- found = words.select{|w| VocabularyChest::stem(w) == token}
18
- hash[clean(s)] = found.map{|f| VocabularyChest::sanitize(f)} if !found.empty?
21
+ found = words.select{|w| @chest.stem(w) == token}
22
+ hash[clean(s)] = found.map{|f| @chest.sanitize(f)} if !found.empty?
19
23
  hash
20
24
  }
21
25
  end
22
26
 
23
- def self.find_matches_by_grepping search, sentences
27
+ def find_matches_by_grepping search, sentences
24
28
  sentences.inject({}){|hash, s|
25
29
  hash[clean(s)] = [search] if s.include? search
26
30
  hash
27
31
  }
28
32
  end
29
33
 
30
- def self.find_matches_in filenames, search, count
34
+ def find_matches_in filenames, search, count
31
35
  matches = {}
32
36
 
33
37
  [:find_matches_by_stemming, :find_matches_by_grepping].each{|matcher|
@@ -46,25 +50,29 @@ module DocumentCache
46
50
  matches
47
51
  end
48
52
 
49
- def self.documents
53
+ def documents
50
54
  Dir["#{TAUConfig::cache_dir}/*"]
51
55
  end
52
56
 
53
- def self.find_examples_for search, count=1
57
+ def clear
58
+ documents.each {|doc| FileUtils.rm_rf doc}
59
+ end
60
+
61
+ def find_examples_for search, count=1
54
62
  find_matches_in documents, search, count
55
63
  end
56
64
 
57
- def self.clean(sentence)
65
+ def clean(sentence)
58
66
  sentence.strip + "."
59
67
  end
60
68
 
61
- def self.extract_matching_words search, sentence
69
+ def extract_matching_words search, sentence
62
70
  matches = find_matches_by_stemming(search, [sentence])
63
71
  return matches.values.first if !matches.empty?
64
72
  return find_matches_by_grepping(search, [sentence]).values.first
65
73
  end
66
74
 
67
- def self.frequency_list
75
+ def frequency_list
68
76
  text = ""
69
77
  documents.each{|f| text += File.open(f).read }
70
78
  counts = text.split(" ").inject(Hash.new(0)) {|h,w| h[w] += 1; h }
@@ -72,10 +80,10 @@ module DocumentCache
72
80
  counts.sort_by {|k,v| v}.reverse
73
81
  end
74
82
 
75
- def self.stemmed_frequency_list
83
+ def stemmed_frequency_list
76
84
  text = ""
77
85
  documents.each{|f| text += File.open(f).read }
78
- stems = text.split(" ").map{|w| VocabularyChest::stem(w)}
86
+ stems = text.split(" ").map{|w| @chest.stem(w)}
79
87
  counts = stems.inject(Hash.new(0)) {|h,stem| h[stem] += 1; h }
80
88
  counts.reject!{|stem, count| count < 2}
81
89
  counts.sort_by {|k,v| v}.reverse
data/lib/game.rb CHANGED
@@ -105,7 +105,7 @@ class Game
105
105
  end
106
106
 
107
107
  def play &block
108
- (puts "Could not find any words to play with."; exit 1) if @words.empty?
108
+ (puts "Could not find any words to play with."; return) if @words.empty?
109
109
 
110
110
  @words.shuffle.each{|word|
111
111
  @turn += 1
data/lib/tau_config.rb CHANGED
@@ -1,22 +1,42 @@
1
1
  require 'fileutils.rb'
2
2
 
3
3
  module TAUConfig
4
+ def self.language
5
+ language = File.open(language_file,'r') { |f| f.read }
6
+ return language unless language.empty?
7
+
8
+ ENV['vocabulary_chest_language'] || "en"
9
+ end
10
+
11
+ def self.language= language
12
+ File.open(language_file,'w') do |f|
13
+ f.write language
14
+ end
15
+ end
16
+
4
17
  def self.root_dir
5
18
  File.expand_path(ENV['vocabulary_chest_location'] || "~/.vocabulary-chest")
6
19
  end
20
+
7
21
  def self.known_file
8
22
  "#{root_dir}/known"
9
23
  end
24
+
10
25
  def self.unknown_file
11
26
  "#{root_dir}/unknown"
12
27
  end
28
+
13
29
  def self.cache_dir
14
30
  "#{root_dir}/docs"
15
31
  end
16
32
 
33
+ def self.language_file
34
+ "#{root_dir}/language"
35
+ end
17
36
  end
18
37
 
19
38
  FileUtils::mkdir_p TAUConfig.root_dir
39
+ FileUtils::touch TAUConfig.language_file
20
40
  FileUtils::touch TAUConfig.known_file
21
41
  FileUtils::touch TAUConfig.unknown_file
22
42
  FileUtils::mkdir_p TAUConfig.root_dir
data/lib/version.rb ADDED
@@ -0,0 +1,3 @@
1
+ module TAU
2
+ VERSION = '0.7.0'
3
+ end
@@ -5,43 +5,45 @@ require 'lingua/stemmer'
5
5
 
6
6
  require_relative 'tau_config'
7
7
 
8
- module VocabularyChest
9
- @known_file = File.open(TAUConfig.known_file,'a')
10
- @unknown_file = File.open(TAUConfig.unknown_file,'a')
11
- @stemmer= Lingua::Stemmer.new(:language => ENV['vocabulary_chest_language'] || "en")
12
-
13
- def self.known_words
8
+ class VocabularyChest
9
+ def initialize
10
+ @known_file = File.open(TAUConfig.known_file,'a')
11
+ @unknown_file = File.open(TAUConfig.unknown_file,'a')
12
+ @stemmer= Lingua::Stemmer.new(:language => TAUConfig.language)
13
+ end
14
+
15
+ def known_words
14
16
  File.open(@known_file,'r'){|f|f.readlines}.collect{|line| line.chomp}
15
17
  end
16
18
 
17
- def self.unknown_words
19
+ def unknown_words
18
20
  File.open(@unknown_file,'r'){|f|f.readlines}.collect{|line| line.chomp}
19
21
  end
20
22
 
21
- def self.add_to_known_words word
23
+ def add_to_known_words word
22
24
  @known_file.puts(stem word)
23
25
  @known_file.flush
24
26
  end
25
27
 
26
- def self.add_to_unknown_words word
28
+ def add_to_unknown_words word
27
29
  @unknown_file.puts(stem word)
28
30
  @unknown_file.flush
29
31
  end
30
32
 
31
- def self.contains? word
33
+ def contains? word
32
34
  stemmed_word = stem word
33
35
  known_words.include?(stemmed_word) or unknown_words.include?(stemmed_word)
34
36
  end
35
37
 
36
- def self.is_known? word
38
+ def is_known? word
37
39
  known_words.include?(stem(word)) or sanitize(word).empty? or (sanitize(word) =~ /^[-\d]*$/) != nil
38
40
  end
39
41
 
40
- def self.stem word
42
+ def stem word
41
43
  @stemmer.stem(sanitize word).downcase
42
44
  end
43
45
 
44
- def self.sanitize word
46
+ def sanitize word
45
47
  word.gsub(/[,\"\.:;()?!„“]/,"")
46
48
  end
47
49
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text-analysis-utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - '@matstc'
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-02-26 00:00:00.000000000 Z
11
+ date: 2014-02-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: colorize
@@ -66,10 +66,10 @@ dependencies:
66
66
  - - '>='
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
- description: Utilities to help language learners
69
+ description: Utilities to help language learners acquire vocabulary
70
70
  email:
71
71
  executables:
72
- - proximity-of-words
72
+ - set-text-language
73
73
  - percentage-known-of
74
74
  - readability-of
75
75
  - vocabulary-size
@@ -83,10 +83,13 @@ extra_rdoc_files: []
83
83
  files:
84
84
  - lib/game.rb
85
85
  - lib/document-cache.rb
86
+ - lib/cli.rb
86
87
  - lib/tau_config.rb
88
+ - lib/algorithms.rb
87
89
  - lib/text-analysis-utils.rb
90
+ - lib/version.rb
88
91
  - lib/vocabulary-chest.rb
89
- - bin/proximity-of-words
92
+ - bin/set-text-language
90
93
  - bin/percentage-known-of
91
94
  - bin/readability-of
92
95
  - bin/vocabulary-size
@@ -118,6 +121,6 @@ rubyforge_project:
118
121
  rubygems_version: 2.1.11
119
122
  signing_key:
120
123
  specification_version: 4
121
- summary: Utilities to help language learners
124
+ summary: Utilities to help language learners acquire vocabulary
122
125
  test_files: []
123
126
  has_rdoc:
@@ -1,43 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require 'amatch'
4
-
5
- def distance w1, w2
6
- Amatch::Levenshtein.new(w1).match(w2)
7
- end
8
-
9
- def analyse text, known_text
10
- words = words_of(text)
11
- known_words = words_of(known_text)
12
-
13
- words.map {|w|
14
- closest_word, proximity = find_closest_word(w, known_words)
15
- puts "#{w}\t#{closest_word}\t#{proximity}"
16
- STDOUT.flush
17
- }
18
- end
19
-
20
- def find_closest_word word, known_words
21
- closest_word = known_words.sort{|w1, w2| distance(w1, word) <=> distance(w2, word)}.first
22
-
23
- [closest_word, distance(closest_word, word)]
24
- end
25
-
26
- def words_of text
27
- words = text.split(" ").uniq
28
- end
29
-
30
- if ARGV.size < 2
31
- puts "usage: #{$0} file_with_new_words file_with_known_words"
32
- exit 1
33
- end
34
-
35
- filename = ARGV.shift
36
- text = File.open(filename,'r'){|file| file.read}
37
-
38
- known_text ||= ""
39
- ARGV.each {|filename|
40
- known_text += File.open(filename,'r'){|file| file.read}
41
- }
42
-
43
- analyse text, known_text