RubyGems - text-analysis-utils - Versions diffs - 0.5.0 → 0.7.0 - Mend

text-analysis-utils 0.5.0 → 0.7.0

Files changed (19)

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d7f8baa8dddefcdf0e0cb1897a714eef20cecdc9
-  data.tar.gz: 76e10901cb1e270ca4f1f95d5cee3425b06a1656
+  metadata.gz: 8aa34da03ff05968e5a80084e9202357dccef15d
+  data.tar.gz: a84652578851e4a52b8e7ba3cd77ad948f9e6681
 SHA512:
-  metadata.gz: 616397557e53efe884f01f59900221fb93a97dba1ff0ab0769882d1b7bc5b3d0f6482896d9045ce2986a53321e3b832726c2703ebd107ce39d6d93fa1cfa85b2
-  data.tar.gz: 739aa1e163b08bd5869a1b29887daac6d7334fdcdcd122062b3a7d57c8fb10679fcf9ec208e9be1075834f331e40637bee81121fc7efc2c93f5f2c911e602f1d
+  metadata.gz: f37a401ef683f0ece40434a943cafecc6502a7ddef853adaea0f4fe3af90beb643d04c1f3c926301c97757b95d20b546a0c9c7e371748563ca1d0b4033de393a
+  data.tar.gz: 881ccba2a66db59b3c8d21595398dcd9dadb2d4e249268aaf202c040578d5d8b0be36d15ce7768cfa7e3b0d24bf81a5e153728822c5dd9c7a286ef668d394456

data/bin/cache-document CHANGED Viewed

@@ -1,7 +1,10 @@
 #!/usr/bin/env ruby
+require_relative '../lib/cli'
 require_relative '../lib/document-cache'
+TAU::CLI.intercept help: "Pass in file names in argument to cache their contents. Or call this script without arguments to paste text. Use Control-D when you are done pasting."
 def get_text
 	if ARGV.empty?
 		STDIN.read
@@ -12,4 +15,4 @@ def get_text
 	end
 end
-DocumentCache.add(get_text)
+DocumentCache.new.add(get_text)

data/bin/classify-new-words CHANGED Viewed

@@ -4,7 +4,9 @@ require 'rubygems'
 require 'colorize'
 require_relative '../lib/vocabulary-chest'
 require_relative '../lib/document-cache'
+require_relative '../lib/cli'
+TAU::CLI.intercept help: "Pass in file names in argument to classify their content. Or call this script without arguments to paste words to classify. Use Control-D when you are done pasting.\n\nYou can use the switch \"-n\" to skip any word that is already in your vocabulary chest."
 def get_text
 	if !ARGV.empty?
@@ -18,6 +20,9 @@ def get_text
 	end
 end
+@cache = DocumentCache.new
+@chest = VocabularyChest.new
 def ask word, index, words, text
   location = (text =~ /\b#{Regexp.escape(word)}\b/)
   location = text.index(word) if location.nil?
@@ -41,17 +46,17 @@ def ask word, index, words, text
 end
 def collect_words_from textual_words
-	textual_words.reject!{|w|VocabularyChest::is_known?(w)}
-	textual_words.reject!{|w|VocabularyChest::contains?(w)} if @@options.include? "-n"
+	textual_words.reject!{|w|@chest.is_known?(w)}
+	textual_words.reject!{|w|@chest.contains?(w)} if @options.include? "-n"
-	words_by_stem = textual_words.inject({}){|hash, w| hash[VocabularyChest::stem w] = w; hash}
+	words_by_stem = textual_words.inject({}){|hash, w| hash[@chest.stem w] = w; hash}
 	words_by_stem.reject!{|s,w| (s =~ /^[-\d\s,\.]*$/) == 0}
 	words_by_stem.values.uniq
 end
-@@options = ARGV.select{|arg| ["-n"].include? arg}
-ARGV.reject!{|arg| @@options.include? arg}
+@options = ARGV.select{|arg| ["-n"].include? arg}
+ARGV.reject!{|arg| @options.include? arg}
 text = get_text
 textual_words = text.split(" ").collect{|w| w.chomp}
@@ -59,10 +64,10 @@ puts "Thanks. Please wait..."
 words = collect_words_from textual_words
 words.each_with_index {|word, index|
-  match = DocumentCache::extract_matching_words(word, text).first
+  match = @cache.extract_matching_words(word, text).first
   answer = ask match, index, words, text
-  VocabularyChest::add_to_known_words(word) if answer == 'y'
-  VocabularyChest::add_to_unknown_words(word) if answer == 'n'
+  @chest.add_to_known_words(word) if answer == 'y'
+  @chest.add_to_unknown_words(word) if answer == 'n'
 }
 puts "Done."

data/bin/find-examples-for CHANGED Viewed

@@ -3,16 +3,19 @@
 require 'rubygems'
 require 'colorize'
 require_relative '../lib/document-cache'
+require_relative '../lib/cli'
+TAU::CLI.intercept help: "Pass in a word in argument to retrieve example sentences using that word from your document cache.\n\nYou can use the switch \"-N\" to retrieve N examples."
 count = 1
-count_param = ARGV.find{|a| (a =~ /--\d*/) == 0}
+count_param = ARGV.find{|a| (a =~ /^-\d*$/) == 0}
 if !count_param.nil?
-  count = count_param.sub("--","").to_i
+  count = count_param.sub("-","").to_i
   ARGV.reject!{|a| a == count_param}
 end
 search = ARGV.join(" ")
-matches = DocumentCache.find_examples_for search, count
+matches = DocumentCache.new.find_examples_for search, count
 exit(1) if matches.empty?
 puts matches.map{|sentence, tokens|

data/bin/frequency-list CHANGED Viewed

@@ -1,14 +1,18 @@
 #!/usr/bin/env ruby
-# Call with a file to list words by the frequency of their stems
-# Call with no arguments to list the frequencies of the words in the vocabulary chest.
 require_relative '../lib/document-cache'
 require_relative '../lib/vocabulary-chest'
+require_relative '../lib/cli'
+TAU::CLI.intercept help: "Pass in a file to list the frequency of each word inside. Or call without argument to list the frequency for each word in your vocabulary chest based on the documents in your cache.\n\nYou can use the switch \"--unknown\" to only include unknown words (useful to figure out what to learn next)."
+@cache = DocumentCache.new
+@chest = VocabularyChest.new
 def frequencies text=nil
-  text = DocumentCache::documents.inject(""){|text, f| text+= File.open(f){|f|f.read}; text} if text.nil?
+  text = @cache.documents.inject(""){|text, f| text+= File.open(f){|f|f.read}; text} if text.nil?
-  frequencies = text.split(" ").inject(Hash.new {|hash,key| hash[key] = []}){|hash, w| hash[VocabularyChest::stem(w)] << w; hash }
+  frequencies = text.split(" ").inject(Hash.new {|hash,key| hash[key] = []}){|hash, w| hash[@chest.stem(w)] << w; hash }
   frequencies = frequencies.sort{|a,b| a[1].size <=> b[1].size}.reverse
 end
@@ -19,7 +23,7 @@ def output frequencies
 end
 if ARGV[0] == "--unknown"
-	output frequencies.find_all{|k,v| !VocabularyChest::is_known?(v[0])}
+	output frequencies.find_all{|k,v| !@chest.is_known?(v[0])}
 elsif ARGV.empty?
 	output frequencies
 else

data/bin/percentage-known-of CHANGED Viewed

@@ -1,12 +1,18 @@
 #!/usr/bin/env ruby
 require_relative '../lib/vocabulary-chest'
+require 'colorize'
+require_relative '../lib/cli'
+TAU::CLI.intercept help: "Pass in a file to calculate how much vocabulary you know from its content. Or do not pass in any argument and paste a bit of text. Use Control-D when you are done pasting."
+@chest = VocabularyChest.new
 def analyse text
 	words = text.split(" ")
-	known = words.select{|w| VocabularyChest.is_known? w}
+	known = words.select{|w| @chest.is_known? w}
 	unknown = (words - known)
-	return [known.map{|w| VocabularyChest::sanitize w}.uniq, unknown.map{|w| VocabularyChest::sanitize w}.uniq]
+	return [known.map{|w| @chest.sanitize w}.uniq, unknown.map{|w| @chest.sanitize w}.uniq]
 end
 def output options
@@ -14,8 +20,8 @@ def output options
 	size = known.size + unknown.size
 	puts
-	puts "--"
-	puts "UNKNOWN WORDS: #{unknown.join(", ")}"
+  puts "—".blue
+	puts "#{'Unknown words'.red}: #{unknown.join(", ")}"
 	puts
 	puts "Total number of unknown words: #{unknown.size}"
 	puts "Total number of known words: #{known.size}"

data/bin/play-with-examples CHANGED Viewed

@@ -3,6 +3,12 @@
 require_relative '../lib/document-cache'
 require_relative '../lib/vocabulary-chest'
 require_relative '../lib/game'
+require_relative '../lib/cli'
+TAU::CLI.intercept help: "This script is used to play fill-in-the-blanks using example sentences from your document cache.\n\nPass in a file with a word on each line. Or just call without arguments to paste text, one word per line. Use Control-D when you are done pasting."
+@cache = DocumentCache.new
 def get_input
   if !ARGV.empty?
@@ -14,14 +20,14 @@ end
 input = get_input
 words = input.split("\n")
-words.reject!{|w| STDOUT.write("."); STDOUT.flush; DocumentCache.find_examples_for(w).empty?}
+words.reject!{|w| STDOUT.write("."); STDOUT.flush; @cache.find_examples_for(w).empty?}
 puts
 puts "Playing with #{words.size} words."
 Game.new(words).play{ |word|
-  matches = DocumentCache.find_examples_for(word, 10).keys
+  matches = @cache.find_examples_for(word, 10).keys
   sentence = matches.sort{|a, b| a.size <=> b.size}.first
-  correct_answer = DocumentCache::extract_matching_words(word, sentence).first
+  correct_answer = @cache.extract_matching_words(word, sentence).first
   [sentence, correct_answer]
 }

data/bin/readability-of CHANGED Viewed

@@ -1,31 +1,22 @@
 #!/usr/bin/env ruby
+require_relative '../lib/algorithms'
+require_relative '../lib/cli'
+TAU::CLI.intercept help: "Pass in a file to calculate how readable is the content. Or just call without argument and paste a bit of text. Use Control-D when you are done pasting."
 def analyse text
 	words = text.split(" ").size.to_f
 	sentences = text.split(/\.|\?|!/).reject{|s| s.strip.empty?}.size.to_f
-	syllables = text.split(" ").inject([]){|sum, w| sum + vowels(w)}
-	syllables = syllables.size.to_f * 0.9 # for silent vowels
-	words_with_more_than_three_syllables = text.split(" ").select{|w| vowels(w).size >= 3}
+	syllable_count = text.split(" ").inject(0){|sum, w| sum + Algorithms::syllable_count(w)}
+	words_with_more_than_three_syllables = text.split(" ").select{|w| Algorithms::syllable_count(w) >= 3}
 	ms = words_with_more_than_three_syllables.size.to_f / text.split(" ").size.to_f * 100
   stats = {:words => words,
 					 :sentences => sentences,
-					 :syllables => syllables,
+					 :syllables => syllable_count,
 					 :ms => ms,
-					 :wiener_sachtextformel => wiener_sachtextformel(sentences, words, ms),
-					 :grade_level => grade(sentences, words, syllables)}
-end
-def wiener_sachtextformel sentences, words, ms
-	0.2656 * (words / sentences) + 0.2744 * ms -1.693
-end
-def grade sentences, words, syllables
-	(0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59)
-end
-def vowels w
-	w.split(/b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|z/i).reject{|s|s.empty?}
+					 :wiener_sachtextformel => Algorithms::wiener_sachtextformel(sentences, words, ms),
+					 :grade_level => Algorithms::grade(sentences, words, syllable_count)}
 end
 def output options
@@ -35,11 +26,11 @@ def output options
 	end
 	puts "Number of sentences: #{options[:sentences]}"
 	puts "Number of words: #{options[:words]}"
-	puts "Number of syllabes: #{options[:syllables]}"
+	puts "Number of syllables: #{options[:syllables]}"
 	puts "Average number of syllables per word: #{ '%.2f' % (options[:syllables] / options[:words])}"
 	puts "Average number of words per sentence: #{'%.2f' % (options[:words] / options[:sentences])}"
 	puts "Wiener Sachtextformel: #{'%.2f' % options[:wiener_sachtextformel]}"
-	puts "Flesch-Kincaid Grade Level: #{'%.2f' % options[:grade_level]}"
+	puts "Flesch–Kincaid Grade Level: #{'%.2f' % options[:grade_level]}"
 	puts
 end

data/bin/set-text-language ADDED Viewed

@@ -0,0 +1,12 @@
+#!/usr/bin/env ruby
+require_relative '../lib/tau_config'
+require_relative '../lib/cli'
+help = "Pass in one argument: the language you are currently learning (fr|de|es|en|...)"
+TAU::CLI.intercept help: help
+(puts help; exit 1) if ARGV.empty?
+TAUConfig.language = ARGV[0]

data/bin/vocabulary-size CHANGED Viewed

@@ -1,6 +1,10 @@
 #!/usr/bin/env ruby
 require_relative '../lib/vocabulary-chest'
+require_relative '../lib/cli'
+TAU::CLI.intercept
+size = VocabularyChest.new.known_words.size
-size = VocabularyChest.known_words.size
 puts "You know #{size} words."

data/lib/algorithms.rb ADDED Viewed

@@ -0,0 +1,18 @@
+module Algorithms
+  def self.syllable_count word # see https://stackoverflow.com/questions/1271918/ruby-count-syllables
+    word.downcase!
+    return 1 if word.length <= 3
+    word.sub!(/(?:[^laeiouy]es|ed|[^laeiouy]e)$/, '')
+    word.sub!(/^y/, '')
+    word.scan(/[aeiouy]{1,2}/).size
+  end
+  def self.wiener_sachtextformel sentences, words, ms
+    (0.2656 * (words / sentences) + 0.2744 * ms -1.693).round(2)
+  end
+  def self.grade sentences, words, syllables
+    (0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59).round(2)
+  end
+end

data/lib/cli.rb ADDED Viewed

@@ -0,0 +1,22 @@
+require_relative './version'
+module TAU
+  class CLI
+    def self.intercept options={}
+      if ARGV.include? '--help'
+        if options.has_key? :help
+          puts options[:help]
+          exit 0
+        end
+        puts "This script is part of a set of utilities for text analysis. For more information, see https://github.com/matstc/text-analysis-utils"
+        exit 0
+      end
+      if ARGV.include? '--version' or ARGV.include? '-v'
+        puts "text-analysis-utils v#{TAU::VERSION}"
+        exit 0
+      end
+    end
+  end
+end

data/lib/document-cache.rb CHANGED Viewed

@@ -4,30 +4,34 @@ require 'uuid'
 require_relative 'tau_config'
 require_relative 'vocabulary-chest'
-module DocumentCache
-	def self.add document
+class DocumentCache
+  def initialize
+    @chest = VocabularyChest.new
+  end
+	def add document
 		filename = "#{TAUConfig::cache_dir}/#{UUID.new.generate}"
 		File.open(filename,'w'){|f| f.write(document)}
 	end
-	def self.find_matches_by_stemming search, sentences
-		token = VocabularyChest::stem(search)
+	def find_matches_by_stemming search, sentences
+		token = @chest.stem(search)
 		sentences.inject({}){|hash, s|
 			words = s.split(" ")
-			found = words.select{|w| VocabularyChest::stem(w) == token}
-			hash[clean(s)] = found.map{|f| VocabularyChest::sanitize(f)} if !found.empty?
+			found = words.select{|w| @chest.stem(w) == token}
+			hash[clean(s)] = found.map{|f| @chest.sanitize(f)} if !found.empty?
 			hash
 		}
 	end
-	def self.find_matches_by_grepping search, sentences
+	def find_matches_by_grepping search, sentences
 		sentences.inject({}){|hash, s|
 			hash[clean(s)] = [search] if s.include? search
 			hash
 		}
 	end
-	def self.find_matches_in filenames, search, count
+	def find_matches_in filenames, search, count
 		matches = {}
 		[:find_matches_by_stemming, :find_matches_by_grepping].each{|matcher|
@@ -46,25 +50,29 @@ module DocumentCache
 		matches
 	end
-	def self.documents
+	def documents
 		Dir["#{TAUConfig::cache_dir}/*"]
 	end
-	def self.find_examples_for search, count=1
+  def clear
+    documents.each {|doc| FileUtils.rm_rf doc}
+  end
+	def find_examples_for search, count=1
 		find_matches_in documents, search, count
 	end
-	def self.clean(sentence)
+	def clean(sentence)
 		sentence.strip + "."
 	end
-	def self.extract_matching_words search, sentence
+	def extract_matching_words search, sentence
 		matches = find_matches_by_stemming(search, [sentence])
 		return matches.values.first if !matches.empty?
 		return find_matches_by_grepping(search, [sentence]).values.first
 	end
-	def self.frequency_list
+	def frequency_list
 		text = ""
 		documents.each{|f| text += File.open(f).read }
 		counts = text.split(" ").inject(Hash.new(0)) {|h,w| h[w] += 1; h }
@@ -72,10 +80,10 @@ module DocumentCache
 		counts.sort_by {|k,v| v}.reverse
 	end
-	def self.stemmed_frequency_list
+	def stemmed_frequency_list
 		text = ""
 		documents.each{|f| text += File.open(f).read }
-		stems = text.split(" ").map{|w| VocabularyChest::stem(w)}
+		stems = text.split(" ").map{|w| @chest.stem(w)}
 		counts = stems.inject(Hash.new(0)) {|h,stem| h[stem] += 1; h }
 		counts.reject!{|stem, count| count < 2}
 		counts.sort_by {|k,v| v}.reverse

data/lib/game.rb CHANGED Viewed

@@ -105,7 +105,7 @@ class Game
 	end
 	def play &block
-    (puts "Could not find any words to play with."; exit 1) if @words.empty?
+    (puts "Could not find any words to play with."; return) if @words.empty?
 		@words.shuffle.each{|word|
 			@turn += 1

data/lib/tau_config.rb CHANGED Viewed

@@ -1,22 +1,42 @@
 require 'fileutils.rb'
 module TAUConfig
+  def self.language
+    language = File.open(language_file,'r') { |f| f.read }
+    return language unless language.empty?
+    ENV['vocabulary_chest_language'] || "en"
+  end
+  def self.language= language
+    File.open(language_file,'w') do |f|
+      f.write language
+    end
+  end
   def self.root_dir
     File.expand_path(ENV['vocabulary_chest_location'] || "~/.vocabulary-chest")
   end
   def self.known_file
     "#{root_dir}/known"
   end
   def self.unknown_file
     "#{root_dir}/unknown"
   end
   def self.cache_dir
     "#{root_dir}/docs"
   end
+  def self.language_file
+    "#{root_dir}/language"
+  end
 end
 FileUtils::mkdir_p TAUConfig.root_dir
+FileUtils::touch TAUConfig.language_file
 FileUtils::touch TAUConfig.known_file
 FileUtils::touch TAUConfig.unknown_file
 FileUtils::mkdir_p TAUConfig.root_dir

data/lib/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module TAU
+  VERSION = '0.7.0'
+end

data/lib/vocabulary-chest.rb CHANGED Viewed

@@ -5,43 +5,45 @@ require 'lingua/stemmer'
 require_relative 'tau_config'
-module VocabularyChest
-	@known_file = File.open(TAUConfig.known_file,'a')
-	@unknown_file = File.open(TAUConfig.unknown_file,'a')
-	@stemmer= Lingua::Stemmer.new(:language => ENV['vocabulary_chest_language'] || "en")
-	def self.known_words
+class VocabularyChest
+  def initialize
+    @known_file = File.open(TAUConfig.known_file,'a')
+    @unknown_file = File.open(TAUConfig.unknown_file,'a')
+    @stemmer= Lingua::Stemmer.new(:language => TAUConfig.language)
+  end
+	def known_words
 		File.open(@known_file,'r'){|f|f.readlines}.collect{|line| line.chomp}
 	end
-	def self.unknown_words
+	def unknown_words
 		File.open(@unknown_file,'r'){|f|f.readlines}.collect{|line| line.chomp}
 	end
-	def self.add_to_known_words word
+	def add_to_known_words word
 		@known_file.puts(stem word)
 		@known_file.flush
 	end
-	def self.add_to_unknown_words word
+	def add_to_unknown_words word
 		@unknown_file.puts(stem word)
 		@unknown_file.flush
 	end
-	def self.contains? word
+	def contains? word
 		stemmed_word = stem word
 		known_words.include?(stemmed_word) or unknown_words.include?(stemmed_word)
 	end
-	def self.is_known? word
+	def is_known? word
 		known_words.include?(stem(word)) or sanitize(word).empty? or (sanitize(word) =~ /^[-\d]*$/) != nil
 	end
-	def self.stem word
+	def stem word
 		@stemmer.stem(sanitize word).downcase
 	end
-	def self.sanitize word
+	def sanitize word
 		word.gsub(/[,\"\.:;()?!„“]/,"")
 	end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: text-analysis-utils
 version: !ruby/object:Gem::Version
-  version: 0.5.0
+  version: 0.7.0
 platform: ruby
 authors:
 - '@matstc'
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-02-26 00:00:00.000000000 Z
+date: 2014-02-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: colorize
@@ -66,10 +66,10 @@ dependencies:
     - - '>='
       - !ruby/object:Gem::Version
         version: '0'
-description: Utilities to help language learners
+description: Utilities to help language learners acquire vocabulary
 email:
 executables:
-- proximity-of-words
+- set-text-language
 - percentage-known-of
 - readability-of
 - vocabulary-size
@@ -83,10 +83,13 @@ extra_rdoc_files: []
 files:
 - lib/game.rb
 - lib/document-cache.rb
+- lib/cli.rb
 - lib/tau_config.rb
+- lib/algorithms.rb
 - lib/text-analysis-utils.rb
+- lib/version.rb
 - lib/vocabulary-chest.rb
-- bin/proximity-of-words
+- bin/set-text-language
 - bin/percentage-known-of
 - bin/readability-of
 - bin/vocabulary-size
@@ -118,6 +121,6 @@ rubyforge_project:
 rubygems_version: 2.1.11
 signing_key:
 specification_version: 4
-summary: Utilities to help language learners
+summary: Utilities to help language learners acquire vocabulary
 test_files: []
 has_rdoc:

data/bin/proximity-of-words DELETED Viewed

@@ -1,43 +0,0 @@
-#!/usr/bin/env ruby
-require 'amatch'
-def distance w1, w2
-	Amatch::Levenshtein.new(w1).match(w2)
-end
-def analyse text, known_text
-	words = words_of(text)
-	known_words = words_of(known_text)
-	words.map {|w|
-		closest_word, proximity = find_closest_word(w, known_words)
-		puts "#{w}\t#{closest_word}\t#{proximity}"
-		STDOUT.flush
-	}
-end
-def find_closest_word word, known_words
-	closest_word = known_words.sort{|w1, w2| distance(w1, word) <=> distance(w2, word)}.first
-	[closest_word, distance(closest_word, word)]
-end
-def words_of text
-	words = text.split(" ").uniq
-end
-if ARGV.size < 2
-  puts "usage: #{$0} file_with_new_words file_with_known_words"
-  exit 1
-end
-filename = ARGV.shift
-text = File.open(filename,'r'){|file| file.read}
-known_text ||= ""
-ARGV.each {|filename|
-  known_text += File.open(filename,'r'){|file| file.read}
-}
-analyse text, known_text