text-analysis-utils 0.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d7f8baa8dddefcdf0e0cb1897a714eef20cecdc9
4
+ data.tar.gz: 76e10901cb1e270ca4f1f95d5cee3425b06a1656
5
+ SHA512:
6
+ metadata.gz: 616397557e53efe884f01f59900221fb93a97dba1ff0ab0769882d1b7bc5b3d0f6482896d9045ce2986a53321e3b832726c2703ebd107ce39d6d93fa1cfa85b2
7
+ data.tar.gz: 739aa1e163b08bd5869a1b29887daac6d7334fdcdcd122062b3a7d57c8fb10679fcf9ec208e9be1075834f331e40637bee81121fc7efc2c93f5f2c911e602f1d
data/bin/cache-document CHANGED
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require File.join(File.dirname(__FILE__), '../lib/document-cache' )
3
+ require_relative '../lib/document-cache'
4
4
 
5
5
  def get_text
6
6
  if ARGV.empty?
@@ -2,8 +2,8 @@
2
2
 
3
3
  require 'rubygems'
4
4
  require 'colorize'
5
- require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
6
- require File.join(File.dirname(__FILE__), '../lib/document-cache' )
5
+ require_relative '../lib/vocabulary-chest'
6
+ require_relative '../lib/document-cache'
7
7
 
8
8
 
9
9
  def get_text
@@ -20,7 +20,6 @@ end
20
20
 
21
21
  def ask word, index, words, text
22
22
  location = (text =~ /\b#{Regexp.escape(word)}\b/)
23
- puts "!!!!" if word == "notwendig"
24
23
  location = text.index(word) if location.nil?
25
24
  (puts "Skipping word: #{word}"; return 'skip') if location.nil?
26
25
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  require 'rubygems'
4
4
  require 'colorize'
5
- require File.join(File.dirname(__FILE__), '../lib/document-cache' )
5
+ require_relative '../lib/document-cache'
6
6
 
7
7
  count = 1
8
8
  count_param = ARGV.find{|a| (a =~ /--\d*/) == 0}
data/bin/frequency-list CHANGED
@@ -1,13 +1,17 @@
1
1
  #!/usr/bin/env ruby
2
+ # Call with a file to list words by the frequency of their stems
3
+ # Call with no arguments to list the frequencies of the words in the vocabulary chest.
2
4
 
3
- require File.join(File.dirname(__FILE__), '../lib/document-cache' )
4
- require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
5
+ require_relative '../lib/document-cache'
6
+ require_relative '../lib/vocabulary-chest'
5
7
 
6
- text = DocumentCache::documents.inject(""){|text, f| text+= File.open(f){|f|f.read}; text}
8
+ def frequencies text=nil
9
+ text = DocumentCache::documents.inject(""){|text, f| text+= File.open(f){|f|f.read}; text} if text.nil?
7
10
 
8
- frequencies = text.split(" ").inject(Hash.new {|hash,key| hash[key] = []}){|hash, w| hash[VocabularyChest::stem(w)] << w; hash }
11
+ frequencies = text.split(" ").inject(Hash.new {|hash,key| hash[key] = []}){|hash, w| hash[VocabularyChest::stem(w)] << w; hash }
9
12
 
10
- frequencies = frequencies.sort{|a,b| a[1].size <=> b[1].size}.reverse
13
+ frequencies = frequencies.sort{|a,b| a[1].size <=> b[1].size}.reverse
14
+ end
11
15
 
12
16
  def output frequencies
13
17
  STDOUT.sync = true
@@ -16,6 +20,8 @@ end
16
20
 
17
21
  if ARGV[0] == "--unknown"
18
22
  output frequencies.find_all{|k,v| !VocabularyChest::is_known?(v[0])}
19
- else
23
+ elsif ARGV.empty?
20
24
  output frequencies
25
+ else
26
+ output(frequencies(File.read(ARGV[0])))
21
27
  end
@@ -1,7 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
4
- require File.join(File.dirname(__FILE__), '../lib/lookup' )
3
+ require_relative '../lib/vocabulary-chest'
5
4
 
6
5
  def analyse text
7
6
  words = text.split(" ")
@@ -18,10 +17,6 @@ def output options
18
17
  puts "--"
19
18
  puts "UNKNOWN WORDS: #{unknown.join(", ")}"
20
19
  puts
21
- puts "DEFINITIONS"
22
- puts Lookup::go(unknown)
23
- puts "--"
24
- puts
25
20
  puts "Total number of unknown words: #{unknown.size}"
26
21
  puts "Total number of known words: #{known.size}"
27
22
  puts "Total number of words: #{size}"
@@ -1,8 +1,8 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require File.join(File.dirname(__FILE__), '../lib/document-cache' )
4
- require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
5
- require File.join(File.dirname(__FILE__), '../lib/game' )
3
+ require_relative '../lib/document-cache'
4
+ require_relative '../lib/vocabulary-chest'
5
+ require_relative '../lib/game'
6
6
 
7
7
  def get_input
8
8
  if !ARGV.empty?
@@ -17,6 +17,8 @@ words = input.split("\n")
17
17
  words.reject!{|w| STDOUT.write("."); STDOUT.flush; DocumentCache.find_examples_for(w).empty?}
18
18
  puts
19
19
 
20
+ puts "Playing with #{words.size} words."
21
+
20
22
  Game.new(words).play{ |word|
21
23
  matches = DocumentCache.find_examples_for(word, 10).keys
22
24
  sentence = matches.sort{|a, b| a.size <=> b.size}.first
@@ -1,6 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'rubygems'
4
3
  require 'amatch'
5
4
 
6
5
  def distance w1, w2
@@ -29,7 +28,7 @@ def words_of text
29
28
  end
30
29
 
31
30
  if ARGV.size < 2
32
- puts "usage: ./script <new text> <known text>"
31
+ puts "usage: #{$0} file_with_new_words file_with_known_words"
33
32
  exit 1
34
33
  end
35
34
 
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/vocabulary-chest'
4
+
5
+ size = VocabularyChest.known_words.size
6
+ puts "You know #{size} words."
@@ -1,18 +1,13 @@
1
1
  require 'fileutils.rb'
2
- require 'rubygems'
3
2
  require 'uuid'
4
3
 
5
- require File.join(File.dirname(__FILE__), 'vocabulary-chest' )
6
-
7
- CACHE_DIR = "#{ROOT_DIR}/docs"
8
-
9
- FileUtils::mkdir_p(ROOT_DIR)
10
- FileUtils::mkdir_p(CACHE_DIR)
4
+ require_relative 'tau_config'
5
+ require_relative 'vocabulary-chest'
11
6
 
12
7
  module DocumentCache
13
- def self.add search
14
- filename = "#{CACHE_DIR}/#{UUID.new.generate}"
15
- File.open(filename,'w'){|f| f.write(search)}
8
+ def self.add document
9
+ filename = "#{TAUConfig::cache_dir}/#{UUID.new.generate}"
10
+ File.open(filename,'w'){|f| f.write(document)}
16
11
  end
17
12
 
18
13
  def self.find_matches_by_stemming search, sentences
@@ -52,7 +47,7 @@ module DocumentCache
52
47
  end
53
48
 
54
49
  def self.documents
55
- Dir["#{CACHE_DIR}/*"]
50
+ Dir["#{TAUConfig::cache_dir}/*"]
56
51
  end
57
52
 
58
53
  def self.find_examples_for search, count=1
@@ -86,13 +81,3 @@ module DocumentCache
86
81
  counts.sort_by {|k,v| v}.reverse
87
82
  end
88
83
  end
89
-
90
- if __FILE__ == $0
91
- puts "The document cache contains #{DocumentCache.documents.size} documents."
92
- puts
93
- puts "Here are the 10 most frequent stems:"
94
- DocumentCache.stemmed_frequency_list[0,10].each{|stem, count| puts "#{count} #{stem}"}
95
- puts
96
- puts "Here are the 10 most frequent words:"
97
- DocumentCache.frequency_list[0,10].each{|word, count| puts "#{count} #{word}"}
98
- end
data/lib/game.rb CHANGED
@@ -1,3 +1,5 @@
1
+ #encoding: UTF-8
2
+
1
3
  require 'rubygems'
2
4
  require 'amatch'
3
5
  require 'colorize'
@@ -103,6 +105,8 @@ class Game
103
105
  end
104
106
 
105
107
  def play &block
108
+ (puts "Could not find any words to play with."; exit 1) if @words.empty?
109
+
106
110
  @words.shuffle.each{|word|
107
111
  @turn += 1
108
112
 
@@ -119,4 +123,3 @@ class Game
119
123
  play(&block)
120
124
  end
121
125
  end
122
-
data/lib/tau_config.rb ADDED
@@ -0,0 +1,23 @@
1
+ require 'fileutils.rb'
2
+
3
+ module TAUConfig
4
+ def self.root_dir
5
+ File.expand_path(ENV['vocabulary_chest_location'] || "~/.vocabulary-chest")
6
+ end
7
+ def self.known_file
8
+ "#{root_dir}/known"
9
+ end
10
+ def self.unknown_file
11
+ "#{root_dir}/unknown"
12
+ end
13
+ def self.cache_dir
14
+ "#{root_dir}/docs"
15
+ end
16
+
17
+ end
18
+
19
+ FileUtils::mkdir_p TAUConfig.root_dir
20
+ FileUtils::touch TAUConfig.known_file
21
+ FileUtils::touch TAUConfig.unknown_file
22
+ FileUtils::mkdir_p TAUConfig.root_dir
23
+ FileUtils::mkdir_p TAUConfig.cache_dir
@@ -1,4 +1,4 @@
1
- require File.join(File.dirname(__FILE__), 'vocabulary-chest' )
2
- require File.join(File.dirname(__FILE__), 'document-cache' )
3
- require File.join(File.dirname(__FILE__), 'game' )
4
-
1
+ require_relative 'config'
2
+ require_relative 'vocabulary-chest'
3
+ require_relative 'document-cache'
4
+ require_relative 'game'
@@ -3,30 +3,19 @@ require 'fileutils.rb'
3
3
  require 'rubygems'
4
4
  require 'lingua/stemmer'
5
5
 
6
- ROOT_DIR = File.expand_path(ENV['chest_location'] || "~/.vocabulary-chest")
7
- KNOWN_FILE = "#{ROOT_DIR}/known"
8
- UNKNOWN_FILE = "#{ROOT_DIR}/unknown"
9
-
10
- FileUtils::mkdir_p(ROOT_DIR)
11
- FileUtils.touch(KNOWN_FILE)
12
- FileUtils.touch(UNKNOWN_FILE)
6
+ require_relative 'tau_config'
13
7
 
14
8
  module VocabularyChest
15
- @known_file = File.open(KNOWN_FILE,'a')
16
- @unknown_file = File.open(UNKNOWN_FILE,'a')
17
- @known_words = nil
18
- @unknown_words = nil
19
- @stemmer= Lingua::Stemmer.new(:language => "de")
20
-
21
- at_exit {@known_file.close}
22
- at_exit {@unknown_file.close}
9
+ @known_file = File.open(TAUConfig.known_file,'a')
10
+ @unknown_file = File.open(TAUConfig.unknown_file,'a')
11
+ @stemmer= Lingua::Stemmer.new(:language => ENV['vocabulary_chest_language'] || "en")
23
12
 
24
13
  def self.known_words
25
- @known_words ||= File.open(KNOWN_FILE,'r'){|f|f.readlines}.collect{|line| line.chomp}
14
+ File.open(@known_file,'r'){|f|f.readlines}.collect{|line| line.chomp}
26
15
  end
27
16
 
28
17
  def self.unknown_words
29
- @unknown_words ||= File.open(UNKNOWN_FILE,'r'){|f|f.readlines}.collect{|line| line.chomp}
18
+ File.open(@unknown_file,'r'){|f|f.readlines}.collect{|line| line.chomp}
30
19
  end
31
20
 
32
21
  def self.add_to_known_words word
@@ -56,9 +45,3 @@ module VocabularyChest
56
45
  word.gsub(/[,\"\.:;()?!„“]/,"")
57
46
  end
58
47
  end
59
-
60
- if __FILE__ == $0
61
- known = VocabularyChest::known_words
62
- unknown = VocabularyChest::unknown_words
63
- puts "The chest contains #{known.size} known words."
64
- end
metadata CHANGED
@@ -1,148 +1,123 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: text-analysis-utils
3
- version: !ruby/object:Gem::Version
4
- hash: 13
5
- prerelease:
6
- segments:
7
- - 0
8
- - 3
9
- version: "0.3"
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.5.0
10
5
  platform: ruby
11
- authors:
12
- - Matt
6
+ authors:
7
+ - '@matstc'
13
8
  autorequire:
14
9
  bindir: bin
15
10
  cert_chain: []
16
-
17
- date: 2012-06-23 00:00:00 Z
18
- dependencies:
19
- - !ruby/object:Gem::Dependency
11
+ date: 2014-02-26 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
20
14
  name: colorize
21
- prerelease: false
22
- requirement: &id001 !ruby/object:Gem::Requirement
23
- none: false
24
- requirements:
25
- - - ">="
26
- - !ruby/object:Gem::Version
27
- hash: 3
28
- segments:
29
- - 0
30
- version: "0"
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
31
20
  type: :runtime
32
- version_requirements: *id001
33
- - !ruby/object:Gem::Dependency
34
- name: amatch
35
21
  prerelease: false
36
- requirement: &id002 !ruby/object:Gem::Requirement
37
- none: false
38
- requirements:
39
- - - ">="
40
- - !ruby/object:Gem::Version
41
- hash: 3
42
- segments:
43
- - 0
44
- version: "0"
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: amatch
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
45
34
  type: :runtime
46
- version_requirements: *id002
47
- - !ruby/object:Gem::Dependency
48
- name: ruby-stemmer
49
35
  prerelease: false
50
- requirement: &id003 !ruby/object:Gem::Requirement
51
- none: false
52
- requirements:
53
- - - ">="
54
- - !ruby/object:Gem::Version
55
- hash: 3
56
- segments:
57
- - 0
58
- version: "0"
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: ruby-stemmer
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
59
48
  type: :runtime
60
- version_requirements: *id003
61
- - !ruby/object:Gem::Dependency
62
- name: uuid
63
49
  prerelease: false
64
- requirement: &id004 !ruby/object:Gem::Requirement
65
- none: false
66
- requirements:
67
- - - ">="
68
- - !ruby/object:Gem::Version
69
- hash: 3
70
- segments:
71
- - 0
72
- version: "0"
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: uuid
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
73
62
  type: :runtime
74
- version_requirements: *id004
75
- description:
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: Utilities to help language learners
76
70
  email:
77
- executables:
78
- - cache-document
79
- - classify-new-words
80
- - find-examples-for
81
- - frequency-list
82
- - lookup
83
- - percentage-known-of
84
- - play-with-blanks
85
- - play-with-examples
86
- - prepare-text
71
+ executables:
87
72
  - proximity-of-words
73
+ - percentage-known-of
88
74
  - readability-of
89
- - vocabulary-coverage
75
+ - vocabulary-size
76
+ - classify-new-words
77
+ - play-with-examples
78
+ - cache-document
79
+ - frequency-list
80
+ - find-examples-for
90
81
  extensions: []
91
-
92
82
  extra_rdoc_files: []
93
-
94
- files:
95
- - lib/text-analysis-utils.rb
83
+ files:
84
+ - lib/game.rb
96
85
  - lib/document-cache.rb
86
+ - lib/tau_config.rb
87
+ - lib/text-analysis-utils.rb
97
88
  - lib/vocabulary-chest.rb
98
- - lib/game.rb
99
- - lib/lookup.rb
100
- - bin/cache-document
101
- - bin/classify-new-words
102
- - bin/find-examples-for
103
- - bin/frequency-list
104
- - bin/lookup
105
- - bin/percentage-known-of
106
- - bin/play-with-blanks
107
- - bin/play-with-examples
108
- - bin/prepare-text
109
89
  - bin/proximity-of-words
90
+ - bin/percentage-known-of
110
91
  - bin/readability-of
111
- - bin/vocabulary-coverage
92
+ - bin/vocabulary-size
93
+ - bin/classify-new-words
94
+ - bin/play-with-examples
95
+ - bin/cache-document
96
+ - bin/frequency-list
97
+ - bin/find-examples-for
112
98
  homepage: http://github.com/matstc/text-analysis-utils
113
- licenses: []
114
-
99
+ licenses:
100
+ - CC-BY-NC-SA 4.0
101
+ metadata: {}
115
102
  post_install_message:
116
103
  rdoc_options: []
117
-
118
- require_paths:
104
+ require_paths:
119
105
  - lib
120
- required_ruby_version: !ruby/object:Gem::Requirement
121
- none: false
122
- requirements:
123
- - - ">="
124
- - !ruby/object:Gem::Version
125
- hash: 57
126
- segments:
127
- - 1
128
- - 8
129
- - 7
130
- version: 1.8.7
131
- required_rubygems_version: !ruby/object:Gem::Requirement
132
- none: false
133
- requirements:
134
- - - ">="
135
- - !ruby/object:Gem::Version
136
- hash: 3
137
- segments:
138
- - 0
139
- version: "0"
106
+ required_ruby_version: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - '>='
109
+ - !ruby/object:Gem::Version
110
+ version: 2.0.0
111
+ required_rubygems_version: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - '>='
114
+ - !ruby/object:Gem::Version
115
+ version: '0'
140
116
  requirements: []
141
-
142
117
  rubyforge_project:
143
- rubygems_version: 1.8.15
118
+ rubygems_version: 2.1.11
144
119
  signing_key:
145
- specification_version: 3
120
+ specification_version: 4
146
121
  summary: Utilities to help language learners
147
122
  test_files: []
148
-
123
+ has_rdoc:
data/bin/lookup DELETED
@@ -1,9 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require File.join(File.dirname(__FILE__), '../lib/lookup' )
4
-
5
- if !ARGV.empty?
6
- puts Lookup::go ARGV
7
- else
8
- puts Lookup::go STDIN.read.split("\n")
9
- end
data/bin/play-with-blanks DELETED
@@ -1,28 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require File.join(File.dirname(__FILE__), '../lib/game' )
4
-
5
- def get_input
6
- if !ARGV.empty?
7
- else
8
- STDIN.read
9
- end
10
- end
11
-
12
- (puts "Usage: #{$0} <file with words to practice> <file with examples>"; exit(1)) if ARGV.size < 2
13
-
14
- input = File.open(ARGV.shift){|f| f.read}
15
- words = input.split("\n").uniq
16
-
17
- example_sentences = []
18
- ARGV.each{|filename| example_sentences += File.open(filename).readlines}
19
- example_sentences.map!{|s| s.chomp}
20
- example_sentences.reject!{|s| words.find{|w| (s =~ /\b#{Regexp.escape(w)}\b/i) != nil} == nil}
21
- words.reject!{|w| example_sentences.find{|s| (s =~ /\b#{Regexp.escape(w)}\b/i) != nil} == nil}
22
-
23
- puts "Playing with #{example_sentences.size} sentences and #{words.size} words."
24
-
25
- Game.new(words).play{|word|
26
- sentence = example_sentences.shuffle.find{|s| (s =~ /\b#{Regexp.escape(word)}\b/i) != nil}
27
- [sentence, $&]
28
- }
data/bin/prepare-text DELETED
@@ -1,9 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- text = ARGV.empty? ? STDIN.read : ARGV.map{|a|File.open(a){|f|f.read}}.join("\n\n")
4
- File.open("/tmp/prepared-text", 'w'){|f| f.write(text)}
5
- exec("classify-new-words /tmp/prepared-text && \
6
- cache-document /tmp/prepared-text && \
7
- echo '\nREADABILITY STATISTICS' && \
8
- readability-of /tmp/prepared-text && \
9
- percentage-known-of /tmp/prepared-text")
@@ -1,16 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- command =<<EOF
4
-
5
- total_occurrences=`expr $(frequency-list | awk '{print $1}' | xargs | sed 's/ / + /g')`
6
- unknown_occurrences=`expr $(frequency-list --unknown | awk '{print $1}' | xargs | sed 's/ / + /g')`
7
- next_500_occurrences=`expr $(frequency-list --unknown | awk '{print $1}' | head -500 | xargs | sed 's/ / + /g')`
8
-
9
- echo Total occurrences: $total_occurrences
10
- echo Unknown occurrences: $unknown_occurrences
11
- echo Your current vocabulary knowledge covers $(echo "scale=2;($total_occurrences - $unknown_occurrences) / $total_occurrences * 100" | bc -q)% of all occurrences
12
- echo The next 500 words will bring your cover to $(echo "scale=2;($total_occurrences - $unknown_occurrences + $next_500_occurrences) / $total_occurrences * 100" | bc -q)%
13
-
14
- EOF
15
-
16
- system command
data/lib/lookup.rb DELETED
@@ -1,15 +0,0 @@
1
- module Lookup
2
-
3
- def self.fetch_definition word
4
- definitions = `dict "#{word}" 2>/dev/null | grep ' ' | head -2`.chomp.gsub(" ","").split(/[\r\n]/)
5
- definitions.uniq.join(" -- ")
6
- end
7
-
8
- def self.sanitize word
9
- word.gsub(/[,\.]/,"")
10
- end
11
-
12
- def self.go words
13
- words.map{|w| sanitize w}.map{|w| "#{w}\t#{fetch_definition w}"}.join("\n")
14
- end
15
- end