text-analysis-utils 0.3 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d7f8baa8dddefcdf0e0cb1897a714eef20cecdc9
4
+ data.tar.gz: 76e10901cb1e270ca4f1f95d5cee3425b06a1656
5
+ SHA512:
6
+ metadata.gz: 616397557e53efe884f01f59900221fb93a97dba1ff0ab0769882d1b7bc5b3d0f6482896d9045ce2986a53321e3b832726c2703ebd107ce39d6d93fa1cfa85b2
7
+ data.tar.gz: 739aa1e163b08bd5869a1b29887daac6d7334fdcdcd122062b3a7d57c8fb10679fcf9ec208e9be1075834f331e40637bee81121fc7efc2c93f5f2c911e602f1d
data/bin/cache-document CHANGED
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require File.join(File.dirname(__FILE__), '../lib/document-cache' )
3
+ require_relative '../lib/document-cache'
4
4
 
5
5
  def get_text
6
6
  if ARGV.empty?
@@ -2,8 +2,8 @@
2
2
 
3
3
  require 'rubygems'
4
4
  require 'colorize'
5
- require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
6
- require File.join(File.dirname(__FILE__), '../lib/document-cache' )
5
+ require_relative '../lib/vocabulary-chest'
6
+ require_relative '../lib/document-cache'
7
7
 
8
8
 
9
9
  def get_text
@@ -20,7 +20,6 @@ end
20
20
 
21
21
  def ask word, index, words, text
22
22
  location = (text =~ /\b#{Regexp.escape(word)}\b/)
23
- puts "!!!!" if word == "notwendig"
24
23
  location = text.index(word) if location.nil?
25
24
  (puts "Skipping word: #{word}"; return 'skip') if location.nil?
26
25
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  require 'rubygems'
4
4
  require 'colorize'
5
- require File.join(File.dirname(__FILE__), '../lib/document-cache' )
5
+ require_relative '../lib/document-cache'
6
6
 
7
7
  count = 1
8
8
  count_param = ARGV.find{|a| (a =~ /--\d*/) == 0}
data/bin/frequency-list CHANGED
@@ -1,13 +1,17 @@
1
1
  #!/usr/bin/env ruby
2
+ # Call with a file to list words by the frequency of their stems
3
+ # Call with no arguments to list the frequencies of the words in the vocabulary chest.
2
4
 
3
- require File.join(File.dirname(__FILE__), '../lib/document-cache' )
4
- require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
5
+ require_relative '../lib/document-cache'
6
+ require_relative '../lib/vocabulary-chest'
5
7
 
6
- text = DocumentCache::documents.inject(""){|text, f| text+= File.open(f){|f|f.read}; text}
8
+ def frequencies text=nil
9
+ text = DocumentCache::documents.inject(""){|text, f| text+= File.open(f){|f|f.read}; text} if text.nil?
7
10
 
8
- frequencies = text.split(" ").inject(Hash.new {|hash,key| hash[key] = []}){|hash, w| hash[VocabularyChest::stem(w)] << w; hash }
11
+ frequencies = text.split(" ").inject(Hash.new {|hash,key| hash[key] = []}){|hash, w| hash[VocabularyChest::stem(w)] << w; hash }
9
12
 
10
- frequencies = frequencies.sort{|a,b| a[1].size <=> b[1].size}.reverse
13
+ frequencies = frequencies.sort{|a,b| a[1].size <=> b[1].size}.reverse
14
+ end
11
15
 
12
16
  def output frequencies
13
17
  STDOUT.sync = true
@@ -16,6 +20,8 @@ end
16
20
 
17
21
  if ARGV[0] == "--unknown"
18
22
  output frequencies.find_all{|k,v| !VocabularyChest::is_known?(v[0])}
19
- else
23
+ elsif ARGV.empty?
20
24
  output frequencies
25
+ else
26
+ output(frequencies(File.read(ARGV[0])))
21
27
  end
@@ -1,7 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
4
- require File.join(File.dirname(__FILE__), '../lib/lookup' )
3
+ require_relative '../lib/vocabulary-chest'
5
4
 
6
5
  def analyse text
7
6
  words = text.split(" ")
@@ -18,10 +17,6 @@ def output options
18
17
  puts "--"
19
18
  puts "UNKNOWN WORDS: #{unknown.join(", ")}"
20
19
  puts
21
- puts "DEFINITIONS"
22
- puts Lookup::go(unknown)
23
- puts "--"
24
- puts
25
20
  puts "Total number of unknown words: #{unknown.size}"
26
21
  puts "Total number of known words: #{known.size}"
27
22
  puts "Total number of words: #{size}"
@@ -1,8 +1,8 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require File.join(File.dirname(__FILE__), '../lib/document-cache' )
4
- require File.join(File.dirname(__FILE__), '../lib/vocabulary-chest' )
5
- require File.join(File.dirname(__FILE__), '../lib/game' )
3
+ require_relative '../lib/document-cache'
4
+ require_relative '../lib/vocabulary-chest'
5
+ require_relative '../lib/game'
6
6
 
7
7
  def get_input
8
8
  if !ARGV.empty?
@@ -17,6 +17,8 @@ words = input.split("\n")
17
17
  words.reject!{|w| STDOUT.write("."); STDOUT.flush; DocumentCache.find_examples_for(w).empty?}
18
18
  puts
19
19
 
20
+ puts "Playing with #{words.size} words."
21
+
20
22
  Game.new(words).play{ |word|
21
23
  matches = DocumentCache.find_examples_for(word, 10).keys
22
24
  sentence = matches.sort{|a, b| a.size <=> b.size}.first
@@ -1,6 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'rubygems'
4
3
  require 'amatch'
5
4
 
6
5
  def distance w1, w2
@@ -29,7 +28,7 @@ def words_of text
29
28
  end
30
29
 
31
30
  if ARGV.size < 2
32
- puts "usage: ./script <new text> <known text>"
31
+ puts "usage: #{$0} file_with_new_words file_with_known_words"
33
32
  exit 1
34
33
  end
35
34
 
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/vocabulary-chest'
4
+
5
+ size = VocabularyChest.known_words.size
6
+ puts "You know #{size} words."
@@ -1,18 +1,13 @@
1
1
  require 'fileutils.rb'
2
- require 'rubygems'
3
2
  require 'uuid'
4
3
 
5
- require File.join(File.dirname(__FILE__), 'vocabulary-chest' )
6
-
7
- CACHE_DIR = "#{ROOT_DIR}/docs"
8
-
9
- FileUtils::mkdir_p(ROOT_DIR)
10
- FileUtils::mkdir_p(CACHE_DIR)
4
+ require_relative 'tau_config'
5
+ require_relative 'vocabulary-chest'
11
6
 
12
7
  module DocumentCache
13
- def self.add search
14
- filename = "#{CACHE_DIR}/#{UUID.new.generate}"
15
- File.open(filename,'w'){|f| f.write(search)}
8
+ def self.add document
9
+ filename = "#{TAUConfig::cache_dir}/#{UUID.new.generate}"
10
+ File.open(filename,'w'){|f| f.write(document)}
16
11
  end
17
12
 
18
13
  def self.find_matches_by_stemming search, sentences
@@ -52,7 +47,7 @@ module DocumentCache
52
47
  end
53
48
 
54
49
  def self.documents
55
- Dir["#{CACHE_DIR}/*"]
50
+ Dir["#{TAUConfig::cache_dir}/*"]
56
51
  end
57
52
 
58
53
  def self.find_examples_for search, count=1
@@ -86,13 +81,3 @@ module DocumentCache
86
81
  counts.sort_by {|k,v| v}.reverse
87
82
  end
88
83
  end
89
-
90
- if __FILE__ == $0
91
- puts "The document cache contains #{DocumentCache.documents.size} documents."
92
- puts
93
- puts "Here are the 10 most frequent stems:"
94
- DocumentCache.stemmed_frequency_list[0,10].each{|stem, count| puts "#{count} #{stem}"}
95
- puts
96
- puts "Here are the 10 most frequent words:"
97
- DocumentCache.frequency_list[0,10].each{|word, count| puts "#{count} #{word}"}
98
- end
data/lib/game.rb CHANGED
@@ -1,3 +1,5 @@
1
+ #encoding: UTF-8
2
+
1
3
  require 'rubygems'
2
4
  require 'amatch'
3
5
  require 'colorize'
@@ -103,6 +105,8 @@ class Game
103
105
  end
104
106
 
105
107
  def play &block
108
+ (puts "Could not find any words to play with."; exit 1) if @words.empty?
109
+
106
110
  @words.shuffle.each{|word|
107
111
  @turn += 1
108
112
 
@@ -119,4 +123,3 @@ class Game
119
123
  play(&block)
120
124
  end
121
125
  end
122
-
data/lib/tau_config.rb ADDED
@@ -0,0 +1,23 @@
1
+ require 'fileutils.rb'
2
+
3
+ module TAUConfig
4
+ def self.root_dir
5
+ File.expand_path(ENV['vocabulary_chest_location'] || "~/.vocabulary-chest")
6
+ end
7
+ def self.known_file
8
+ "#{root_dir}/known"
9
+ end
10
+ def self.unknown_file
11
+ "#{root_dir}/unknown"
12
+ end
13
+ def self.cache_dir
14
+ "#{root_dir}/docs"
15
+ end
16
+
17
+ end
18
+
19
+ FileUtils::mkdir_p TAUConfig.root_dir
20
+ FileUtils::touch TAUConfig.known_file
21
+ FileUtils::touch TAUConfig.unknown_file
22
+ FileUtils::mkdir_p TAUConfig.root_dir
23
+ FileUtils::mkdir_p TAUConfig.cache_dir
@@ -1,4 +1,4 @@
1
- require File.join(File.dirname(__FILE__), 'vocabulary-chest' )
2
- require File.join(File.dirname(__FILE__), 'document-cache' )
3
- require File.join(File.dirname(__FILE__), 'game' )
4
-
1
+ require_relative 'config'
2
+ require_relative 'vocabulary-chest'
3
+ require_relative 'document-cache'
4
+ require_relative 'game'
@@ -3,30 +3,19 @@ require 'fileutils.rb'
3
3
  require 'rubygems'
4
4
  require 'lingua/stemmer'
5
5
 
6
- ROOT_DIR = File.expand_path(ENV['chest_location'] || "~/.vocabulary-chest")
7
- KNOWN_FILE = "#{ROOT_DIR}/known"
8
- UNKNOWN_FILE = "#{ROOT_DIR}/unknown"
9
-
10
- FileUtils::mkdir_p(ROOT_DIR)
11
- FileUtils.touch(KNOWN_FILE)
12
- FileUtils.touch(UNKNOWN_FILE)
6
+ require_relative 'tau_config'
13
7
 
14
8
  module VocabularyChest
15
- @known_file = File.open(KNOWN_FILE,'a')
16
- @unknown_file = File.open(UNKNOWN_FILE,'a')
17
- @known_words = nil
18
- @unknown_words = nil
19
- @stemmer= Lingua::Stemmer.new(:language => "de")
20
-
21
- at_exit {@known_file.close}
22
- at_exit {@unknown_file.close}
9
+ @known_file = File.open(TAUConfig.known_file,'a')
10
+ @unknown_file = File.open(TAUConfig.unknown_file,'a')
11
+ @stemmer= Lingua::Stemmer.new(:language => ENV['vocabulary_chest_language'] || "en")
23
12
 
24
13
  def self.known_words
25
- @known_words ||= File.open(KNOWN_FILE,'r'){|f|f.readlines}.collect{|line| line.chomp}
14
+ File.open(@known_file,'r'){|f|f.readlines}.collect{|line| line.chomp}
26
15
  end
27
16
 
28
17
  def self.unknown_words
29
- @unknown_words ||= File.open(UNKNOWN_FILE,'r'){|f|f.readlines}.collect{|line| line.chomp}
18
+ File.open(@unknown_file,'r'){|f|f.readlines}.collect{|line| line.chomp}
30
19
  end
31
20
 
32
21
  def self.add_to_known_words word
@@ -56,9 +45,3 @@ module VocabularyChest
56
45
  word.gsub(/[,\"\.:;()?!„“]/,"")
57
46
  end
58
47
  end
59
-
60
- if __FILE__ == $0
61
- known = VocabularyChest::known_words
62
- unknown = VocabularyChest::unknown_words
63
- puts "The chest contains #{known.size} known words."
64
- end
metadata CHANGED
@@ -1,148 +1,123 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: text-analysis-utils
3
- version: !ruby/object:Gem::Version
4
- hash: 13
5
- prerelease:
6
- segments:
7
- - 0
8
- - 3
9
- version: "0.3"
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.5.0
10
5
  platform: ruby
11
- authors:
12
- - Matt
6
+ authors:
7
+ - '@matstc'
13
8
  autorequire:
14
9
  bindir: bin
15
10
  cert_chain: []
16
-
17
- date: 2012-06-23 00:00:00 Z
18
- dependencies:
19
- - !ruby/object:Gem::Dependency
11
+ date: 2014-02-26 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
20
14
  name: colorize
21
- prerelease: false
22
- requirement: &id001 !ruby/object:Gem::Requirement
23
- none: false
24
- requirements:
25
- - - ">="
26
- - !ruby/object:Gem::Version
27
- hash: 3
28
- segments:
29
- - 0
30
- version: "0"
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
31
20
  type: :runtime
32
- version_requirements: *id001
33
- - !ruby/object:Gem::Dependency
34
- name: amatch
35
21
  prerelease: false
36
- requirement: &id002 !ruby/object:Gem::Requirement
37
- none: false
38
- requirements:
39
- - - ">="
40
- - !ruby/object:Gem::Version
41
- hash: 3
42
- segments:
43
- - 0
44
- version: "0"
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: amatch
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
45
34
  type: :runtime
46
- version_requirements: *id002
47
- - !ruby/object:Gem::Dependency
48
- name: ruby-stemmer
49
35
  prerelease: false
50
- requirement: &id003 !ruby/object:Gem::Requirement
51
- none: false
52
- requirements:
53
- - - ">="
54
- - !ruby/object:Gem::Version
55
- hash: 3
56
- segments:
57
- - 0
58
- version: "0"
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: ruby-stemmer
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
59
48
  type: :runtime
60
- version_requirements: *id003
61
- - !ruby/object:Gem::Dependency
62
- name: uuid
63
49
  prerelease: false
64
- requirement: &id004 !ruby/object:Gem::Requirement
65
- none: false
66
- requirements:
67
- - - ">="
68
- - !ruby/object:Gem::Version
69
- hash: 3
70
- segments:
71
- - 0
72
- version: "0"
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: uuid
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
73
62
  type: :runtime
74
- version_requirements: *id004
75
- description:
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: Utilities to help language learners
76
70
  email:
77
- executables:
78
- - cache-document
79
- - classify-new-words
80
- - find-examples-for
81
- - frequency-list
82
- - lookup
83
- - percentage-known-of
84
- - play-with-blanks
85
- - play-with-examples
86
- - prepare-text
71
+ executables:
87
72
  - proximity-of-words
73
+ - percentage-known-of
88
74
  - readability-of
89
- - vocabulary-coverage
75
+ - vocabulary-size
76
+ - classify-new-words
77
+ - play-with-examples
78
+ - cache-document
79
+ - frequency-list
80
+ - find-examples-for
90
81
  extensions: []
91
-
92
82
  extra_rdoc_files: []
93
-
94
- files:
95
- - lib/text-analysis-utils.rb
83
+ files:
84
+ - lib/game.rb
96
85
  - lib/document-cache.rb
86
+ - lib/tau_config.rb
87
+ - lib/text-analysis-utils.rb
97
88
  - lib/vocabulary-chest.rb
98
- - lib/game.rb
99
- - lib/lookup.rb
100
- - bin/cache-document
101
- - bin/classify-new-words
102
- - bin/find-examples-for
103
- - bin/frequency-list
104
- - bin/lookup
105
- - bin/percentage-known-of
106
- - bin/play-with-blanks
107
- - bin/play-with-examples
108
- - bin/prepare-text
109
89
  - bin/proximity-of-words
90
+ - bin/percentage-known-of
110
91
  - bin/readability-of
111
- - bin/vocabulary-coverage
92
+ - bin/vocabulary-size
93
+ - bin/classify-new-words
94
+ - bin/play-with-examples
95
+ - bin/cache-document
96
+ - bin/frequency-list
97
+ - bin/find-examples-for
112
98
  homepage: http://github.com/matstc/text-analysis-utils
113
- licenses: []
114
-
99
+ licenses:
100
+ - CC-BY-NC-SA 4.0
101
+ metadata: {}
115
102
  post_install_message:
116
103
  rdoc_options: []
117
-
118
- require_paths:
104
+ require_paths:
119
105
  - lib
120
- required_ruby_version: !ruby/object:Gem::Requirement
121
- none: false
122
- requirements:
123
- - - ">="
124
- - !ruby/object:Gem::Version
125
- hash: 57
126
- segments:
127
- - 1
128
- - 8
129
- - 7
130
- version: 1.8.7
131
- required_rubygems_version: !ruby/object:Gem::Requirement
132
- none: false
133
- requirements:
134
- - - ">="
135
- - !ruby/object:Gem::Version
136
- hash: 3
137
- segments:
138
- - 0
139
- version: "0"
106
+ required_ruby_version: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - '>='
109
+ - !ruby/object:Gem::Version
110
+ version: 2.0.0
111
+ required_rubygems_version: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - '>='
114
+ - !ruby/object:Gem::Version
115
+ version: '0'
140
116
  requirements: []
141
-
142
117
  rubyforge_project:
143
- rubygems_version: 1.8.15
118
+ rubygems_version: 2.1.11
144
119
  signing_key:
145
- specification_version: 3
120
+ specification_version: 4
146
121
  summary: Utilities to help language learners
147
122
  test_files: []
148
-
123
+ has_rdoc:
data/bin/lookup DELETED
@@ -1,9 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require File.join(File.dirname(__FILE__), '../lib/lookup' )
4
-
5
- if !ARGV.empty?
6
- puts Lookup::go ARGV
7
- else
8
- puts Lookup::go STDIN.read.split("\n")
9
- end
data/bin/play-with-blanks DELETED
@@ -1,28 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require File.join(File.dirname(__FILE__), '../lib/game' )
4
-
5
- def get_input
6
- if !ARGV.empty?
7
- else
8
- STDIN.read
9
- end
10
- end
11
-
12
- (puts "Usage: #{$0} <file with words to practice> <file with examples>"; exit(1)) if ARGV.size < 2
13
-
14
- input = File.open(ARGV.shift){|f| f.read}
15
- words = input.split("\n").uniq
16
-
17
- example_sentences = []
18
- ARGV.each{|filename| example_sentences += File.open(filename).readlines}
19
- example_sentences.map!{|s| s.chomp}
20
- example_sentences.reject!{|s| words.find{|w| (s =~ /\b#{Regexp.escape(w)}\b/i) != nil} == nil}
21
- words.reject!{|w| example_sentences.find{|s| (s =~ /\b#{Regexp.escape(w)}\b/i) != nil} == nil}
22
-
23
- puts "Playing with #{example_sentences.size} sentences and #{words.size} words."
24
-
25
- Game.new(words).play{|word|
26
- sentence = example_sentences.shuffle.find{|s| (s =~ /\b#{Regexp.escape(word)}\b/i) != nil}
27
- [sentence, $&]
28
- }
data/bin/prepare-text DELETED
@@ -1,9 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- text = ARGV.empty? ? STDIN.read : ARGV.map{|a|File.open(a){|f|f.read}}.join("\n\n")
4
- File.open("/tmp/prepared-text", 'w'){|f| f.write(text)}
5
- exec("classify-new-words /tmp/prepared-text && \
6
- cache-document /tmp/prepared-text && \
7
- echo '\nREADABILITY STATISTICS' && \
8
- readability-of /tmp/prepared-text && \
9
- percentage-known-of /tmp/prepared-text")
@@ -1,16 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- command =<<EOF
4
-
5
- total_occurrences=`expr $(frequency-list | awk '{print $1}' | xargs | sed 's/ / + /g')`
6
- unknown_occurrences=`expr $(frequency-list --unknown | awk '{print $1}' | xargs | sed 's/ / + /g')`
7
- next_500_occurrences=`expr $(frequency-list --unknown | awk '{print $1}' | head -500 | xargs | sed 's/ / + /g')`
8
-
9
- echo Total occurrences: $total_occurrences
10
- echo Unknown occurrences: $unknown_occurrences
11
- echo Your current vocabulary knowledge covers $(echo "scale=2;($total_occurrences - $unknown_occurrences) / $total_occurrences * 100" | bc -q)% of all occurrences
12
- echo The next 500 words will bring your cover to $(echo "scale=2;($total_occurrences - $unknown_occurrences + $next_500_occurrences) / $total_occurrences * 100" | bc -q)%
13
-
14
- EOF
15
-
16
- system command
data/lib/lookup.rb DELETED
@@ -1,15 +0,0 @@
1
- module Lookup
2
-
3
- def self.fetch_definition word
4
- definitions = `dict "#{word}" 2>/dev/null | grep ' ' | head -2`.chomp.gsub(" ","").split(/[\r\n]/)
5
- definitions.uniq.join(" -- ")
6
- end
7
-
8
- def self.sanitize word
9
- word.gsub(/[,\.]/,"")
10
- end
11
-
12
- def self.go words
13
- words.map{|w| sanitize w}.map{|w| "#{w}\t#{fetch_definition w}"}.join("\n")
14
- end
15
- end