lingo 1.9.0.pre1 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +4 -4
  2. data/ChangeLog +18 -7
  3. data/README +6 -8
  4. data/Rakefile +5 -5
  5. data/dict/en/lingo-dic.txt +52625 -15693
  6. data/lang/en.lang +2 -2
  7. data/lib/lingo.rb +15 -3
  8. data/lib/lingo/array_utils.rb +39 -0
  9. data/lib/lingo/attendee.rb +1 -3
  10. data/lib/lingo/attendee/multi_worder.rb +4 -2
  11. data/lib/lingo/attendee/sequencer.rb +122 -73
  12. data/lib/lingo/attendee/text_writer.rb +4 -6
  13. data/lib/lingo/attendee/vector_filter.rb +5 -5
  14. data/lib/lingo/cli.rb +20 -2
  15. data/lib/lingo/config.rb +4 -3
  16. data/lib/lingo/ctl.rb +2 -20
  17. data/lib/lingo/ctl/analysis.rb +3 -5
  18. data/lib/lingo/ctl/files.rb +3 -3
  19. data/lib/lingo/database.rb +26 -25
  20. data/lib/lingo/database/crypter.rb +10 -6
  21. data/lib/lingo/database/source.rb +72 -25
  22. data/lib/lingo/database/source/key_value.rb +12 -8
  23. data/lib/lingo/database/source/multi_key.rb +11 -9
  24. data/lib/lingo/database/source/multi_value.rb +10 -8
  25. data/lib/lingo/database/source/single_word.rb +10 -6
  26. data/lib/lingo/database/source/word_class.rb +43 -14
  27. data/lib/lingo/debug.rb +2 -2
  28. data/lib/lingo/error.rb +21 -5
  29. data/lib/lingo/filter.rb +1 -1
  30. data/lib/lingo/language.rb +21 -21
  31. data/lib/lingo/language/grammar.rb +4 -2
  32. data/lib/lingo/language/lexical_hash.rb +2 -14
  33. data/lib/lingo/language/word.rb +1 -5
  34. data/lib/lingo/text_utils.rb +113 -20
  35. data/lib/lingo/version.rb +1 -1
  36. data/test/attendee/ts_sequencer.rb +286 -32
  37. data/test/attendee/ts_text_reader.rb +4 -4
  38. data/test/attendee/ts_text_writer.rb +19 -5
  39. data/test/test_helper.rb +2 -0
  40. data/test/ts_database.rb +213 -14
  41. metadata +36 -24
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -64,7 +64,8 @@ class Lingo
64
64
  opts.separator
65
65
 
66
66
  opts.option(:log__FILE, :L, 'Log file to print debug information to') { |log|
67
- options[:log] = stderr.reopen(log == '-' ? stdout : File.open(log, 'a+', encoding: ENC))
67
+ options[:log] = stderr.reopen(
68
+ log == '-' ? stdout : File.open(log, 'a+', encoding: ENCODING))
68
69
  }
69
70
 
70
71
  opts.separator
@@ -76,4 +77,21 @@ class Lingo
76
77
 
77
78
  end
78
79
 
80
+ def self.CLI(args, extra = nil, &block)
81
+ opt, req = args.partition { |arg| arg.sub!(/\?\z/, '') }
82
+
83
+ unless (n = ARGV.size - req.size) >= 0 && n <= opt.size
84
+ msg = "Usage: #{$0}#{args.map { |arg| [' ', arg].zip(
85
+ opt.include?(arg) ? %w[[ ]] : %w[< >]).join }.join}"
86
+
87
+ abort Array(extra).unshift(msg).join("\n\n")
88
+ end
89
+
90
+ Object.new.extend(TextUtils).instance_eval(&block)
91
+ rescue LingoError => err
92
+ abort err.to_s
93
+ ensure
94
+ ObjectSpace.each_object(Zlib::GzipWriter, &:close)
95
+ end
96
+
79
97
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -53,7 +53,7 @@ class Lingo
53
53
  end
54
54
  end
55
55
 
56
- attr_reader :language_file, :config_file
56
+ attr_reader :lang_file, :config_file
57
57
 
58
58
  def to_h
59
59
  { 'version' => VERSION }.merge(@opts)
@@ -128,7 +128,8 @@ class Lingo
128
128
  file = Lingo.find(type, @opts[key]) { quit }
129
129
  instance_variable_set("@#{type}_file", file)
130
130
 
131
- File.open(file, encoding: ENC) { |f| @opts.update(SafeYAML.load(f)) }
131
+ File.open(file, encoding: ENCODING) { |f|
132
+ @opts.update(SafeYAML.load(f)) }
132
133
  rescue Psych::SyntaxError => err
133
134
  err.message << " (in #{file})"
134
135
  raise
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -31,6 +31,7 @@ class Lingo
31
31
  module Ctl
32
32
 
33
33
  extend self
34
+ extend TextUtils
34
35
 
35
36
  PROG, VERSION, OPTWIDTH = $0, '0.0.3', 21
36
37
  PROGNAME, OPTIONS = File.basename(PROG), {}
@@ -120,25 +121,6 @@ Usage: #{PROG} <command> [arguments] [options]
120
121
  usage('Too many arguments.') unless ARGV.empty?
121
122
  end
122
123
 
123
- def overwrite?(target, unlink = false)
124
- !File.exist?(target) || if agree?("#{target} already exists. Overwrite?")
125
- File.unlink(target) if unlink
126
- true
127
- end
128
- end
129
-
130
- def agree?(msg)
131
- print "#{msg} (y/n) [n]: "
132
-
133
- case answer = $stdin.gets.chomp
134
- when /\Ano?\z/i, '' then nil
135
- when /\Ay(?:es)?\z/i then true
136
- else puts 'Please enter "yes" or "no".'; agree?(msg)
137
- end
138
- rescue Interrupt
139
- abort ''
140
- end
141
-
142
124
  end
143
125
 
144
126
  def self.ctl
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -24,8 +24,6 @@
24
24
  ###############################################################################
25
25
  #++
26
26
 
27
- require 'csv'
28
-
29
27
  class Lingo
30
28
 
31
29
  module Ctl
@@ -97,11 +95,11 @@ class Lingo
97
95
  File.basename(path.chomp(File.extname(path))) }.uniq.join('-'))
98
96
 
99
97
  lambda { |key, &block| overwrite?(file = "#{name}.#{key}.csv") &&
100
- puts("#{file}: #{Array(CSV.open(file, 'wb', &block)).join(' / ')}") }
98
+ puts("#{file}: #{Array(open_csv(file, 'wb', &block)).join(' / ')}") }
101
99
  end
102
100
 
103
101
  def csv_foreach(paths)
104
- paths.each { |path| CSV.foreach(path, headers: true) { |row|
102
+ paths.each { |path| foreach_csv(path, headers: true) { |row|
105
103
  yield path, *row.values_at(*%w[string token word pattern]) } }
106
104
  end
107
105
 
@@ -76,9 +76,9 @@ class Lingo
76
76
  FileUtils.cp(source, target, verbose: true)
77
77
  end
78
78
 
79
- def do_clearstore
80
- store = Dir["#{find(:store, false)}.*"]
81
- FileUtils.rm(store, verbose: true) unless store.empty?
79
+ def clear(what)
80
+ target = Dir["#{find(what, false)}.*"]
81
+ FileUtils.rm(target, verbose: true) unless target.empty?
82
82
  end
83
83
 
84
84
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -45,6 +45,8 @@ class Lingo
45
45
  KEY_REF = '*'
46
46
  SYS_KEY = '~'
47
47
 
48
+ KEY_REF_RE = %r{\A#{Regexp.escape(KEY_REF)}(\d+)\z}o
49
+
48
50
  BACKENDS = []
49
51
  BACKEND_BY_EXT = {}
50
52
 
@@ -58,6 +60,25 @@ class Lingo
58
60
  klass.class_eval('def store_ext; EXT; end', __FILE__, __LINE__) if meth
59
61
  end
60
62
 
63
+ def backend_by_ext(file, ext = File.extname(file))
64
+ get_backend(BACKEND_BY_EXT[ext], file) or
65
+ raise BackendNotFoundError.new(file)
66
+ end
67
+
68
+ def find_backend(env = 'LINGO_BACKEND')
69
+ env && get_backend(ENV[env]) || BACKENDS.find { |name|
70
+ backend = get_backend(name, nil, true) and return backend }
71
+ end
72
+
73
+ def get_backend(name, file = nil, relax = false)
74
+ return unless name
75
+
76
+ Object.const_get(name)
77
+ const_get("#{name}Store")
78
+ rescue TypeError, NameError => err
79
+ raise BackendNotAvailableError.new(name, file, err) unless relax
80
+ end
81
+
61
82
  def open(*args, &block)
62
83
  new(*args).open(&block)
63
84
  end
@@ -75,23 +96,12 @@ class Lingo
75
96
  FileUtils.mkdir_p(File.dirname(@stofile))
76
97
  rescue SourceFileNotFoundError => err
77
98
  @stofile = skip_ext = err.id
78
-
79
- unless err.name
80
- if name = BACKEND_BY_EXT[File.extname(@stofile)]
81
- backend = get_backend(name, @stofile)
82
- else
83
- raise BackendNotFoundError.new(@stofile)
84
- end
85
- end
99
+ backend = self.class.backend_by_ext(@stofile) unless err.name
86
100
  rescue NoWritableStoreError
87
101
  backend = HashStore
88
102
  end
89
103
 
90
- unless backend ||= get_backend(ENV['LINGO_BACKEND'])
91
- BACKENDS.find { |name| backend = get_backend(name, nil, true) }
92
- end
93
-
94
- extend(@backend = backend || HashStore)
104
+ extend(@backend = backend || self.class.find_backend || HashStore)
95
105
 
96
106
  @stofile << store_ext unless skip_ext || !respond_to?(:store_ext)
97
107
 
@@ -151,15 +161,6 @@ class Lingo
151
161
 
152
162
  private
153
163
 
154
- def get_backend(name, file = nil, relax = false)
155
- return unless name
156
-
157
- Object.const_get(name)
158
- self.class.const_get("#{name}Store")
159
- rescue TypeError, NameError => err
160
- raise BackendNotAvailableError.new(name, file, err) unless relax
161
- end
162
-
163
164
  def config_hash
164
165
  hashes = [config]
165
166
 
@@ -228,11 +229,11 @@ class Lingo
228
229
  end
229
230
 
230
231
  def _encode!(str)
231
- str.force_encoding(ENC)
232
+ str.force_encoding(ENCODING)
232
233
  end
233
234
 
234
235
  def convert(verbose = lingo.config.stderr.tty?)
235
- src = Source.get(config.fetch('txt-format', 'key_value'), @id, lingo)
236
+ src = Source.from_config(config, @id)
236
237
 
237
238
  sep, hyphenate, key_map, val_map = prepare_lex
238
239
 
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -35,23 +35,27 @@ class Lingo
35
35
 
36
36
  extend self
37
37
 
38
+ KEYLEN = 16
39
+
40
+ CIPHER = 'AES-128-CBC'.freeze
41
+
38
42
  def digest(key)
39
43
  Digest::SHA1.hexdigest(key)
40
44
  end
41
45
 
42
46
  def encode(key, val)
43
- [digest(key), crypt(:encrypt, key, val)]
47
+ [digest = digest(key), crypt(:encrypt, key, val, digest)]
44
48
  end
45
49
 
46
50
  def decode(key, val)
47
- crypt(:decrypt, key, val).force_encoding(ENC)
51
+ crypt(:decrypt, key, val, digest(key)).force_encoding(ENCODING)
48
52
  end
49
53
 
50
54
  private
51
55
 
52
- def crypt(method, key, val)
53
- cipher = OpenSSL::Cipher.new('aes-128-cbc').send(method)
54
- cipher.iv = cipher.key = digest(key)
56
+ def crypt(method, key, val, digest)
57
+ cipher = OpenSSL::Cipher.new(CIPHER).send(method)
58
+ cipher.iv = cipher.key = digest[0, KEYLEN]
55
59
  cipher.update(val) + cipher.final
56
60
  end
57
61
 
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -49,30 +49,57 @@ class Lingo
49
49
 
50
50
  class Source
51
51
 
52
- def self.get(name, *args)
53
- Lingo.get_const(name, self).new(*args)
52
+ LEXICAL_SEPARATOR = '#'.freeze
53
+
54
+ DEFAULT_SEPARATOR = nil
55
+
56
+ DEFAULT_DEF_WC = nil
57
+
58
+ MAX_LENGTH = 4096
59
+
60
+ class << self
61
+
62
+ def from_id(id, lingo)
63
+ from_config(lingo.database_config(id), id)
64
+ end
65
+
66
+ def from_config(config, id = nil)
67
+ format = config.fetch('txt-format', 'key_value')
68
+ Lingo.get_const(format, self).new(config['name'], config, id)
69
+ end
70
+
71
+ def lexicals(val, sep = LEXICAL_SEPARATOR, ref = KEY_REF_RE)
72
+ val.map { |str|
73
+ str =~ ref ? $1.to_i : begin
74
+ k, *w = str.split(sep)
75
+ Language::Lexical.new(k.strip, w)
76
+ end
77
+ }.uniq if val
78
+ end
79
+
54
80
  end
55
81
 
56
82
  attr_reader :pos
57
83
 
58
- def initialize(id, lingo, def_wc_default = nil)
59
- @config = lingo.database_config(id)
84
+ def initialize(name = nil, config = {}, id = nil)
85
+ @config = config
60
86
 
61
- source_file = Lingo.find(:dict, name = @config['name'], relax: true)
87
+ src_file = Lingo.find(:dict, name, relax: true) if name
62
88
 
63
- reject_file = begin
64
- Lingo.find(:store, source_file) << '.rev'
89
+ rej_file = begin
90
+ Lingo.find(:store, src_file) << '.rev'
65
91
  rescue NoWritableStoreError, SourceFileNotFoundError
66
- end
92
+ end if id && src_file
67
93
 
68
- @src = Pathname.new(source_file)
69
- @rej = Pathname.new(reject_file) if reject_file
94
+ @src = Pathname.new(src_file) if src_file
95
+ @rej = Pathname.new(rej_file) if rej_file
70
96
 
71
- raise SourceFileNotFoundError.new(name, id) unless @src.exist?
97
+ raise id ? SourceFileNotFoundError.new(name, id) :
98
+ FileNotFoundError.new(name) if name && !@src.exist?
72
99
 
73
- @def = @config.fetch('def-wc', def_wc_default)
100
+ @sep = config.fetch('separator', self.class::DEFAULT_SEPARATOR)
101
+ @def = config.fetch('def-wc', self.class::DEFAULT_DEF_WC)
74
102
  @def = @def.downcase if @def
75
- @sep = @config['separator']
76
103
 
77
104
  @wrd = "(?:#{Language::Char::ANY})+"
78
105
  @pat = /^#{@wrd}$/
@@ -85,33 +112,47 @@ class Lingo
85
112
  end
86
113
 
87
114
  def each
88
- reject_file = @rej.open('w', encoding: ENC) if @rej
115
+ return enum_for(__method__) unless block_given?
116
+ each_line { |line, key, val| yield parse_line(line, key, val) }
117
+ end
118
+
119
+ def each_line
120
+ return enum_for(__method__) unless block_given?
121
+
122
+ rej_file = @rej.open('w', encoding: ENCODING) if @rej
89
123
 
90
- @src.each_line($/, encoding: ENC) { |line|
124
+ @src.each_line($/, encoding: ENCODING) { |line|
91
125
  @pos += length = line.bytesize
92
126
 
93
127
  line.strip!
94
128
  next if line.empty? || line.start_with?('#')
95
129
 
96
- line.chomp!
97
- line.replace(Unicode.downcase(line))
98
-
99
- if length < 4096 && line =~ @pat
100
- yield convert_line(line, $1, $2)
130
+ if length < MAX_LENGTH && line.replace(Unicode.downcase(line)) =~ @pat
131
+ yield line, $1, $2
101
132
  else
102
133
  @rej_cnt += 1
103
- reject_file.puts(line) if reject_file
134
+ rej_file.puts(line) if rej_file
104
135
  end
105
136
  }
106
137
 
107
138
  self
108
139
  ensure
109
- if reject_file
110
- reject_file.close
111
- @rej.delete if @rej.size == 0
140
+ if rej_file
141
+ rej_file.close
142
+ @rej.delete if @rej.size.zero?
112
143
  end
113
144
  end
114
145
 
146
+ def each_lexical
147
+ return enum_for(__method__) unless block_given?
148
+ each { |key, val| yield key, self.class.lexicals(val) }
149
+ end
150
+
151
+ def each_dump(*args)
152
+ return enum_for(__method__, *args) unless block_given?
153
+ each_lexical { |key, val| yield dump_line(key, val, *args) }
154
+ end
155
+
115
156
  def set(db, key, val)
116
157
  db[key] = val
117
158
  end
@@ -120,6 +161,12 @@ class Lingo
120
161
  [@rej_cnt, @rej]
121
162
  end
122
163
 
164
+ private
165
+
166
+ def lexical(*args)
167
+ args.join(LEXICAL_SEPARATOR)
168
+ end
169
+
123
170
  end
124
171
 
125
172
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -39,17 +39,21 @@ class Lingo
39
39
 
40
40
  class KeyValue < self
41
41
 
42
- DEFAULT_SEPARATOR = '*'
42
+ DEFAULT_SEPARATOR = '*'.freeze
43
43
 
44
- def initialize(id, lingo)
45
- super(id, lingo, Language::LA_UNKNOWN)
46
- @pat = /^(#{@wrd})#{Regexp.escape(@sep ||= DEFAULT_SEPARATOR)}(#{@wrd})$/
44
+ DEFAULT_DEF_WC = Language::LA_UNKNOWN
45
+
46
+ def initialize(*)
47
+ super
48
+ @pat = /^(#{@wrd})#{Regexp.escape(@sep)}(#{@wrd})$/
47
49
  end
48
50
 
49
- private
51
+ def parse_line(line, key, val)
52
+ [key.strip, [lexical(val.strip, @def)]]
53
+ end
50
54
 
51
- def convert_line(line, key, val)
52
- [key.strip, %W[#{val.strip}##{@def}]]
55
+ def dump_line(key, val, sep = @sep, *)
56
+ val.map(&:form).unshift(key).join(sep)
53
57
  end
54
58
 
55
59
  end